cobol: Fix FUNCTION TRIM.

The FUNCTION TRIM now works properly with UTF16 inputs.

According to the ISO specification, the return type of a number of
intrinsic functions is defined by the variable type of their first
parameter.  A number of changes here cause more functions to honor that
requirement.

gcc/cobol/ChangeLog:

	* parse.y: BASECONVERT and TRIM take their type from their first
	parameter.
	* parse_util.h (intrinsic_return_field): The function_descrs[] is
	adjusted so that a number of functions take their return type from
	their first calling parameter.  intrinsic_return_field() has been
	refined.
	* symbols.cc (new_alphanumeric):  Use set_explicit() instead of
	set() in support of refined intrinsic function return type.

libgcobol/ChangeLog:

	* intrinsic.cc (__gg__trim):  Rewritten to work properly, and avoid
	unnecessary variable codeset encoding translation.
This commit is contained in:
Robert Dubner
2026-02-26 14:42:51 -05:00
parent ed2908e642
commit dc2f983e52
4 changed files with 128 additions and 99 deletions

View File

@@ -10892,7 +10892,7 @@ intrinsic: function_udf
| BASECONVERT '(' varg[r1] varg[r2] varg[r3] ')' {
location_set(@1);
$$ = new_alphanumeric("BASECONVERT");
$$ = new_alphanumeric("BASECONVERT", $r1->field->codeset.encoding);
cbl_unimplemented("BASECONVERT");
if( ! intrinsic_call_3($$, BASECONVERT, $r1, $r2, $r3 )) YYERROR;
}
@@ -11223,7 +11223,7 @@ intrinsic: function_udf
YYERROR;
break;
}
$$ = new_alphanumeric("TRIM");
$$ = new_alphanumeric("TRIM", $r1->field->codeset.encoding);
cbl_refer_t * how = new_reference($trim_trailing);
if( ! intrinsic_call_2($$, TRIM, $r1, how) ) YYERROR;
}

View File

@@ -46,15 +46,22 @@
* n variadic
* We use just A, I, N, or X, choosing the most general for each parameter.
*
* When FldInvalid is shown as the return type, it indicates that the type
* of the function is determined by the type of the first parameter.
* When FldInvalid is shown as the return type, it indicates that the Integer
* vs. Numeric type of the function is determined by the type of the first
* parameter.
*
* FldGroup is used when the first argument determines the encoding of the
* temporary. This is for functions that can be Alphanumeric or National.
*
* We use FldNumericBin5 for functions of type "Integer", and FldFloat for
* functions of type "Numeric",
*/
#define IntOrNum FldInvalid
#define AnumOrNat FldGroup
static const function_descr_t function_descrs[] = {
{ ABS, "ABS",
"__gg__abs", "N", {}, FldInvalid },
"__gg__abs", "N", {}, IntOrNum },
{ ACOS, "ACOS",
"__gg__acos", "N", {}, FldFloat },
{ ANNUITY, "ANNUITY",
@@ -63,7 +70,7 @@ static const function_descr_t function_descrs[] = {
"__gg__asin", "N", {}, FldFloat },
{ ATAN, "ATAN",
"__gg__atan", "N", {}, FldFloat },
{ BASECONVERT, "BASECONVERT",
{ BASECONVERT, "BASECONVERT", // See parse.y
"__gg__baseconvert", "XII", {}, FldAlphanumeric },
{ BIT_OF, "BIT-OF",
"__gg__bit_of", "X", {}, FldAlphanumeric },
@@ -81,9 +88,9 @@ static const function_descr_t function_descrs[] = {
{ COMBINED_DATETIME, "COMBINED-DATETIME",
"__gg__combined_datetime", "IN", {}, FldFloat },
{ CONCAT, "CONCAT",
"__gg__concat", "n", {}, FldAlphanumeric },
"__gg__concat", "n", {}, AnumOrNat },
{ CONVERT, "CONVERT",
"__gg__convert", "XII", {}, FldAlphanumeric },
"__gg__convert", "XII", {}, AnumOrNat },
{ COS, "COS",
"__gg__cos", "N", {}, FldFloat },
{ CURRENT_DATE, "CURRENT-DATE",
@@ -121,13 +128,13 @@ static const function_descr_t function_descrs[] = {
{ FIND_STRING, "FIND-STRING",
"__gg__find_string", "AXI", {}, FldNumericBin5 },
{ FORMATTED_CURRENT_DATE, "FORMATTED-CURRENT-DATE",
"__gg__formatted_current_date", "X", {}, FldAlphanumeric },
"__gg__formatted_current_date", "X", {}, AnumOrNat },
{ FORMATTED_DATE, "FORMATTED-DATE",
"__gg__formatted_date", "XX", {}, FldAlphanumeric },
"__gg__formatted_date", "XX", {}, AnumOrNat },
{ FORMATTED_DATETIME, "FORMATTED-DATETIME",
"__gg__formatted_datetime", "XINI", {}, FldAlphanumeric },
"__gg__formatted_datetime", "XINI", {}, AnumOrNat },
{ FORMATTED_TIME, "FORMATTED-TIME",
"__gg__formatted_time", "INI", {}, FldAlphanumeric },
"__gg__formatted_time", "INI", {}, AnumOrNat },
{ FRACTION_PART, "FRACTION-PART",
"__gg__fraction_part", "N", {}, FldFloat },
{ HEX_OF, "HEX-OF",
@@ -135,7 +142,7 @@ static const function_descr_t function_descrs[] = {
{ HEX_TO_CHAR, "HEX-TO-CHAR",
"__gg__hex_to_char", "X", {}, FldAlphanumeric },
{ HIGHEST_ALGEBRAIC, "HIGHEST-ALGEBRAIC",
"__gg__highest_algebraic", "N", {}, FldInvalid },
"__gg__highest_algebraic", "N", {}, IntOrNum },
{ INTEGER, "INTEGER",
"__gg__integer", "N", {}, FldNumericBin5 },
// requires FldBoolean
@@ -164,11 +171,11 @@ static const function_descr_t function_descrs[] = {
{ LOG10, "LOG10",
"__gg__log10", "N", {}, FldFloat },
{ LOWER_CASE, "LOWER-CASE",
"__gg__lower_case", "X", {}, FldAlphanumeric },
"__gg__lower_case", "X", {}, AnumOrNat },
{ LOWEST_ALGEBRAIC, "LOWEST-ALGEBRAIC",
"__gg__lowest_algebraic", "N", {}, FldInvalid },
"__gg__lowest_algebraic", "N", {}, IntOrNum },
{ MAXX, "MAX",
"__gg__max", "n", {}, FldInvalid },
"__gg__max", "n", {}, IntOrNum },
{ MEAN, "MEAN",
"__gg__mean", "n", {}, FldFloat },
{ MEDIAN, "MEDIAN",
@@ -176,7 +183,7 @@ static const function_descr_t function_descrs[] = {
{ MIDRANGE, "MIDRANGE",
"__gg__midrange", "n", {}, FldFloat },
{ MINN, "MIN",
"__gg__min", "n", {}, FldInvalid },
"__gg__min", "n", {}, IntOrNum },
{ MOD, "MOD",
"__gg__mod", "IN", {}, FldNumericBin5 },
{ MODULE_NAME, "MODULE-NAME",
@@ -202,11 +209,11 @@ static const function_descr_t function_descrs[] = {
{ RANDOM, "RANDOM",
"__gg__random", "I", {}, FldFloat },
{ RANGE, "RANGE",
"__gg__range", "n", {}, FldInvalid },
"__gg__range", "n", {}, IntOrNum },
{ REM, "REM",
"__gg__rem", "NN", {}, FldFloat },
{ REVERSE, "REVERSE",
"__gg__reverse", "X", {}, FldAlphanumeric },
"__gg__reverse", "X", {}, AnumOrNat },
{ SECONDS_FROM_FORMATTED_TIME, "SECONDS-FROM-FORMATTED-TIME",
"__gg__seconds_from_formatted_time", "XX", {}, FldFloat },
{ SECONDS_PAST_MIDNIGHT, "SECONDS_PAST_MIDNIGHT",
@@ -216,7 +223,7 @@ static const function_descr_t function_descrs[] = {
{ SIN, "SIN",
"__gg__sin", "N", {}, FldFloat },
{ SMALLEST_ALGEBRAIC, "SMALLEST-ALGEBRAIC",
"__gg__smallest_algebraic", "N", {}, FldInvalid },
"__gg__smallest_algebraic", "N", {}, IntOrNum },
{ SQRT, "SQRT",
"__gg__sqrt", "N", {}, FldFloat },
{ STANDARD_COMPARE, "STANDARD-COMPARE",
@@ -224,9 +231,9 @@ static const function_descr_t function_descrs[] = {
{ STANDARD_DEVIATION, "STANDARD-DEVIATION",
"__gg__standard_deviation", "n", {}, FldFloat },
{ SUBSTITUTE, "SUBSTITUTE",
"__gg__substitute", "XXX", {}, FldAlphanumeric },
"__gg__substitute", "XXX", {}, AnumOrNat },
{ SUM, "SUM",
"__gg__sum", "n", {}, FldInvalid },
"__gg__sum", "n", {}, IntOrNum },
{ TAN, "TAN",
"__gg__tan", "N", {}, FldFloat },
{ TEST_DATE_YYYYMMDD, "TEST-DATE-YYYYMMDD",
@@ -241,8 +248,8 @@ static const function_descr_t function_descrs[] = {
"__gg__test_numval_c", "XXU", {}, FldNumericBin5 },
{ TEST_NUMVAL_F, "TEST-NUMVAL-F",
"__gg__test_numval_f", "X", {}, FldNumericBin5 },
{ TRIM, "TRIM",
"__gg__trim", "XI", {}, FldNumericBin5 },
{ TRIM, "TRIM", // See parse.y
"__gg__trim", "XI", {}, FldAlphanumeric },
{ ULENGTH, "ULENGTH",
"__gg__ulength", "X", {}, FldAlphanumeric },
{ UPOS, "UPOS",
@@ -342,7 +349,8 @@ intrinsic_return_field(int token, std::vector<cbl_refer_t> args)
retval = new_tempnumeric_float();
break;
case FldInvalid:
// This is a flag that a function takes the type of its first input
// This is a flag that a function takes the Numeric vs Int type of its
// first argument
assert( args.size() );
switch(args[0].field->type)
{
@@ -350,7 +358,7 @@ intrinsic_return_field(int token, std::vector<cbl_refer_t> args)
case FldAlphanumeric:
case FldAlphaEdited:
case FldLiteralA:
retval = new_alphanumeric();
retval = new_alphanumeric(NULL, args[0].field->codeset.encoding);
break;
case FldNumericBinary:
case FldPacked:
@@ -370,6 +378,36 @@ intrinsic_return_field(int token, std::vector<cbl_refer_t> args)
break;
}
break;
case FldGroup:
// This is a flag that an alphanumeric function takes the encoding of the
// first argument
assert( args.size() );
switch(args[0].field->type)
{
case FldGroup:
case FldAlphanumeric:
case FldAlphaEdited:
case FldLiteralA:
case FldNumericBinary:
case FldPacked:
case FldNumericDisplay:
case FldNumericBin5:
case FldLiteralN:
case FldIndex:
case FldPointer:
retval = new_alphanumeric(NULL, args[0].field->codeset.encoding);
break;
case FldFloat:
retval = new_tempnumeric_float();
break;
default:
retval = NULL;
gcc_unreachable();
break;
}
break;
default:
retval = NULL;
gcc_unreachable();

View File

@@ -3783,14 +3783,22 @@ symbol_temporaries_free() {
cbl_field_t *
new_alphanumeric( const cbl_name_t name, cbl_encoding_t encoding ) {
cbl_field_t * field = new_temporary_impl(FldAlphanumeric, name);
//// if( encoding != no_encoding_e ) {
//// field->codeset.set(encoding);
//// }
//// //// Dubner hacking away: If name is non-null, then assume this is a
//// //// function definition, and force the codeset, which otherwise will have
//// //// defaulted to current_encoding('A'), and the valid() test in codeset.set
//// //// will have prevented it from being changed.
//// if( name && encoding != no_encoding_e ) {
//// field->codeset.set_explicit(encoding);
//// }
/* Jim's original code was hedged with protections apparently intended to
prevent encodings from changing. This proved unsatisfactor, especially
when I started implementing setting the temporary return type of functions
that take on the characteristics of their first parameter. So, I went
from codeset.set_encoding() to codeset.set_explicit(). */
if( encoding != no_encoding_e ) {
field->codeset.set(encoding);
}
//// Dubner hacking away: If name is non-null, then assume this is a
//// function definition, and force the codeset, which otherwise will have
//// defaulted to current_encoding('A'), and the valid() test in codeset.set
//// will have prevented it from being changed.
if( name && encoding != no_encoding_e ) {
field->codeset.set_explicit(encoding);
}
temporaries.add(field);

View File

@@ -3527,9 +3527,11 @@ __gg__trim( cblc_field_t *dest,
size_t arg2_offset,
size_t arg2_size)
{
cbl_encoding_t from = arg1->encoding;
cbl_encoding_t to = dest->encoding;
charmap_t *charmap = __gg__get_charmap(to);
// We assume that dest is an intermediate_e with the same encoding as arg1.
assert( dest->type == FldAlphanumeric
&& (dest->attr & intermediate_e)
&& dest->encoding == arg1->encoding );
charmap_t *charmap = __gg__get_charmap(arg1->encoding);
int stride = charmap->stride();
cbl_char_t mapped_space = charmap->mapped_character(ascii_space);
@@ -3539,80 +3541,61 @@ __gg__trim( cblc_field_t *dest,
arg2_offset,
arg2_size);
//static const int BOTH = 0;
static const int LEADING = 1; // Remove leading spaces
static const int TRAILING = 2; // Remove trailing spaces
#define LEADING 1 // Remove leading spaces
#define TRAILING 2 // Remove trailing spaces
if( dest->type != FldAlphanumeric ||
!(dest->attr & intermediate_e) )
char *left = reinterpret_cast<char *>(arg1->data) + arg1_offset;
char *right = left + arg1_size-stride; // Points AT the character, not beyond
switch(type)
{
fprintf(stderr,
"We expect the target of a FUNCTION TRIM to "
"be an intermediate alphanumeric\n");
abort();
}
// What is this all about?
dest->capacity = dest->offset;
// Make a copy of the input:
char *copy = static_cast<char *>(malloc(arg1_size));
massert(copy);
memcpy(copy, arg1->data+arg1_offset, arg1_size);
// Convert it to the destination encoding
__gg__convert_encoding_length(copy, arg1_size, from, to);
// No matter what, we want to find the leftmost non-space and the
// rightmost non-space:
char *left = copy;
char *right = left + arg1_size-stride;
// Find left and right: the first and last non-spaces
while( left <= right )
{
cbl_char_t cleft = charmap->getch(left, (size_t)0);
cbl_char_t cright = charmap->getch(right, (size_t)0);
if( cleft != mapped_space && cright != mapped_space )
case 0: // Strip off leading and trailing spaces
while(left <= right)
{
if( charmap->getch(left, (size_t)0) != mapped_space )
{
break;
}
left += stride;
}
while(left <= right)
{
if( charmap->getch(right, (size_t)0) != mapped_space )
{
break;
}
right -= stride;
}
break;
case LEADING: // Just leading
{
while(left <= right)
{
if( charmap->getch(left, (size_t)0) != mapped_space )
{
break;
}
left += stride;
}
break;
}
if( cleft == mapped_space )
case TRAILING: // Just trailing
{
left += stride;
}
if( cright == mapped_space )
{
right -= stride;
while(left <= right)
{
if( charmap->getch(right, (size_t)0) != mapped_space )
{
break;
}
right -= stride;
}
break;
}
}
if( type == LEADING )
{
// We want to leave any trailing spaces, so we return 'right' to its
// original value:
right = copy + arg1_size-1;
}
else if( type == TRAILING )
{
// We want to leave any leading spaces, so we return 'left' to its
// original value:
left = copy;
}
if( left > right )
{
// When the arg1 input string was empty, we want left to be right+1.
// The left/right loop can sometimes end up with left equal to right+2.
// That needs to be fixed:
left = right+stride;
}
size_t ncount = right+stride - left;
__gg__adjust_dest_size(dest, ncount);
memmove(dest->data, left, ncount);
free(copy);
}
#if HAVE_INITSTATE_R && HAVE_SRANDOM_R && HAVE_RANDOM_R