Tcl Source Code

Check-in [576177cd54]
Login
Bounty program for improvements to Tcl and certain Tcl packages.
Tcl 2019 Conference, Houston/TX, US, Nov 4-8
Send your abstracts to [email protected]
or submit via the online form by Sep 9.

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:[d2ffcca163] Limit parsing results that are documented to accept only ASCII chars to actually follow that constraint. This requires not trusting isalnum(.) and isalpha(.) to deliver portable identical results.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | potential incompatibility
Files: files | file ages | folders
SHA1: 576177cd54083e79a1e37649b3f7ebc2eaee8dd3
User & Date: dgp 2014-12-05 12:28:01
Context
2014-12-05
21:28
Ever since (Tcl)PushVarName() stopped making a recursive call to Tcl_ParseCommand() (in the pre-8.4.... check-in: 5c3b0b33f0 user: dgp tags: trunk
15:05
Bringing core_zip_vfs up to date with trunk check-in: 52241094a3 user: hypnotoad tags: core_zip_vfs
12:28
[d2ffcca163] Limit parsing results that are documented to accept only ASCII chars to actually follow... check-in: 576177cd54 user: dgp tags: trunk, potential incompatibility
2014-12-04
22:10
[d2ffcca163] Limit parsing results that are documented to accept only ASCII chars to actually follow... check-in: 02260e5f9f user: dgp tags: core-8-5-branch, potential incompatibility
2014-12-03
23:55
And then another merge mark to finish. check-in: e6dd11413c user: dgp tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to generic/tclCompExpr.c.

1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991

1992
1993
1994
1995
1996
1997
1998
....
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
....
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
....
2050
2051
2052
2053
2054
2055
2056







2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
	    }
	}
	*lexemePtr = GREATER;
	return 1;

    case 'i':
	if ((numBytes > 1) && (start[1] == 'n')
		&& ((numBytes == 2) || !isalpha(UCHAR(start[2])))) {
	    /*
	     * Must make this check so we can tell the difference between the
	     * "in" operator and the "int" function name and the "infinity"
	     * numeric value.
	     */

	    *lexemePtr = IN_LIST;
	    return 2;
	}
	break;

    case 'e':
	if ((numBytes > 1) && (start[1] == 'q')
		&& ((numBytes == 2) || !isalpha(UCHAR(start[2])))) {
	    *lexemePtr = STREQ;
	    return 2;
	}
	break;

    case 'n':
	if ((numBytes > 1) && ((numBytes == 2) || !isalpha(UCHAR(start[2])))) {

	    switch (start[1]) {
	    case 'e':
		*lexemePtr = STRNEQ;
		return 2;
	    case 'i':
		*lexemePtr = NOT_IN_LIST;
		return 2;
................................................................................
	    }
	}
    }

    literal = Tcl_NewObj();
    if (TclParseNumber(NULL, literal, NULL, start, numBytes, &end,
	    TCL_PARSE_NO_WHITESPACE) == TCL_OK) {
	if (end < start + numBytes && !isalnum(UCHAR(*end))
		&& UCHAR(*end) != '_') {
	
	number:
	    TclInitStringRep(literal, start, end-start);
	    *lexemePtr = NUMBER;
	    if (literalPtr) {
		*literalPtr = literal;
	    } else {
................................................................................
	     * Example: Inf + luence + () becomes a valid function call.
	     * [Bug 3401704]
	     */
	    if (literal->typePtr == &tclDoubleType) {
		const char *p = start;

		while (p < end) {
		    if (!isalnum(UCHAR(*p++))) {
			/*
			 * The number has non-bareword characters, so we 
			 * must treat it as a number.
			 */
			goto number;
		    }
		}
................................................................................

	    /*
	     * Otherwise, fall through and parse the whole as a bareword.
	     */
	}
    }








    if (Tcl_UtfCharComplete(start, numBytes)) {
	scanned = Tcl_UtfToUniChar(start, &ch);
    } else {
	char utfBytes[TCL_UTF_MAX];

	memcpy(utfBytes, start, (size_t) numBytes);
	utfBytes[numBytes] = '\0';
	scanned = Tcl_UtfToUniChar(utfBytes, &ch);
    }
    if (!isalnum(UCHAR(ch))) {
	*lexemePtr = INVALID;
	Tcl_DecrRefCount(literal);
	return scanned;
    }
    end = start;
    while (isalnum(UCHAR(ch)) || (UCHAR(ch) == '_')) {
	end += scanned;
	numBytes -= scanned;
	if (Tcl_UtfCharComplete(end, numBytes)) {
	    scanned = Tcl_UtfToUniChar(end, &ch);
	} else {
	    char utfBytes[TCL_UTF_MAX];

	    memcpy(utfBytes, end, (size_t) numBytes);
	    utfBytes[numBytes] = '\0';
	    scanned = Tcl_UtfToUniChar(utfBytes, &ch);
	}
    }
    *lexemePtr = BAREWORD;
    if (literalPtr) {
	Tcl_SetStringObj(literal, start, (int) (end-start));
	*literalPtr = literal;
    } else {
	Tcl_DecrRefCount(literal);






|













|






|
>







 







|
<







 







|







 







>
>
>
>
>
>
>
|
|
|
|

|
|
|
|
<





|
|
|
<
<
<
<
<
<
<
<
<







1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
....
2000
2001
2002
2003
2004
2005
2006
2007

2008
2009
2010
2011
2012
2013
2014
....
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
....
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072

2073
2074
2075
2076
2077
2078
2079
2080









2081
2082
2083
2084
2085
2086
2087
	    }
	}
	*lexemePtr = GREATER;
	return 1;

    case 'i':
	if ((numBytes > 1) && (start[1] == 'n')
		&& ((numBytes == 2) || start[2] & 0x80 || !isalpha(start[2]))) {
	    /*
	     * Must make this check so we can tell the difference between the
	     * "in" operator and the "int" function name and the "infinity"
	     * numeric value.
	     */

	    *lexemePtr = IN_LIST;
	    return 2;
	}
	break;

    case 'e':
	if ((numBytes > 1) && (start[1] == 'q')
		&& ((numBytes == 2) || start[2] & 0x80 || !isalpha(start[2]))) {
	    *lexemePtr = STREQ;
	    return 2;
	}
	break;

    case 'n':
	if ((numBytes > 1)
		&& ((numBytes == 2) || start[2] & 0x80 || !isalpha(start[2]))) {
	    switch (start[1]) {
	    case 'e':
		*lexemePtr = STRNEQ;
		return 2;
	    case 'i':
		*lexemePtr = NOT_IN_LIST;
		return 2;
................................................................................
	    }
	}
    }

    literal = Tcl_NewObj();
    if (TclParseNumber(NULL, literal, NULL, start, numBytes, &end,
	    TCL_PARSE_NO_WHITESPACE) == TCL_OK) {
	if (end < start + numBytes && !TclIsBareword(*end)) {

	
	number:
	    TclInitStringRep(literal, start, end-start);
	    *lexemePtr = NUMBER;
	    if (literalPtr) {
		*literalPtr = literal;
	    } else {
................................................................................
	     * Example: Inf + luence + () becomes a valid function call.
	     * [Bug 3401704]
	     */
	    if (literal->typePtr == &tclDoubleType) {
		const char *p = start;

		while (p < end) {
		    if (!TclIsBareword(*p++)) {
			/*
			 * The number has non-bareword characters, so we 
			 * must treat it as a number.
			 */
			goto number;
		    }
		}
................................................................................

	    /*
	     * Otherwise, fall through and parse the whole as a bareword.
	     */
	}
    }

    /*
     * We reject leading underscores in bareword.  No sensible reason why.
     * Might be inspired by reserved identifier rules in C, which of course
     * have no direct relevance here.
     */  

    if (!TclIsBareword(*start) || *start == '_') {
	if (Tcl_UtfCharComplete(start, numBytes)) {
	    scanned = Tcl_UtfToUniChar(start, &ch);
	} else {
	    char utfBytes[TCL_UTF_MAX];

	    memcpy(utfBytes, start, (size_t) numBytes);
	    utfBytes[numBytes] = '\0';
	    scanned = Tcl_UtfToUniChar(utfBytes, &ch);
	}

	*lexemePtr = INVALID;
	Tcl_DecrRefCount(literal);
	return scanned;
    }
    end = start;
    while (numBytes && TclIsBareword(*end)) {
	end += 1;
	numBytes -= 1;









    }
    *lexemePtr = BAREWORD;
    if (literalPtr) {
	Tcl_SetStringObj(literal, start, (int) (end-start));
	*literalPtr = literal;
    } else {
	Tcl_DecrRefCount(literal);

Changes to generic/tclInt.h.

2983
2984
2985
2986
2987
2988
2989

2990
2991
2992
2993
2994
2995
2996
MODULE_SCOPE void	TclInitNamespaceSubsystem(void);
MODULE_SCOPE void	TclInitNotifier(void);
MODULE_SCOPE void	TclInitObjSubsystem(void);
MODULE_SCOPE void	TclInitSubsystems(void);
MODULE_SCOPE int	TclInterpReady(Tcl_Interp *interp);
MODULE_SCOPE int	TclIsLocalScalar(const char *src, int len);
MODULE_SCOPE int	TclIsSpaceProc(char byte);

MODULE_SCOPE Tcl_Obj *	TclJoinPath(int elements, Tcl_Obj * const objv[]);
MODULE_SCOPE int	TclJoinThread(Tcl_ThreadId id, int *result);
MODULE_SCOPE void	TclLimitRemoveAllHandlers(Tcl_Interp *interp);
MODULE_SCOPE Tcl_Obj *	TclLindexList(Tcl_Interp *interp,
			    Tcl_Obj *listPtr, Tcl_Obj *argPtr);
MODULE_SCOPE Tcl_Obj *	TclLindexFlat(Tcl_Interp *interp, Tcl_Obj *listPtr,
			    int indexCount, Tcl_Obj *const indexArray[]);






>







2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
MODULE_SCOPE void	TclInitNamespaceSubsystem(void);
MODULE_SCOPE void	TclInitNotifier(void);
MODULE_SCOPE void	TclInitObjSubsystem(void);
MODULE_SCOPE void	TclInitSubsystems(void);
MODULE_SCOPE int	TclInterpReady(Tcl_Interp *interp);
MODULE_SCOPE int	TclIsLocalScalar(const char *src, int len);
MODULE_SCOPE int	TclIsSpaceProc(char byte);
MODULE_SCOPE int	TclIsBareword(char byte);
MODULE_SCOPE Tcl_Obj *	TclJoinPath(int elements, Tcl_Obj * const objv[]);
MODULE_SCOPE int	TclJoinThread(Tcl_ThreadId id, int *result);
MODULE_SCOPE void	TclLimitRemoveAllHandlers(Tcl_Interp *interp);
MODULE_SCOPE Tcl_Obj *	TclLindexList(Tcl_Interp *interp,
			    Tcl_Obj *listPtr, Tcl_Obj *argPtr);
MODULE_SCOPE Tcl_Obj *	TclLindexFlat(Tcl_Interp *interp, Tcl_Obj *listPtr,
			    int indexCount, Tcl_Obj *const indexArray[]);

Changes to generic/tclParse.c.

617
618
619
620
621
622
623









































624
625
626
627
628
629
630
....
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
....
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
{
    return CHAR_TYPE(byte) & (TYPE_SPACE) || byte == '\n';
}
 
/*
 *----------------------------------------------------------------------
 *









































 * ParseWhiteSpace --
 *
 *	Scans up to numBytes bytes starting at src, consuming white space
 *	between words as defined by Tcl's parsing rules.
 *
 * Results:
 *	Returns the number of bytes recognized as white space. Records at
................................................................................
    int append)			/* Non-zero means append tokens to existing
				 * information in parsePtr; zero means ignore
				 * existing tokens in parsePtr and
				 * reinitialize it. */
{
    Tcl_Token *tokenPtr;
    register const char *src;
    unsigned char c;
    int varIndex, offset;
    Tcl_UniChar ch;
    unsigned array;

    if ((numBytes == 0) || (start == NULL)) {
	return TCL_ERROR;
    }
    if (numBytes < 0) {
	numBytes = strlen(start);
................................................................................
	src++;
    } else {
	tokenPtr->type = TCL_TOKEN_TEXT;
	tokenPtr->start = src;
	tokenPtr->numComponents = 0;

	while (numBytes) {
	    if (Tcl_UtfCharComplete(src, numBytes)) {
		offset = Tcl_UtfToUniChar(src, &ch);
	    } else {
		char utfBytes[TCL_UTF_MAX];

		memcpy(utfBytes, src, (size_t) numBytes);
		utfBytes[numBytes] = '\0';
		offset = Tcl_UtfToUniChar(utfBytes, &ch);
	    }
	    c = UCHAR(ch);
	    if (isalnum(c) || (c == '_')) {	/* INTL: ISO only, UCHAR. */
		src += offset;
		numBytes -= offset;
		continue;
	    }
	    if ((c == ':') && (numBytes != 1) && (src[1] == ':')) {
		src += 2;
		numBytes -= 2;
		while (numBytes && (*src == ':')) {
		    src++;
		    numBytes--;
		}
		continue;






>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







<
|
<







 







|
<
<
<
<
<
<
<
<
<
<
|
|


|







617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
....
1383
1384
1385
1386
1387
1388
1389

1390

1391
1392
1393
1394
1395
1396
1397
....
1466
1467
1468
1469
1470
1471
1472
1473










1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
{
    return CHAR_TYPE(byte) & (TYPE_SPACE) || byte == '\n';
}
 
/*
 *----------------------------------------------------------------------
 *
 * TclIsBareword--
 *
 *	Report whether byte is one that can be part of a "bareword".
 *	This concept is named in expression parsing, where it determines
 *	what can be a legal function name, but is the same definition used
 *	in determining what variable names can be parsed as variable
 *	substitutions without the benefit of enclosing braces.  The set of
 *	ASCII chars that are accepted are the numeric chars ('0'-'9'),
 *	the alphabetic chars ('a'-'z', 'A'-'Z')	and underscore ('_').
 *
 * Results:
 *	Returns 1, if byte is in the accepted set of chars, 0 otherwise.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

int
TclIsBareword(
    char byte)
{
    if (byte < '0' || byte > 'z') {
	return 0;
    }
    if (byte <= '9' || byte >= 'a') {
	return 1;
    }
    if (byte == '_') {
	return 1;
    }
    if (byte < 'A' || byte > 'Z') {
	return 0;
    }
    return 1;
}
 
/*
 *----------------------------------------------------------------------
 *
 * ParseWhiteSpace --
 *
 *	Scans up to numBytes bytes starting at src, consuming white space
 *	between words as defined by Tcl's parsing rules.
 *
 * Results:
 *	Returns the number of bytes recognized as white space. Records at
................................................................................
    int append)			/* Non-zero means append tokens to existing
				 * information in parsePtr; zero means ignore
				 * existing tokens in parsePtr and
				 * reinitialize it. */
{
    Tcl_Token *tokenPtr;
    register const char *src;

    int varIndex;

    unsigned array;

    if ((numBytes == 0) || (start == NULL)) {
	return TCL_ERROR;
    }
    if (numBytes < 0) {
	numBytes = strlen(start);
................................................................................
	src++;
    } else {
	tokenPtr->type = TCL_TOKEN_TEXT;
	tokenPtr->start = src;
	tokenPtr->numComponents = 0;

	while (numBytes) {
	    if (TclIsBareword(*src)) {










		src += 1;
		numBytes -= 1;
		continue;
	    }
	    if ((src[0] == ':') && (numBytes != 1) && (src[1] == ':')) {
		src += 2;
		numBytes -= 2;
		while (numBytes && (*src == ':')) {
		    src++;
		    numBytes--;
		}
		continue;

Changes to tests/parse.test.

659
660
661
662
663
664
665



666
667
668
669
670
671
672
} {1 {missing )} {missing )
    (remainder of script: "(cd)")
    invoked from within
"testparsevarname {$ab(cd)} 6 0"}}
test parse-12.25 {Tcl_ParseVarName procedure, nested array reference} testparser {
    testparser {$x(a$y(b$z))} 0
} {- {$x(a$y(b$z))} 1 word {$x(a$y(b$z))} 8 variable {$x(a$y(b$z))} 7 text x 0 text a 0 variable {$y(b$z)} 4 text y 0 text b 0 variable {$z} 1 text z 0 {}}




test parse-13.1 {Tcl_ParseVar procedure} testparsevar {
    set abc 24
    testparsevar {$abc.fg}
} {24 .fg}
test parse-13.2 {Tcl_ParseVar procedure, no variable name} testparsevar {
    testparsevar {$}






>
>
>







659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
} {1 {missing )} {missing )
    (remainder of script: "(cd)")
    invoked from within
"testparsevarname {$ab(cd)} 6 0"}}
test parse-12.25 {Tcl_ParseVarName procedure, nested array reference} testparser {
    testparser {$x(a$y(b$z))} 0
} {- {$x(a$y(b$z))} 1 word {$x(a$y(b$z))} 8 variable {$x(a$y(b$z))} 7 text x 0 text a 0 variable {$y(b$z)} 4 text y 0 text b 0 variable {$z} 1 text z 0 {}}
test parse-12.26 {Tcl_ParseVarName [d2ffcca163] non-ascii} testparser {
    testparser "$\u0433" -1
} "- {$\u0433} 1 word {$\u0433} 2 text {$} 0 text \u0433 0 {}"

test parse-13.1 {Tcl_ParseVar procedure} testparsevar {
    set abc 24
    testparsevar {$abc.fg}
} {24 .fg}
test parse-13.2 {Tcl_ParseVar procedure, no variable name} testparsevar {
    testparsevar {$}

Changes to tests/parseExpr.test.

1060
1061
1062
1063
1064
1065
1066









1067
1068
1069
1070
    dict get $o -errorcode
} -result {TCL PARSE EXPR BADNUMBER BINARY}
test parseExpr-22.18 {Bug 3401704} -constraints testexprparser -body {
    catch {testexprparser 0b02 -1} m o
    dict get $o -errorcode
} -result {TCL PARSE EXPR BADNUMBER BINARY}











# cleanup
cleanupTests
return






>
>
>
>
>
>
>
>
>




1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
    dict get $o -errorcode
} -result {TCL PARSE EXPR BADNUMBER BINARY}
test parseExpr-22.18 {Bug 3401704} -constraints testexprparser -body {
    catch {testexprparser 0b02 -1} m o
    dict get $o -errorcode
} -result {TCL PARSE EXPR BADNUMBER BINARY}

test parseExpr-22.19 {Bug d2ffcca163} -constraints testexprparser -body {
    testexprparser \u0433 -1
} -returnCodes error -match glob -result {*invalid character*}
test parseExpr-22.20 {Bug d2ffcca163} -constraints testexprparser -body {
    testexprparser \u043f -1
} -returnCodes error -match glob -result {*invalid character*}
test parseExpr-22.21 {Bug d2ffcca163} -constraints testexprparser -body {
    testexprparser in\u0433(0) -1
} -returnCodes error -match glob -result {missing operand*}

# cleanup
cleanupTests
return