Tcl Source Code

Check-in [02260e5f9f]
Login
Bounty program for improvements to Tcl and certain Tcl packages.
Tcl 2019 Conference, Houston/TX, US, Nov 4-8
Send your abstracts to [email protected]
or submit via the online form by Sep 9.

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:[d2ffcca163] Limit parsing results that are documented to accept only ASCII chars to actually follow that constraint. This requires not trusting isalnum(.) and isalpha(.) to deliver portable identical results.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | core-8-5-branch | potential incompatibility
Files: files | file ages | folders
SHA1: 02260e5f9fc4429ca93aa6d18dd5cdc69ed7d1c9
User & Date: dgp 2014-12-04 22:10:57
Context
2014-12-05
21:05
Ever since (Tcl)PushVarName() stopped making a recursive call to Tcl_ParseCommand() (in the pre-8.4.... check-in: 189de46e3a user: dgp tags: core-8-5-branch
12:28
[d2ffcca163] Limit parsing results that are documented to accept only ASCII chars to actually follow... check-in: 576177cd54 user: dgp tags: trunk, potential incompatibility
2014-12-04
22:10
[d2ffcca163] Limit parsing results that are documented to accept only ASCII chars to actually follow... check-in: 02260e5f9f user: dgp tags: core-8-5-branch, potential incompatibility
21:29
Limit isalpha(.) calls in the expr parser to only apply to known ASCII arguments to make the results... Closed-Leaf check-in: 3b91a3e3f4 user: dgp tags: bug-d2ffcca163
2014-12-03
23:46
[0c043a175] backport fix from trunk. check-in: 2f83d4eae5 user: dgp tags: core-8-5-branch
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to generic/tclCompExpr.c.

1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
....
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908

1909
1910
1911
1912
1913
1914
1915
....
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
....
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
....
1965
1966
1967
1968
1969
1970
1971







1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
	    }
	}
	*lexemePtr = GREATER;
	return 1;

    case 'i':
	if ((numBytes > 1) && (start[1] == 'n')
		&& ((numBytes == 2) || !isalpha(UCHAR(start[2])))) {

	    /*
	     * Must make this check so we can tell the difference between
	     * the "in" operator and the "int" function name and the
	     * "infinity" numeric value.
	     */

................................................................................
	    *lexemePtr = IN_LIST;
	    return 2;
	}
	break;

    case 'e':
	if ((numBytes > 1) && (start[1] == 'q')
		&& ((numBytes == 2) || !isalpha(UCHAR(start[2])))) {
	    *lexemePtr = STREQ;
	    return 2;
	}
	break;

    case 'n':
	if ((numBytes > 1) && ((numBytes == 2) || !isalpha(UCHAR(start[2])))) {

	    switch (start[1]) {
	    case 'e':
		*lexemePtr = STRNEQ;
		return 2;
	    case 'i':
		*lexemePtr = NOT_IN_LIST;
		return 2;
................................................................................
	    }
	}
    }

    literal = Tcl_NewObj();
    if (TclParseNumber(NULL, literal, NULL, start, numBytes, &end,
	    TCL_PARSE_NO_WHITESPACE) == TCL_OK) {
	if (end < start + numBytes && !isalnum(UCHAR(*end))
		&& UCHAR(*end) != '_') {
	
	number:
	    TclInitStringRep(literal, start, end-start);
	    *lexemePtr = NUMBER;
	    if (literalPtr) {
		*literalPtr = literal;
	    } else {
................................................................................
	     * bareword syntax error?  Or should we join into one bareword?
	     * Example: Inf + luence + () becomes a valid function call.
	     * [Bug 3401704]
	     */
	    if (literal->typePtr == &tclDoubleType) {
		const char *p = start;
		while (p < end) {
		    if (!isalnum(UCHAR(*p++))) {
			/*
			 * The number has non-bareword characters, so we 
			 * must treat it as a number.
			 */
			goto number;
		    }
		}
................................................................................
	    }
	    /*
	     * Otherwise, fall through and parse the whole as a bareword.
	     */
	}
    }








    if (Tcl_UtfCharComplete(start, numBytes)) {
	scanned = Tcl_UtfToUniChar(start, &ch);
    } else {
	char utfBytes[TCL_UTF_MAX];
	memcpy(utfBytes, start, (size_t) numBytes);
	utfBytes[numBytes] = '\0';
	scanned = Tcl_UtfToUniChar(utfBytes, &ch);
    }
    if (!isalnum(UCHAR(ch))) {
	*lexemePtr = INVALID;
	Tcl_DecrRefCount(literal);
	return scanned;
    }
    end = start;
    while (isalnum(UCHAR(ch)) || (UCHAR(ch) == '_')) {
	end += scanned;
	numBytes -= scanned;
	if (Tcl_UtfCharComplete(end, numBytes)) {
	    scanned = Tcl_UtfToUniChar(end, &ch);
	} else {
	    char utfBytes[TCL_UTF_MAX];
	    memcpy(utfBytes, end, (size_t) numBytes);
	    utfBytes[numBytes] = '\0';
	    scanned = Tcl_UtfToUniChar(utfBytes, &ch);
	}
    }
    *lexemePtr = BAREWORD;
    if (literalPtr) {
	Tcl_SetStringObj(literal, start, (int) (end-start));
	*literalPtr = literal;
    } else {
	Tcl_DecrRefCount(literal);






|







 







|






|
>







 







|
<







 







|







 







>
>
>
>
>
>
>
|
|
|
|
|
|
|
|
<





|
|
|
<
<
<
<
<
<
<
<







1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
....
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
....
1917
1918
1919
1920
1921
1922
1923
1924

1925
1926
1927
1928
1929
1930
1931
....
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
....
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986

1987
1988
1989
1990
1991
1992
1993
1994








1995
1996
1997
1998
1999
2000
2001
	    }
	}
	*lexemePtr = GREATER;
	return 1;

    case 'i':
	if ((numBytes > 1) && (start[1] == 'n')
		&& ((numBytes == 2) || start[2] & 0x80 || !isalpha(start[2]))) {

	    /*
	     * Must make this check so we can tell the difference between
	     * the "in" operator and the "int" function name and the
	     * "infinity" numeric value.
	     */

................................................................................
	    *lexemePtr = IN_LIST;
	    return 2;
	}
	break;

    case 'e':
	if ((numBytes > 1) && (start[1] == 'q')
		&& ((numBytes == 2) || start[2] & 0x80 || !isalpha(start[2]))) {
	    *lexemePtr = STREQ;
	    return 2;
	}
	break;

    case 'n':
	if ((numBytes > 1)
		&& ((numBytes == 2) || start[2] & 0x80 || !isalpha(start[2]))) {
	    switch (start[1]) {
	    case 'e':
		*lexemePtr = STRNEQ;
		return 2;
	    case 'i':
		*lexemePtr = NOT_IN_LIST;
		return 2;
................................................................................
	    }
	}
    }

    literal = Tcl_NewObj();
    if (TclParseNumber(NULL, literal, NULL, start, numBytes, &end,
	    TCL_PARSE_NO_WHITESPACE) == TCL_OK) {
	if (end < start + numBytes && !TclIsBareword(*end)) {

	
	number:
	    TclInitStringRep(literal, start, end-start);
	    *lexemePtr = NUMBER;
	    if (literalPtr) {
		*literalPtr = literal;
	    } else {
................................................................................
	     * bareword syntax error?  Or should we join into one bareword?
	     * Example: Inf + luence + () becomes a valid function call.
	     * [Bug 3401704]
	     */
	    if (literal->typePtr == &tclDoubleType) {
		const char *p = start;
		while (p < end) {
		    if (!TclIsBareword(*p++)) {
			/*
			 * The number has non-bareword characters, so we 
			 * must treat it as a number.
			 */
			goto number;
		    }
		}
................................................................................
	    }
	    /*
	     * Otherwise, fall through and parse the whole as a bareword.
	     */
	}
    }

    /*
     * We reject leading underscores in bareword.  No sensible reason why.
     * Might be inspired by reserved identifier rules in C, which of course
     * have no direct relevance here.
     */  

    if (!TclIsBareword(*start) || *start == '_') {
	if (Tcl_UtfCharComplete(start, numBytes)) {
	    scanned = Tcl_UtfToUniChar(start, &ch);
	} else {
	    char utfBytes[TCL_UTF_MAX];
	    memcpy(utfBytes, start, (size_t) numBytes);
	    utfBytes[numBytes] = '\0';
	    scanned = Tcl_UtfToUniChar(utfBytes, &ch);
	}

	*lexemePtr = INVALID;
	Tcl_DecrRefCount(literal);
	return scanned;
    }
    end = start;
    while (numBytes && TclIsBareword(*end)) {
	end += 1;
	numBytes -= 1;








    }
    *lexemePtr = BAREWORD;
    if (literalPtr) {
	Tcl_SetStringObj(literal, start, (int) (end-start));
	*literalPtr = literal;
    } else {
	Tcl_DecrRefCount(literal);

Changes to generic/tclInt.h.

2604
2605
2606
2607
2608
2609
2610

2611
2612
2613
2614
2615
2616
2617
MODULE_SCOPE void	TclInitNamespaceSubsystem(void);
MODULE_SCOPE void	TclInitNotifier(void);
MODULE_SCOPE void	TclInitObjSubsystem(void);
MODULE_SCOPE void	TclInitSubsystems(void);
MODULE_SCOPE int	TclInterpReady(Tcl_Interp *interp);
MODULE_SCOPE int	TclIsLocalScalar(const char *src, int len);
MODULE_SCOPE int	TclIsSpaceProc(char byte);

MODULE_SCOPE int	TclJoinThread(Tcl_ThreadId id, int *result);
MODULE_SCOPE void	TclLimitRemoveAllHandlers(Tcl_Interp *interp);
MODULE_SCOPE Tcl_Obj *	TclLindexList(Tcl_Interp *interp,
			    Tcl_Obj *listPtr, Tcl_Obj *argPtr);
MODULE_SCOPE Tcl_Obj *	TclLindexFlat(Tcl_Interp *interp, Tcl_Obj *listPtr,
			    int indexCount, Tcl_Obj *const indexArray[]);
/* TIP #280 */






>







2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
MODULE_SCOPE void	TclInitNamespaceSubsystem(void);
MODULE_SCOPE void	TclInitNotifier(void);
MODULE_SCOPE void	TclInitObjSubsystem(void);
MODULE_SCOPE void	TclInitSubsystems(void);
MODULE_SCOPE int	TclInterpReady(Tcl_Interp *interp);
MODULE_SCOPE int	TclIsLocalScalar(const char *src, int len);
MODULE_SCOPE int	TclIsSpaceProc(char byte);
MODULE_SCOPE int	TclIsBareword(char byte);
MODULE_SCOPE int	TclJoinThread(Tcl_ThreadId id, int *result);
MODULE_SCOPE void	TclLimitRemoveAllHandlers(Tcl_Interp *interp);
MODULE_SCOPE Tcl_Obj *	TclLindexList(Tcl_Interp *interp,
			    Tcl_Obj *listPtr, Tcl_Obj *argPtr);
MODULE_SCOPE Tcl_Obj *	TclLindexFlat(Tcl_Interp *interp, Tcl_Obj *listPtr,
			    int indexCount, Tcl_Obj *const indexArray[]);
/* TIP #280 */

Changes to generic/tclParse.c.

624
625
626
627
628
629
630









































631
632
633
634
635
636
637
....
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
....
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
{
    return CHAR_TYPE(byte) & (TYPE_SPACE) || byte == '\n';
}
 
/*
 *----------------------------------------------------------------------
 *









































 * ParseWhiteSpace --
 *
 *	Scans up to numBytes bytes starting at src, consuming white space
 *	between words as defined by Tcl's parsing rules.
 *
 * Results:
 *	Returns the number of bytes recognized as white space. Records at
................................................................................
    int append)			/* Non-zero means append tokens to existing
				 * information in parsePtr; zero means ignore
				 * existing tokens in parsePtr and
				 * reinitialize it. */
{
    Tcl_Token *tokenPtr;
    register const char *src;
    unsigned char c;
    int varIndex, offset;
    Tcl_UniChar ch;
    unsigned array;

    if ((numBytes == 0) || (start == NULL)) {
	return TCL_ERROR;
    }
    if (numBytes < 0) {
	numBytes = strlen(start);
................................................................................
	src++;
    } else {
	tokenPtr->type = TCL_TOKEN_TEXT;
	tokenPtr->start = src;
	tokenPtr->numComponents = 0;

	while (numBytes) {
	    if (Tcl_UtfCharComplete(src, numBytes)) {
		offset = Tcl_UtfToUniChar(src, &ch);
	    } else {
		char utfBytes[TCL_UTF_MAX];

		memcpy(utfBytes, src, (size_t) numBytes);
		utfBytes[numBytes] = '\0';
		offset = Tcl_UtfToUniChar(utfBytes, &ch);
	    }
	    c = UCHAR(ch);
	    if (isalnum(c) || (c == '_')) {	/* INTL: ISO only, UCHAR. */
		src += offset;
		numBytes -= offset;
		continue;
	    }
	    if ((c == ':') && (numBytes != 1) && (src[1] == ':')) {
		src += 2;
		numBytes -= 2;
		while (numBytes && (*src == ':')) {
		    src++;
		    numBytes--;
		}
		continue;






>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







<
|
<







 







|
<
<
<
<
<
<
<
<
<
<
|
|


|







624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
....
1380
1381
1382
1383
1384
1385
1386

1387

1388
1389
1390
1391
1392
1393
1394
....
1463
1464
1465
1466
1467
1468
1469
1470










1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
{
    return CHAR_TYPE(byte) & (TYPE_SPACE) || byte == '\n';
}
 
/*
 *----------------------------------------------------------------------
 *
 * TclIsBareword--
 *
 *	Report whether byte is one that can be part of a "bareword".
 *	This concept is named in expression parsing, where it determines
 *	what can be a legal function name, but is the same definition used
 *	in determining what variable names can be parsed as variable
 *	substitutions without the benefit of enclosing braces.  The set of
 *	ASCII chars that are accepted are the numeric chars ('0'-'9'),
 *	the alphabetic chars ('a'-'z', 'A'-'Z')	and underscore ('_').
 *
 * Results:
 *	Returns 1, if byte is in the accepted set of chars, 0 otherwise.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

int
TclIsBareword(
    char byte)
{
    if (byte < '0' || byte > 'z') {
	return 0;
    }
    if (byte <= '9' || byte >= 'a') {
	return 1;
    }
    if (byte == '_') {
	return 1;
    }
    if (byte < 'A' || byte > 'Z') {
	return 0;
    }
    return 1;
}
 
/*
 *----------------------------------------------------------------------
 *
 * ParseWhiteSpace --
 *
 *	Scans up to numBytes bytes starting at src, consuming white space
 *	between words as defined by Tcl's parsing rules.
 *
 * Results:
 *	Returns the number of bytes recognized as white space. Records at
................................................................................
    int append)			/* Non-zero means append tokens to existing
				 * information in parsePtr; zero means ignore
				 * existing tokens in parsePtr and
				 * reinitialize it. */
{
    Tcl_Token *tokenPtr;
    register const char *src;

    int varIndex;

    unsigned array;

    if ((numBytes == 0) || (start == NULL)) {
	return TCL_ERROR;
    }
    if (numBytes < 0) {
	numBytes = strlen(start);
................................................................................
	src++;
    } else {
	tokenPtr->type = TCL_TOKEN_TEXT;
	tokenPtr->start = src;
	tokenPtr->numComponents = 0;

	while (numBytes) {
	    if (TclIsBareword(*src)) {










		src += 1;
		numBytes -= 1;
		continue;
	    }
	    if ((src[0] == ':') && (numBytes != 1) && (src[1] == ':')) {
		src += 2;
		numBytes -= 2;
		while (numBytes && (*src == ':')) {
		    src++;
		    numBytes--;
		}
		continue;

Changes to tests/parse.test.

652
653
654
655
656
657
658



659
660
661
662
663
664
665
} {1 {missing )} {missing )
    (remainder of script: "(cd)")
    invoked from within
"testparsevarname {$ab(cd)} 6 0"}}
test parse-12.25 {Tcl_ParseVarName procedure, nested array reference} testparser {
    testparser {$x(a$y(b$z))} 0
} {- {$x(a$y(b$z))} 1 word {$x(a$y(b$z))} 8 variable {$x(a$y(b$z))} 7 text x 0 text a 0 variable {$y(b$z)} 4 text y 0 text b 0 variable {$z} 1 text z 0 {}}




test parse-13.1 {Tcl_ParseVar procedure} testparsevar {
    set abc 24
    testparsevar {$abc.fg}
} {24 .fg}
test parse-13.2 {Tcl_ParseVar procedure, no variable name} testparsevar {
    testparsevar {$}






>
>
>







652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
} {1 {missing )} {missing )
    (remainder of script: "(cd)")
    invoked from within
"testparsevarname {$ab(cd)} 6 0"}}
test parse-12.25 {Tcl_ParseVarName procedure, nested array reference} testparser {
    testparser {$x(a$y(b$z))} 0
} {- {$x(a$y(b$z))} 1 word {$x(a$y(b$z))} 8 variable {$x(a$y(b$z))} 7 text x 0 text a 0 variable {$y(b$z)} 4 text y 0 text b 0 variable {$z} 1 text z 0 {}}
test parse-12.26 {Tcl_ParseVarName [d2ffcca163] non-ascii} testparser {
    testparser "$\u0433" -1
} "- {$\u0433} 1 word {$\u0433} 2 text {$} 0 text \u0433 0 {}"

test parse-13.1 {Tcl_ParseVar procedure} testparsevar {
    set abc 24
    testparsevar {$abc.fg}
} {24 .fg}
test parse-13.2 {Tcl_ParseVar procedure, no variable name} testparsevar {
    testparsevar {$}

Changes to tests/parseExpr.test.

1047
1048
1049
1050
1051
1052
1053









1054
1055
1056
1057
test parseExpr-22.17 {Bug 3401704} -constraints testexprparser -body {
    testexprparser 0b2 -1
} -returnCodes error -match glob -result {*invalid binary number*}
test parseExpr-22.18 {Bug 3401704} -constraints testexprparser -body {
    testexprparser 0b02 -1
} -returnCodes error -match glob -result {*invalid binary number*}











# cleanup
cleanupTests
return






>
>
>
>
>
>
>
>
>




1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
test parseExpr-22.17 {Bug 3401704} -constraints testexprparser -body {
    testexprparser 0b2 -1
} -returnCodes error -match glob -result {*invalid binary number*}
test parseExpr-22.18 {Bug 3401704} -constraints testexprparser -body {
    testexprparser 0b02 -1
} -returnCodes error -match glob -result {*invalid binary number*}

test parseExpr-22.19 {Bug d2ffcca163} -constraints testexprparser -body {
    testexprparser \u0433 -1
} -returnCodes error -match glob -result {*invalid character*}
test parseExpr-22.20 {Bug d2ffcca163} -constraints testexprparser -body {
    testexprparser \u043f -1
} -returnCodes error -match glob -result {*invalid character*}
test parseExpr-22.21 {Bug d2ffcca163} -constraints testexprparser -body {
    testexprparser in\u0433(0) -1
} -returnCodes error -match glob -result {missing operand*}

# cleanup
cleanupTests
return