Tcl Source Code

Check-in [efe8f3e6b0]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Fix [bd94500678e837d7]: SEGFAULT by conversion of unicode (out of BMP) to byte-array
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | core-8-branch
Files: files | file ages | folders
SHA3-256: efe8f3e6b041792bf0adb3019e3643ee9f1de2c37c5d72a9e43277dd8338aafa
User & Date: jan.nijtmans 2019-02-27 21:29:43.367
Context
2019-03-02
16:04
Backport [bd94500678e837d7] from 8.7, preventing endless loops in UTF-8 conversions when handling su... check-in: 9e1984c250 user: jan.nijtmans tags: core-8-6-branch
2019-03-01
20:24
A confusion about signed vs unsigned comparision caused Tcl_UtfToUniChar() to return the wrong answe... check-in: 81046b694f user: dgp tags: core-8-branch
2019-02-28
20:19
merge 8.7 check-in: 06badfc315 user: dgp tags: tip-445-api-fix
2019-02-27
22:45
merge core-8-branch check-in: 6331f74415 user: jan.nijtmans tags: tommath-refactor
21:58
Merge 8.7 check-in: 727e74f081 user: jan.nijtmans tags: trunk
21:29
Fix [bd94500678e837d7]: SEGFAULT by conversion of unicode (out of BMP) to byte-array check-in: efe8f3e6b0 user: jan.nijtmans tags: core-8-branch
08:34
More use of (efficient) TclHasIntRep() macro. Also eliminate many (size_t) and (unsigned) type-casts... check-in: 51c585131e user: jan.nijtmans tags: core-8-branch
2019-02-25
21:10
Finish complete fix, all corner-cases correct now. Also spurious UTF-8 testcase failure (as seen on ... Closed-Leaf check-in: b3d886c84f user: jan.nijtmans tags: bug-bd94500678
Changes
Unified Diff Ignore Whitespace Patch
Changes to generic/tclBinary.c.
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
 badIndex:
    errorString = "not enough arguments for all format specifiers";
    goto error;

 badField:
    {
	Tcl_UniChar ch = 0;
	char buf[TCL_UTF_MAX + 1];

	TclUtfToUniChar(errorString, &ch);
	buf[Tcl_UniCharToUtf(ch, buf)] = '\0';
	Tcl_SetObjResult(interp, Tcl_ObjPrintf(
		"bad field specifier \"%s\"", buf));
	return TCL_ERROR;
    }







|







1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
 badIndex:
    errorString = "not enough arguments for all format specifiers";
    goto error;

 badField:
    {
	Tcl_UniChar ch = 0;
	char buf[TCL_UTF_MAX + 1] = "";

	TclUtfToUniChar(errorString, &ch);
	buf[Tcl_UniCharToUtf(ch, buf)] = '\0';
	Tcl_SetObjResult(interp, Tcl_ObjPrintf(
		"bad field specifier \"%s\"", buf));
	return TCL_ERROR;
    }
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
 badIndex:
    errorString = "not enough arguments for all format specifiers";
    goto error;

 badField:
    {
	Tcl_UniChar ch = 0;
	char buf[TCL_UTF_MAX + 1];

	TclUtfToUniChar(errorString, &ch);
	buf[Tcl_UniCharToUtf(ch, buf)] = '\0';
	Tcl_SetObjResult(interp, Tcl_ObjPrintf(
		"bad field specifier \"%s\"", buf));
	return TCL_ERROR;
    }







|







1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
 badIndex:
    errorString = "not enough arguments for all format specifiers";
    goto error;

 badField:
    {
	Tcl_UniChar ch = 0;
	char buf[TCL_UTF_MAX + 1] = "";

	TclUtfToUniChar(errorString, &ch);
	buf[Tcl_UniCharToUtf(ch, buf)] = '\0';
	Tcl_SetObjResult(interp, Tcl_ObjPrintf(
		"bad field specifier \"%s\"", buf));
	return TCL_ERROR;
    }
Changes to generic/tclCmdMZ.c.
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232

	for ( ; stringPtr < end; stringPtr += len) {
	    int fullchar;
	    len = TclUtfToUniChar(stringPtr, &ch);
	    fullchar = ch;

#if TCL_UTF_MAX <= 4
	    if (!len) {
		len += TclUtfToUniChar(stringPtr, &ch);
		fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
	    }
#endif

	    /*
	     * Assume Tcl_UniChar is an integral type...
	     */







|
|







1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232

	for ( ; stringPtr < end; stringPtr += len) {
	    int fullchar;
	    len = TclUtfToUniChar(stringPtr, &ch);
	    fullchar = ch;

#if TCL_UTF_MAX <= 4
	    if ((ch >= 0xD800) && (len < 3)) {
		len += TclUtfToUniChar(stringPtr + len, &ch);
		fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
	    }
#endif

	    /*
	     * Assume Tcl_UniChar is an integral type...
	     */
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
	 */

	if (TclIsPureByteArray(objv[1])) {
	    unsigned char uch = (unsigned char) ch;

	    Tcl_SetObjResult(interp, Tcl_NewByteArrayObj(&uch, 1));
	} else {
	    char buf[4];

	    length = Tcl_UniCharToUtf(ch, buf);
	    if (!length) {
		length = Tcl_UniCharToUtf(-1, buf);
	    }
	    Tcl_SetObjResult(interp, Tcl_NewStringObj(buf, length));
	}
    }
    return TCL_OK;
}








|


|
|







1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
	 */

	if (TclIsPureByteArray(objv[1])) {
	    unsigned char uch = (unsigned char) ch;

	    Tcl_SetObjResult(interp, Tcl_NewByteArrayObj(&uch, 1));
	} else {
	    char buf[TCL_UTF_MAX] = "";

	    length = Tcl_UniCharToUtf(ch, buf);
	    if ((ch >= 0xD800) && (length < 3)) {
		length += Tcl_UniCharToUtf(-1, buf + length);
	    }
	    Tcl_SetObjResult(interp, Tcl_NewStringObj(buf, length));
	}
    }
    return TCL_OK;
}

1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
	}
	end = string1 + length1;
	for (; string1 < end; string1 += length2, failat++) {
	    int fullchar;
	    length2 = TclUtfToUniChar(string1, &ch);
	    fullchar = ch;
#if TCL_UTF_MAX <= 4
	    if (!length2) {
	    	length2 = TclUtfToUniChar(string1, &ch);
	    	fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
	    }
#endif
	    if (!chcomp(fullchar)) {
		result = 0;
		break;
	    }







|
|







1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
	}
	end = string1 + length1;
	for (; string1 < end; string1 += length2, failat++) {
	    int fullchar;
	    length2 = TclUtfToUniChar(string1, &ch);
	    fullchar = ch;
#if TCL_UTF_MAX <= 4
	    if ((ch >= 0xD800) && (length2 < 3)) {
	    	length2 += TclUtfToUniChar(string1 + length2, &ch);
	    	fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
	    }
#endif
	    if (!chcomp(fullchar)) {
		result = 0;
		break;
	    }
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
	return TCL_ERROR;
    }

    if (objc == 4) {
	const char *string = TclGetStringFromObj(objv[1], &length2);

	if ((length2 > 1) &&
		strncmp(string, "-nocase", (size_t) length2) == 0) {
	    nocase = 1;
	} else {
	    Tcl_SetObjResult(interp, Tcl_ObjPrintf(
		    "bad option \"%s\": must be -nocase", string));
	    Tcl_SetErrorCode(interp, "TCL", "LOOKUP", "INDEX", "option",
		    string, NULL);
	    return TCL_ERROR;







|







1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
	return TCL_ERROR;
    }

    if (objc == 4) {
	const char *string = TclGetStringFromObj(objv[1], &length2);

	if ((length2 > 1) &&
		strncmp(string, "-nocase", length2) == 0) {
	    nocase = 1;
	} else {
	    Tcl_SetObjResult(interp, Tcl_ObjPrintf(
		    "bad option \"%s\": must be -nocase", string));
	    Tcl_SetErrorCode(interp, "TCL", "LOOKUP", "INDEX", "option",
		    string, NULL);
	    return TCL_ERROR;
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
    }

    if (objc == 4) {
	int length;
	const char *string = TclGetStringFromObj(objv[1], &length);

	if ((length > 1) &&
	    strncmp(string, "-nocase", (size_t) length) == 0) {
	    nocase = TCL_MATCH_NOCASE;
	} else {
	    Tcl_SetObjResult(interp, Tcl_ObjPrintf(
		    "bad option \"%s\": must be -nocase", string));
	    Tcl_SetErrorCode(interp, "TCL", "LOOKUP", "INDEX", "option",
		    string, NULL);
	    return TCL_ERROR;







|







2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
    }

    if (objc == 4) {
	int length;
	const char *string = TclGetStringFromObj(objv[1], &length);

	if ((length > 1) &&
	    strncmp(string, "-nocase", length) == 0) {
	    nocase = TCL_MATCH_NOCASE;
	} else {
	    Tcl_SetObjResult(interp, Tcl_ObjPrintf(
		    "bad option \"%s\": must be -nocase", string));
	    Tcl_SetErrorCode(interp, "TCL", "LOOKUP", "INDEX", "option",
		    string, NULL);
	    return TCL_ERROR;
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
	Tcl_WrongNumArgs(interp, 1, objv,
		"?-nocase? ?-length int? string1 string2");
	return TCL_ERROR;
    }

    for (i = 1; i < objc-2; i++) {
	string2 = TclGetStringFromObj(objv[i], &length);
	if ((length > 1) && !strncmp(string2, "-nocase", (size_t)length)) {
	    nocase = 1;
	} else if ((length > 1)
		&& !strncmp(string2, "-length", (size_t)length)) {
	    if (i+1 >= objc-2) {
		goto str_cmp_args;
	    }
	    i++;
	    if (TclGetIntFromObj(interp, objv[i], &reqlength) != TCL_OK) {
		return TCL_ERROR;
	    }







|


|







2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
	Tcl_WrongNumArgs(interp, 1, objv,
		"?-nocase? ?-length int? string1 string2");
	return TCL_ERROR;
    }

    for (i = 1; i < objc-2; i++) {
	string2 = TclGetStringFromObj(objv[i], &length);
	if ((length > 1) && !strncmp(string2, "-nocase", length)) {
	    nocase = 1;
	} else if ((length > 1)
		&& !strncmp(string2, "-length", length)) {
	    if (i+1 >= objc-2) {
		goto str_cmp_args;
	    }
	    i++;
	    if (TclGetIntFromObj(interp, objv[i], &reqlength) != TCL_OK) {
		return TCL_ERROR;
	    }
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
	Tcl_WrongNumArgs(interp, 1, objv,
		"?-nocase? ?-length int? string1 string2");
	return TCL_ERROR;
    }

    for (i = 1; i < objc-2; i++) {
	string = TclGetStringFromObj(objv[i], &length);
	if ((length > 1) && !strncmp(string, "-nocase", (size_t)length)) {
	    *nocase = 1;
	} else if ((length > 1)
		&& !strncmp(string, "-length", (size_t)length)) {
	    if (i+1 >= objc-2) {
		goto str_cmp_args;
	    }
	    i++;
	    if (TclGetIntFromObj(interp, objv[i], reqlength) != TCL_OK) {
		return TCL_ERROR;
	    }







|


|







2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
	Tcl_WrongNumArgs(interp, 1, objv,
		"?-nocase? ?-length int? string1 string2");
	return TCL_ERROR;
    }

    for (i = 1; i < objc-2; i++) {
	string = TclGetStringFromObj(objv[i], &length);
	if ((length > 1) && !strncmp(string, "-nocase", length)) {
	    *nocase = 1;
	} else if ((length > 1)
		&& !strncmp(string, "-length", length)) {
	    if (i+1 >= objc-2) {
		goto str_cmp_args;
	    }
	    i++;
	    if (TclGetIntFromObj(interp, objv[i], reqlength) != TCL_OK) {
		return TCL_ERROR;
	    }
Changes to generic/tclCompCmdsSZ.c.
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
	PUSH("");
	count++;
    }

    for (endTokenPtr = tokenPtr + parse.numTokens;
	    tokenPtr < endTokenPtr; tokenPtr = TokenAfter(tokenPtr)) {
	int length, literal, catchRange, breakJump;
	char buf[TCL_UTF_MAX];
	JumpFixup startFixup, okFixup, returnFixup, breakFixup;
	JumpFixup continueFixup, otherFixup, endFixup;

	switch (tokenPtr->type) {
	case TCL_TOKEN_TEXT:
	    literal = TclRegisterLiteral(envPtr,
		    tokenPtr->start, tokenPtr->size, 0);







|







1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
	PUSH("");
	count++;
    }

    for (endTokenPtr = tokenPtr + parse.numTokens;
	    tokenPtr < endTokenPtr; tokenPtr = TokenAfter(tokenPtr)) {
	int length, literal, catchRange, breakJump;
	char buf[TCL_UTF_MAX] = "";
	JumpFixup startFixup, okFixup, returnFixup, breakFixup;
	JumpFixup continueFixup, otherFixup, endFixup;

	switch (tokenPtr->type) {
	case TCL_TOKEN_TEXT:
	    literal = TclRegisterLiteral(envPtr,
		    tokenPtr->start, tokenPtr->size, 0);
Changes to generic/tclCompile.c.
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
	    if (tempPtr != NULL) {
		Tcl_AppendToObj(tempPtr, tokenPtr->start, tokenPtr->size);
	    }
	    break;

	case TCL_TOKEN_BS:
	    if (tempPtr != NULL) {
		char utfBuf[TCL_UTF_MAX];
		int length = TclParseBackslash(tokenPtr->start,
			tokenPtr->size, NULL, utfBuf);

		Tcl_AppendToObj(tempPtr, utfBuf, length);
	    }
	    break;








|







1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
	    if (tempPtr != NULL) {
		Tcl_AppendToObj(tempPtr, tokenPtr->start, tokenPtr->size);
	    }
	    break;

	case TCL_TOKEN_BS:
	    if (tempPtr != NULL) {
		char utfBuf[TCL_UTF_MAX] = "";
		int length = TclParseBackslash(tokenPtr->start,
			tokenPtr->size, NULL, utfBuf);

		Tcl_AppendToObj(tempPtr, utfBuf, length);
	    }
	    break;

2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
				 * compile. */
    int count,			/* Number of tokens to consider at tokenPtr.
				 * Must be at least 1. */
    CompileEnv *envPtr)		/* Holds the resulting instructions. */
{
    Tcl_DString textBuffer;	/* Holds concatenated chars from adjacent
				 * TCL_TOKEN_TEXT, TCL_TOKEN_BS tokens. */
    char buffer[TCL_UTF_MAX];
    int i, numObjsToConcat, length, adjust;
    unsigned char *entryCodeNext = envPtr->codeNext;
#define NUM_STATIC_POS 20
    int isLiteral, maxNumCL, numCL;
    int *clPosition = NULL;
    int depth = TclGetStackDepth(envPtr);








|







2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
				 * compile. */
    int count,			/* Number of tokens to consider at tokenPtr.
				 * Must be at least 1. */
    CompileEnv *envPtr)		/* Holds the resulting instructions. */
{
    Tcl_DString textBuffer;	/* Holds concatenated chars from adjacent
				 * TCL_TOKEN_TEXT, TCL_TOKEN_BS tokens. */
    char buffer[TCL_UTF_MAX] = "";
    int i, numObjsToConcat, length, adjust;
    unsigned char *entryCodeNext = envPtr->codeNext;
#define NUM_STATIC_POS 20
    int isLiteral, maxNumCL, numCL;
    int *clPosition = NULL;
    int depth = TclGetStackDepth(envPtr);

Changes to generic/tclEncoding.c.
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
	    src += 1;
	    dst += Tcl_UniCharToUtf(*chPtr, dst);
	} else {
	    int len = TclUtfToUniChar(src, chPtr);
	    src += len;
	    dst += Tcl_UniCharToUtf(*chPtr, dst);
#if TCL_UTF_MAX <= 4
	    if (!len) {
		src += TclUtfToUniChar(src, chPtr);
		dst += Tcl_UniCharToUtf(*chPtr, dst);
	    }
#endif
	}
    }

    *srcReadPtr = src - srcStart;







|
|







2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
	    src += 1;
	    dst += Tcl_UniCharToUtf(*chPtr, dst);
	} else {
	    int len = TclUtfToUniChar(src, chPtr);
	    src += len;
	    dst += Tcl_UniCharToUtf(*chPtr, dst);
#if TCL_UTF_MAX <= 4
	    if ((*chPtr >= 0xD800) && (len < 3)) {
		src += TclUtfToUniChar(src + len, chPtr);
		dst += Tcl_UniCharToUtf(*chPtr, dst);
	    }
#endif
	}
    }

    *srcReadPtr = src - srcStart;
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024

	/*
	 * Check for illegal characters.
	 */

	if (ch > 0xff
#if TCL_UTF_MAX <= 4
		|| !len
#endif
		) {
	    if (flags & TCL_ENCODING_STOPONERROR) {
		result = TCL_CONVERT_UNKNOWN;
		break;
	    }
#if TCL_UTF_MAX <= 4
	    if (!len) len = 4;
#endif
	    /*
	     * Plunge on, using '?' as a fallback character.
	     */

	    ch = (Tcl_UniChar) '?';
	}







|







|







3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024

	/*
	 * Check for illegal characters.
	 */

	if (ch > 0xff
#if TCL_UTF_MAX <= 4
		|| ((ch >= 0xD800) && (len < 3))
#endif
		) {
	    if (flags & TCL_ENCODING_STOPONERROR) {
		result = TCL_CONVERT_UNKNOWN;
		break;
	    }
#if TCL_UTF_MAX <= 4
	    if ((ch >= 0xD800) && (len < 3)) len = 4;
#endif
	    /*
	     * Plunge on, using '?' as a fallback character.
	     */

	    ch = (Tcl_UniChar) '?';
	}
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
		     */

		    state = oldState;
		    result = TCL_CONVERT_NOSPACE;
		    break;
		}
		memcpy(dst, subTablePtr->sequence,
			(size_t) subTablePtr->sequenceLen);
		dst += subTablePtr->sequenceLen;
	    }
	}

	if (tablePrefixBytes[(word >> 8)] != 0) {
	    if (dst + 1 > dstEnd) {
		result = TCL_CONVERT_NOSPACE;







|







3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
		     */

		    state = oldState;
		    result = TCL_CONVERT_NOSPACE;
		    break;
		}
		memcpy(dst, subTablePtr->sequence,
			subTablePtr->sequenceLen);
		dst += subTablePtr->sequenceLen;
	    }
	}

	if (tablePrefixBytes[(word >> 8)] != 0) {
	    if (dst + 1 > dstEnd) {
		result = TCL_CONVERT_NOSPACE;
Changes to generic/tclExecute.c.
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
	} else if (TclIsPureByteArray(valuePtr)) {
	    objResultPtr = Tcl_NewByteArrayObj(
		    Tcl_GetByteArrayFromObj(valuePtr, NULL)+index, 1);
	} else if (valuePtr->bytes && length == valuePtr->length) {
	    objResultPtr = Tcl_NewStringObj((const char *)
		    valuePtr->bytes+index, 1);
	} else {
	    char buf[4];
	    int ch = Tcl_GetUniChar(valuePtr, index);

	    /*
	     * This could be: Tcl_NewUnicodeObj((const Tcl_UniChar *)&ch, 1)
	     * but creating the object as a string seems to be faster in
	     * practical use.
	     */
	    if (ch == -1) {
		objResultPtr = Tcl_NewObj();
	    } else {
		length = Tcl_UniCharToUtf(ch, buf);
		if (!length) {
		    length = Tcl_UniCharToUtf(-1, buf);
		}
		objResultPtr = Tcl_NewStringObj(buf, length);
	    }
	}

	TRACE_APPEND(("\"%s\"\n", O2S(objResultPtr)));
	NEXT_INST_F(1, 2, 1);







|











|
|







5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
	} else if (TclIsPureByteArray(valuePtr)) {
	    objResultPtr = Tcl_NewByteArrayObj(
		    Tcl_GetByteArrayFromObj(valuePtr, NULL)+index, 1);
	} else if (valuePtr->bytes && length == valuePtr->length) {
	    objResultPtr = Tcl_NewStringObj((const char *)
		    valuePtr->bytes+index, 1);
	} else {
	    char buf[TCL_UTF_MAX] = "";
	    int ch = Tcl_GetUniChar(valuePtr, index);

	    /*
	     * This could be: Tcl_NewUnicodeObj((const Tcl_UniChar *)&ch, 1)
	     * but creating the object as a string seems to be faster in
	     * practical use.
	     */
	    if (ch == -1) {
		objResultPtr = Tcl_NewObj();
	    } else {
		length = Tcl_UniCharToUtf(ch, buf);
		if ((ch >= 0xD800) && (length < 3)) {
		    length += Tcl_UniCharToUtf(-1, buf + length);
		}
		objResultPtr = Tcl_NewStringObj(buf, length);
	    }
	}

	TRACE_APPEND(("\"%s\"\n", O2S(objResultPtr)));
	NEXT_INST_F(1, 2, 1);
Changes to generic/tclParse.c.
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
				 * written. At most TCL_UTF_MAX bytes will be
				 * written there. */
{
    register const char *p = src+1;
    Tcl_UniChar unichar = 0;
    int result;
    int count;
    char buf[TCL_UTF_MAX];

    if (numBytes == 0) {
	if (readPtr != NULL) {
	    *readPtr = 0;
	}
	return 0;
    }







|







787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
				 * written. At most TCL_UTF_MAX bytes will be
				 * written there. */
{
    register const char *p = src+1;
    Tcl_UniChar unichar = 0;
    int result;
    int count;
    char buf[TCL_UTF_MAX] = "";

    if (numBytes == 0) {
	if (readPtr != NULL) {
	    *readPtr = 0;
	}
	return 0;
    }
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
    }

  done:
    if (readPtr != NULL) {
	*readPtr = count;
    }
    count = Tcl_UniCharToUtf(result, dst);
    if (!count) {
	/* Special case for handling high surrogates. */
	count = Tcl_UniCharToUtf(-1, dst);
    }
    return count;
}

/*
 *----------------------------------------------------------------------
 *







|

|







935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
    }

  done:
    if (readPtr != NULL) {
	*readPtr = count;
    }
    count = Tcl_UniCharToUtf(result, dst);
    if ((result >= 0xD800) && (count < 3)) {
	/* Special case for handling high surrogates. */
	count += Tcl_UniCharToUtf(-1, dst + count);
    }
    return count;
}

/*
 *----------------------------------------------------------------------
 *
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161

    adjust = 0;
    result = NULL;
    for (; count>0 && code==TCL_OK ; count--, tokenPtr++) {
	Tcl_Obj *appendObj = NULL;
	const char *append = NULL;
	int appendByteLength = 0;
	char utfCharBytes[TCL_UTF_MAX];

	switch (tokenPtr->type) {
	case TCL_TOKEN_TEXT:
	    append = tokenPtr->start;
	    appendByteLength = tokenPtr->size;
	    break;








|







2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161

    adjust = 0;
    result = NULL;
    for (; count>0 && code==TCL_OK ; count--, tokenPtr++) {
	Tcl_Obj *appendObj = NULL;
	const char *append = NULL;
	int appendByteLength = 0;
	char utfCharBytes[TCL_UTF_MAX] = "";

	switch (tokenPtr->type) {
	case TCL_TOKEN_TEXT:
	    append = tokenPtr->start;
	    appendByteLength = tokenPtr->size;
	    break;

Changes to generic/tclScan.c.
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
				 * required. */
{
    int gotXpg, gotSequential, value, i, flags;
    char *end;
    Tcl_UniChar ch = 0;
    int objIndex, xpgSize, nspace = numVars;
    int *nassign = TclStackAlloc(interp, nspace * sizeof(int));
    char buf[TCL_UTF_MAX+1];
    Tcl_Obj *errorMsg;		/* Place to build an error messages. Note that
				 * these are messy operations because we do
				 * not want to use the formatting engine;
				 * we're inside there! */

    /*
     * Initialize an array that records the number of times a variable is







|







257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
				 * required. */
{
    int gotXpg, gotSequential, value, i, flags;
    char *end;
    Tcl_UniChar ch = 0;
    int objIndex, xpgSize, nspace = numVars;
    int *nassign = TclStackAlloc(interp, nspace * sizeof(int));
    char buf[TCL_UTF_MAX+1] = "";
    Tcl_Obj *errorMsg;		/* Place to build an error messages. Note that
				 * these are messy operations because we do
				 * not want to use the formatting engine;
				 * we're inside there! */

    /*
     * Initialize an array that records the number of times a variable is
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
	    /*
	     * Scan a single Unicode character.
	     */

	    offset = TclUtfToUniChar(string, &sch);
	    i = (int)sch;
#if TCL_UTF_MAX == 4
	    if (!offset) {
		offset = TclUtfToUniChar(string, &sch);
		i = (((i<<10) & 0x0FFC00) + 0x10000) + (sch & 0x3FF);
	    }
#endif
	    string += offset;
	    if (!(flags & SCAN_SUPPRESS)) {
		objPtr = Tcl_NewWideIntObj(i);
		Tcl_IncrRefCount(objPtr);







|
|







878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
	    /*
	     * Scan a single Unicode character.
	     */

	    offset = TclUtfToUniChar(string, &sch);
	    i = (int)sch;
#if TCL_UTF_MAX == 4
	    if (((sch & 0xFC00) == 0xD800) && (offset < 3)) {
		offset += TclUtfToUniChar(string+offset, &sch);
		i = (((i<<10) & 0x0FFC00) + 0x10000) + (sch & 0x3FF);
	    }
#endif
	    string += offset;
	    if (!(flags & SCAN_SUPPRESS)) {
		objPtr = Tcl_NewWideIntObj(i);
		Tcl_IncrRefCount(objPtr);
Changes to generic/tclStringObj.c.
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
	    char buf[4];
	    int code, length;

	    if (TclGetIntFromObj(interp, segment, &code) != TCL_OK) {
		goto error;
	    }
	    length = Tcl_UniCharToUtf(code, buf);
	    if (!length) {
		/* Special case for handling high surrogates. */
		length = Tcl_UniCharToUtf(-1, buf);
	    }
	    segment = Tcl_NewStringObj(buf, length);
	    Tcl_IncrRefCount(segment);
	    allocSegment = 1;
	    break;
	}








|

|







2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
	    char buf[4];
	    int code, length;

	    if (TclGetIntFromObj(interp, segment, &code) != TCL_OK) {
		goto error;
	    }
	    length = Tcl_UniCharToUtf(code, buf);
	    if ((code >= 0xD800) && (length < 3)) {
		/* Special case for handling high surrogates. */
		length += Tcl_UniCharToUtf(-1, buf + length);
	    }
	    segment = Tcl_NewStringObj(buf, length);
	    Tcl_IncrRefCount(segment);
	    allocSegment = 1;
	    break;
	}

4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
    if (size > stringPtr->allocated) {
	GrowStringBuffer(objPtr, size, 1);
    }

  copyBytes:
    dst = objPtr->bytes + origLength;
    for (i = 0; i < numChars; i++) {
	dst += Tcl_UniCharToUtf((int) unicode[i], dst);
    }
    *dst = '\0';
    objPtr->length = dst - objPtr->bytes;
    return numChars;
}

/*







|







4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
    if (size > stringPtr->allocated) {
	GrowStringBuffer(objPtr, size, 1);
    }

  copyBytes:
    dst = objPtr->bytes + origLength;
    for (i = 0; i < numChars; i++) {
	dst += Tcl_UniCharToUtf(unicode[i], dst);
    }
    *dst = '\0';
    objPtr->length = dst - objPtr->bytes;
    return numChars;
}

/*
Changes to generic/tclUtf.c.
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180



181
182
183
184
185
186
187
188
	    buf[0] = (char) ((ch >> 6) | 0xC0);
	    return 2;
	}
	if (ch <= 0xFFFF) {
	    if ((ch & 0xF800) == 0xD800) {
		if (ch & 0x0400) {
		    /* Low surrogate */
		    if (((buf[0] & 0xF8) == 0xF0) && ((buf[1] & 0xC0) == 0x80)
			    && ((buf[2] & 0xCF) == 0)) {
			/* Previous Tcl_UniChar was a High surrogate, so combine */
			buf[3] = (char) ((ch & 0x3F) | 0x80);
			buf[2] |= (char) (((ch >> 6) & 0x0F) | 0x80);
			return 4;
		    }
		    /* Previous Tcl_UniChar was not a High surrogate, so just output */
		} else {
		    /* High surrogate */
		    ch += 0x40;
		    /* Fill buffer with specific 3-byte (invalid) byte combination,
		       so following Low surrogate can recognize it and combine */
		    buf[2] = (char) ((ch << 4) & 0x30);
		    buf[1] = (char) (((ch >> 2) & 0x3F) | 0x80);
		    buf[0] = (char) (((ch >> 8) & 0x07) | 0xF0);
		    return 0;
		}
	    }
	    goto three;
	}
	if (ch <= 0x10FFFF) {
	    buf[3] = (char) ((ch | 0x80) & 0xBF);
	    buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
	    buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
	    buf[0] = (char) ((ch >> 18) | 0xF0);
	    return 4;
	}
    } else if (ch == -1) {
	if (((buf[0] & 0xF8) == 0xF0) && ((buf[1] & 0xC0) == 0x80)
		&& ((buf[2] & 0xCF) == 0)) {
	    ch = 0xD7C0 + ((buf[0] & 0x07) << 8) + ((buf[1] & 0x3F) << 2)
		    + ((buf[2] & 0x30) >> 4);



	    goto three;
	}
    }

    ch = 0xFFFD;
three:
    buf[2] = (char) ((ch | 0x80) & 0xBF);
    buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);







|
<
|
|
|
|

|




|



|












|
|
|
|
>
>
>
|







141
142
143
144
145
146
147
148

149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
	    buf[0] = (char) ((ch >> 6) | 0xC0);
	    return 2;
	}
	if (ch <= 0xFFFF) {
	    if ((ch & 0xF800) == 0xD800) {
		if (ch & 0x0400) {
		    /* Low surrogate */
		    if (((buf[0] & 0xC0) == 0x80) && ((buf[1] & 0xCF) == 0)) {

			/* Previous Tcl_UniChar was a high surrogate, so combine */
			buf[2] = (char) ((ch & 0x3F) | 0x80);
			buf[1] |= (char) (((ch >> 6) & 0x0F) | 0x80);
			return 3;
		    }
		    /* Previous Tcl_UniChar was not a high surrogate, so just output */
		} else {
		    /* High surrogate */
		    ch += 0x40;
		    /* Fill buffer with specific 3-byte (invalid) byte combination,
		       so following low surrogate can recognize it and combine */
		    buf[2] = (char) ((ch << 4) & 0x30);
		    buf[1] = (char) (((ch >> 2) & 0x3F) | 0x80);
		    buf[0] = (char) (((ch >> 8) & 0x07) | 0xF0);
		    return 1;
		}
	    }
	    goto three;
	}
	if (ch <= 0x10FFFF) {
	    buf[3] = (char) ((ch | 0x80) & 0xBF);
	    buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
	    buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
	    buf[0] = (char) ((ch >> 18) | 0xF0);
	    return 4;
	}
    } else if (ch == -1) {
	if (((buf[0] & 0xC0) == 0x80) && ((buf[1] & 0xCF) == 0)
		&& ((buf[-1] & 0xF8) == 0xF0)) {
	    ch = 0xD7C0 + ((buf[-1] & 0x07) << 8) + ((buf[0] & 0x3F) << 2)
		    + ((buf[1] & 0x30) >> 4);
	    buf[1] = (char) ((ch | 0x80) & 0xBF);
	    buf[0] = (char) (((ch >> 6) | 0x80) & 0xBF);
	    buf[-1] = (char) ((ch >> 12) | 0xE0);
	    return 2;
	}
    }

    ch = 0xFFFD;
three:
    buf[2] = (char) ((ch | 0x80) & 0xBF);
    buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
233
234
235
236
237
238
239



240
241
242
243
244
245
246
    for (w = uniStr; w < wEnd; ) {
	if (!len && ((*w & 0xFC00) != 0xDC00)) {
	    /* Special case for handling high surrogates. */
	    p += Tcl_UniCharToUtf(-1, p);
	}
	len = Tcl_UniCharToUtf(*w, p);
	p += len;



	w++;
    }
    if (!len) {
	/* Special case for handling high surrogates. */
	p += Tcl_UniCharToUtf(-1, p);
    }
    Tcl_DStringSetLength(dsPtr, oldLength + (p - string));







>
>
>







235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
    for (w = uniStr; w < wEnd; ) {
	if (!len && ((*w & 0xFC00) != 0xDC00)) {
	    /* Special case for handling high surrogates. */
	    p += Tcl_UniCharToUtf(-1, p);
	}
	len = Tcl_UniCharToUtf(*w, p);
	p += len;
	if ((*w >= 0xD800) && (len < 3)) {
	    len = 0; /* Indication that high surrogate was found */
	}
	w++;
    }
    if (!len) {
	/* Special case for handling high surrogates. */
	p += Tcl_UniCharToUtf(-1, p);
    }
    Tcl_DStringSetLength(dsPtr, oldLength + (p - string));
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314














315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399

int
Tcl_UtfToUniChar(
    register const char *src,	/* The UTF-8 string. */
    register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by
				 * the UTF-8 string. */
{
    register int byte;

    /*
     * Unroll 1 to 3 (or 4) byte UTF-8 sequences.
     */

    byte = *((unsigned char *) src);
    if (byte < 0xC0) {
	/*
	 * Handles properly formed UTF-8 characters between 0x01 and 0x7F.
	 * Treats naked trail bytes 0x80 to 0x9F as valid characters from
	 * the cp1252 table. See: <https://en.wikipedia.org/wiki/UTF-8>
	 * Also treats \0 and other naked trail bytes 0xA0 to 0xBF as valid
	 * characters representing themselves.
	 */















	if ((unsigned)(byte-0x80) < (unsigned) 0x20) {
	    *chPtr = (Tcl_UniChar) cp1252[byte-0x80];
	} else {
	    *chPtr = (Tcl_UniChar) byte;
	}
	return 1;
    } else if (byte < 0xE0) {
	if ((src[1] & 0xC0) == 0x80) {
	    /*
	     * Two-byte-character lead-byte followed by a trail-byte.
	     */

	    *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (src[1] & 0x3F));
	    if ((unsigned)(*chPtr - 1) >= (UNICODE_SELF - 1)) {
		return 2;
	    }
	}

	/*
	 * A two-byte-character lead-byte not followed by trail-byte
	 * represents itself.
	 */
    } else if (byte < 0xF0) {
	if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) {
	    /*
	     * Three-byte-character lead byte followed by two trail bytes.
	     */

	    *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)
		    | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
	    if (*chPtr > 0x7FF) {
		return 3;
	    }
	}

	/*
	 * A three-byte-character lead-byte not followed by two trail-bytes
	 * represents itself.
	 */
    }
    else if (byte < 0xF8) {
	if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) && ((src[3] & 0xC0) == 0x80)) {
	    /*
	     * Four-byte-character lead byte followed by three trail bytes.
	     */
#if TCL_UTF_MAX <= 4
	    Tcl_UniChar surrogate;

	    byte = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
		    | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)) - 0x10000;
	    surrogate = (Tcl_UniChar) (0xD800 + (byte >> 10));
	    if (byte & 0x100000) {
		/* out of range, < 0x10000 or > 0x10ffff */
	    } else if (*chPtr != surrogate) {
		/* produce high surrogate, but don't advance source pointer */
		*chPtr = surrogate;
		return 0;
	    } else {
		/* produce low surrogate, and advance source pointer */
		*chPtr = (Tcl_UniChar) (0xDC00 | (byte & 0x3FF));
		return 4;
	    }
#else
	    *chPtr = (Tcl_UniChar) (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
		    | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
	    if ((unsigned)(*chPtr - 0x10000) <= 0xFFFFF) {
		return 4;
	    }
#endif
	}

	/*
	 * A four-byte-character lead-byte not followed by two trail-bytes
	 * represents itself.
	 */
    }

    *chPtr = (Tcl_UniChar) byte;
    return 1;
}

/*
 *---------------------------------------------------------------------------
 *
 * Tcl_UtfToUniCharDString --







|


|












>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
|

|








|















|

















|
<
<
|
<
|

<
<
<
<

|
|
|


|

|











|







297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380


381

382
383




384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411

int
Tcl_UtfToUniChar(
    register const char *src,	/* The UTF-8 string. */
    register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by
				 * the UTF-8 string. */
{
    Tcl_UniChar byte;

    /*
     * Unroll 1 to 4 byte UTF-8 sequences.
     */

    byte = *((unsigned char *) src);
    if (byte < 0xC0) {
	/*
	 * Handles properly formed UTF-8 characters between 0x01 and 0x7F.
	 * Treats naked trail bytes 0x80 to 0x9F as valid characters from
	 * the cp1252 table. See: <https://en.wikipedia.org/wiki/UTF-8>
	 * Also treats \0 and other naked trail bytes 0xA0 to 0xBF as valid
	 * characters representing themselves.
	 */

#if TCL_UTF_MAX <= 4
	/* If *chPtr contains a high surrogate (produced by a previous
	 * Tcl_UtfToUniChar() call) and the next 3 bytes are UTF-8 continuation
	 * bytes, then we must produce a follow-up low surrogate. We only
	 * do that if the high surrogate matches the bits we encounter.
	 */
	if ((byte >= 0x80)
		&& (((((byte - 0x10) << 2) & 0xFC) | 0xD800) == (*chPtr & 0xFCFC))
		&& ((src[1] & 0xF0) == (((*chPtr << 4) & 0x30) | 0x80))
		&& ((src[2] & 0xC0) == 0x80)) {
	    *chPtr = ((src[1] & 0x0F) << 6) + (src[2] & 0x3F) + 0xDC00;
	    return 3;
	}
#endif
	if (byte-0x80 < 0x20) {
	    *chPtr = cp1252[byte-0x80];
	} else {
	    *chPtr = byte;
	}
	return 1;
    } else if (byte < 0xE0) {
	if ((src[1] & 0xC0) == 0x80) {
	    /*
	     * Two-byte-character lead-byte followed by a trail-byte.
	     */

	    *chPtr = (((byte & 0x1F) << 6) | (src[1] & 0x3F));
	    if ((unsigned)(*chPtr - 1) >= (UNICODE_SELF - 1)) {
		return 2;
	    }
	}

	/*
	 * A two-byte-character lead-byte not followed by trail-byte
	 * represents itself.
	 */
    } else if (byte < 0xF0) {
	if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) {
	    /*
	     * Three-byte-character lead byte followed by two trail bytes.
	     */

	    *chPtr = (((byte & 0x0F) << 12)
		    | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
	    if (*chPtr > 0x7FF) {
		return 3;
	    }
	}

	/*
	 * A three-byte-character lead-byte not followed by two trail-bytes
	 * represents itself.
	 */
    }
    else if (byte < 0xF8) {
	if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) && ((src[3] & 0xC0) == 0x80)) {
	    /*
	     * Four-byte-character lead byte followed by three trail bytes.
	     */
#if TCL_UTF_MAX <= 4
	    Tcl_UniChar high = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2)


		    | ((src[2] & 0x3F) >> 4)) - 0x40;

	    if (high >= 0x400) {
		/* out of range, < 0x10000 or > 0x10ffff */




	    } else {
		/* produce high surrogate, advance source pointer */
		*chPtr = 0xD800 + high;
		return 1;
	    }
#else
	    *chPtr = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
		    | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
	    if ((*chPtr - 0x10000) <= 0xFFFFF) {
		return 4;
	    }
#endif
	}

	/*
	 * A four-byte-character lead-byte not followed by two trail-bytes
	 * represents itself.
	 */
    }

    *chPtr = byte;
    return 1;
}

/*
 *---------------------------------------------------------------------------
 *
 * Tcl_UtfToUniCharDString --
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
	p += TclUtfToUniChar(p, &ch);
	*w++ = ch;
    }
    end += 4;
    while (p < end) {
	if (Tcl_UtfCharComplete(p, end-p)) {
	    p += TclUtfToUniChar(p, &ch);
	} else if ((unsigned)((UCHAR(*p)-0x80)) < (unsigned) 0x20) {
	    ch = (Tcl_UniChar) cp1252[UCHAR(*p++)-0x80];
	} else {
	    ch = UCHAR(*p++);
	}
	*w++ = ch;
    }
    *w = '\0';
    Tcl_DStringSetLength(dsPtr,







|
|







458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
	p += TclUtfToUniChar(p, &ch);
	*w++ = ch;
    }
    end += 4;
    while (p < end) {
	if (Tcl_UtfCharComplete(p, end-p)) {
	    p += TclUtfToUniChar(p, &ch);
	} else if (((UCHAR(*p)-0x80)) < 0x20) {
	    ch = cp1252[UCHAR(*p++)-0x80];
	} else {
	    ch = UCHAR(*p++);
	}
	*w++ = ch;
    }
    *w = '\0';
    Tcl_DStringSetLength(dsPtr,
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
    int len, fullchar;
    Tcl_UniChar find = 0;

    while (1) {
	len = TclUtfToUniChar(src, &find);
	fullchar = find;
#if TCL_UTF_MAX <= 4
	if (!len) {
	    len += TclUtfToUniChar(src, &find);
	    fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000;
	}
#endif
	if (fullchar == ch) {
	    return src;
	}
	if (*src == '\0') {







|
|







590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
    int len, fullchar;
    Tcl_UniChar find = 0;

    while (1) {
	len = TclUtfToUniChar(src, &find);
	fullchar = find;
#if TCL_UTF_MAX <= 4
	if ((ch >= 0xD800) && (len < 3)) {
	    len += TclUtfToUniChar(src + len, &find);
	    fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000;
	}
#endif
	if (fullchar == ch) {
	    return src;
	}
	if (*src == '\0') {
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
    const char *last;

    last = NULL;
    while (1) {
	len = TclUtfToUniChar(src, &find);
	fullchar = find;
#if TCL_UTF_MAX <= 4
	if (!len) {
	    len += TclUtfToUniChar(src, &find);
	    fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000;
	}
#endif
	if (fullchar == ch) {
	    last = src;
	}
	if (*src == '\0') {







|
|







638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
    const char *last;

    last = NULL;
    while (1) {
	len = TclUtfToUniChar(src, &find);
	fullchar = find;
#if TCL_UTF_MAX <= 4
	if ((ch >= 0xD800) && (len < 3)) {
	    len += TclUtfToUniChar(src + len, &find);
	    fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000;
	}
#endif
	if (fullchar == ch) {
	    last = src;
	}
	if (*src == '\0') {
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
Tcl_UtfNext(
    const char *src)		/* The current location in the string. */
{
    Tcl_UniChar ch = 0;
    int len = TclUtfToUniChar(src, &ch);

#if TCL_UTF_MAX <= 4
    if (len == 0) {
      len = TclUtfToUniChar(src, &ch);
    }
#endif
    return src + len;
}

/*
 *---------------------------------------------------------------------------







|
|







681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
Tcl_UtfNext(
    const char *src)		/* The current location in the string. */
{
    Tcl_UniChar ch = 0;
    int len = TclUtfToUniChar(src, &ch);

#if TCL_UTF_MAX <= 4
    if ((ch >= 0xD800) && (len < 3)) {
	len += TclUtfToUniChar(src + len, &ch);
    }
#endif
    return src + len;
}

/*
 *---------------------------------------------------------------------------
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
Tcl_UniCharAtIndex(
    register const char *src,	/* The UTF-8 string to dereference. */
    register int index)		/* The position of the desired character. */
{
    Tcl_UniChar ch = 0;
    int fullchar = 0;
#if TCL_UTF_MAX <= 4
	int len = 1;
#endif

    while (index-- >= 0) {
#if TCL_UTF_MAX <= 4
	src += (len = TclUtfToUniChar(src, &ch));
#else
	src += TclUtfToUniChar(src, &ch);
#endif
    }
    fullchar = ch;
#if TCL_UTF_MAX <= 4
     if (!len) {
	/* If last Tcl_UniChar was a high surrogate, combine with low surrogate */
	(void)TclUtfToUniChar(src, &ch);
	fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
    }
#endif
    return fullchar;
}







|











|







763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
Tcl_UniCharAtIndex(
    register const char *src,	/* The UTF-8 string to dereference. */
    register int index)		/* The position of the desired character. */
{
    Tcl_UniChar ch = 0;
    int fullchar = 0;
#if TCL_UTF_MAX <= 4
	int len = 0;
#endif

    while (index-- >= 0) {
#if TCL_UTF_MAX <= 4
	src += (len = TclUtfToUniChar(src, &ch));
#else
	src += TclUtfToUniChar(src, &ch);
#endif
    }
    fullchar = ch;
#if TCL_UTF_MAX <= 4
    if ((ch >= 0xD800) && (len < 3)) {
	/* If last Tcl_UniChar was a high surrogate, combine with low surrogate */
	(void)TclUtfToUniChar(src, &ch);
	fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
    }
#endif
    return fullchar;
}
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819

const char *
Tcl_UtfAtIndex(
    register const char *src,	/* The UTF-8 string. */
    register int index)		/* The position of the desired character. */
{
    Tcl_UniChar ch = 0;
    int len = 1;

    while (index-- > 0) {
	len = TclUtfToUniChar(src, &ch);
	src += len;
    }
#if TCL_UTF_MAX <= 4
     if (!len) {
	/* Index points at character following High Surrogate */
	src += TclUtfToUniChar(src, &ch);
    }
#endif
    return src;
}

/*







|






|
|







809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831

const char *
Tcl_UtfAtIndex(
    register const char *src,	/* The UTF-8 string. */
    register int index)		/* The position of the desired character. */
{
    Tcl_UniChar ch = 0;
    int len = 0;

    while (index-- > 0) {
	len = TclUtfToUniChar(src, &ch);
	src += len;
    }
#if TCL_UTF_MAX <= 4
    if ((ch >= 0xD800) && (len < 3)) {
	/* Index points at character following high Surrogate */
	src += TclUtfToUniChar(src, &ch);
    }
#endif
    return src;
}

/*
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
int
Tcl_UtfToUpper(
    char *str)			/* String to convert in place. */
{
    Tcl_UniChar ch = 0;
    int upChar;
    char *src, *dst;
    int bytes;

    /*
     * Iterate over the string until we hit the terminating null.
     */

    src = dst = str;
    while (*src) {
	bytes = TclUtfToUniChar(src, &ch);
	upChar = ch;
#if TCL_UTF_MAX <= 4
	if (!bytes) {
	    /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
	    bytes = TclUtfToUniChar(src, &ch);
	    /* Combine surrogates */
	    upChar = (((upChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
	}
#endif
	upChar = Tcl_UniCharToUpper(upChar);

	/*
	 * To keep badly formed Utf strings from getting inflated by the
	 * conversion (thereby causing a segfault), only copy the upper case
	 * char to dst if its size is <= the original char.
	 */

	if ((bytes < TclUtfCount(upChar)) || ((upChar & 0xF800) == 0xD800)) {
	    memcpy(dst, src, bytes);
	    dst += bytes;
	} else {
	    dst += Tcl_UniCharToUtf(upChar, dst);
	}
	src += bytes;
    }
    *dst = '\0';
    return (dst - str);
}

/*
 *----------------------------------------------------------------------







|







|


|
<
|












|
|
|



|







902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920

921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
int
Tcl_UtfToUpper(
    char *str)			/* String to convert in place. */
{
    Tcl_UniChar ch = 0;
    int upChar;
    char *src, *dst;
    int len;

    /*
     * Iterate over the string until we hit the terminating null.
     */

    src = dst = str;
    while (*src) {
	len = TclUtfToUniChar(src, &ch);
	upChar = ch;
#if TCL_UTF_MAX <= 4
	if ((ch >= 0xD800) && (len < 3)) {

	    len += TclUtfToUniChar(src + len, &ch);
	    /* Combine surrogates */
	    upChar = (((upChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
	}
#endif
	upChar = Tcl_UniCharToUpper(upChar);

	/*
	 * To keep badly formed Utf strings from getting inflated by the
	 * conversion (thereby causing a segfault), only copy the upper case
	 * char to dst if its size is <= the original char.
	 */

	if ((len < TclUtfCount(upChar)) || ((upChar & 0xF800) == 0xD800)) {
	    memcpy(dst, src, len);
	    dst += len;
	} else {
	    dst += Tcl_UniCharToUtf(upChar, dst);
	}
	src += len;
    }
    *dst = '\0';
    return (dst - str);
}

/*
 *----------------------------------------------------------------------
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
int
Tcl_UtfToLower(
    char *str)			/* String to convert in place. */
{
    Tcl_UniChar ch = 0;
    int lowChar;
    char *src, *dst;
    int bytes;

    /*
     * Iterate over the string until we hit the terminating null.
     */

    src = dst = str;
    while (*src) {
	bytes = TclUtfToUniChar(src, &ch);
	lowChar = ch;
#if TCL_UTF_MAX <= 4
	if (!bytes) {
	    /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
	    bytes = TclUtfToUniChar(src, &ch);
	    /* Combine surrogates */
	    lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
	}
#endif
	lowChar = Tcl_UniCharToLower(lowChar);

	/*
	 * To keep badly formed Utf strings from getting inflated by the
	 * conversion (thereby causing a segfault), only copy the lower case
	 * char to dst if its size is <= the original char.
	 */

	if ((bytes < TclUtfCount(lowChar)) || ((lowChar & 0xF800) == 0xD800)) {
	    memcpy(dst, src, bytes);
	    dst += bytes;
	} else {
	    dst += Tcl_UniCharToUtf(lowChar, dst);
	}
	src += bytes;
    }
    *dst = '\0';
    return (dst - str);
}

/*
 *----------------------------------------------------------------------







|







|


|
<
|












|
|
|



|







964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982

983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
int
Tcl_UtfToLower(
    char *str)			/* String to convert in place. */
{
    Tcl_UniChar ch = 0;
    int lowChar;
    char *src, *dst;
    int len;

    /*
     * Iterate over the string until we hit the terminating null.
     */

    src = dst = str;
    while (*src) {
	len = TclUtfToUniChar(src, &ch);
	lowChar = ch;
#if TCL_UTF_MAX <= 4
	if ((ch >= 0xD800) && (len < 3)) {

	    len += TclUtfToUniChar(src + len, &ch);
	    /* Combine surrogates */
	    lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
	}
#endif
	lowChar = Tcl_UniCharToLower(lowChar);

	/*
	 * To keep badly formed Utf strings from getting inflated by the
	 * conversion (thereby causing a segfault), only copy the lower case
	 * char to dst if its size is <= the original char.
	 */

	if ((len < TclUtfCount(lowChar)) || ((lowChar & 0xF800) == 0xD800)) {
	    memcpy(dst, src, len);
	    dst += len;
	} else {
	    dst += Tcl_UniCharToUtf(lowChar, dst);
	}
	src += len;
    }
    *dst = '\0';
    return (dst - str);
}

/*
 *----------------------------------------------------------------------
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
int
Tcl_UtfToTitle(
    char *str)			/* String to convert in place. */
{
    Tcl_UniChar ch = 0;
    int titleChar, lowChar;
    char *src, *dst;
    int bytes;

    /*
     * Capitalize the first character and then lowercase the rest of the
     * characters until we get to a null.
     */

    src = dst = str;

    if (*src) {
	bytes = TclUtfToUniChar(src, &ch);
	titleChar = ch;
#if TCL_UTF_MAX <= 4
	if (!bytes) {
	    /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
	    bytes = TclUtfToUniChar(src, &ch);
	    /* Combine surrogates */
	    titleChar = (((titleChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
	}
#endif
	titleChar = Tcl_UniCharToTitle(titleChar);

	if ((bytes < TclUtfCount(titleChar)) || ((titleChar & 0xF800) == 0xD800)) {
	    memcpy(dst, src, bytes);
	    dst += bytes;
	} else {
	    dst += Tcl_UniCharToUtf(titleChar, dst);
	}
	src += bytes;
    }
    while (*src) {
	bytes = TclUtfToUniChar(src, &ch);
	lowChar = ch;
#if TCL_UTF_MAX <= 4
	if (!bytes) {
	    /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
	    bytes = TclUtfToUniChar(src, &ch);
	    /* Combine surrogates */
	    lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
	}
#endif
	/* Special exception for Georgian Asomtavruli chars, no titlecase. */
	if ((unsigned)(lowChar - 0x1C90) >= 0x30) {
	    lowChar = Tcl_UniCharToLower(lowChar);
	}

	if ((bytes < TclUtfCount(lowChar)) || ((lowChar & 0xF800) == 0xD800)) {
	    memcpy(dst, src, bytes);
	    dst += bytes;
	} else {
	    dst += Tcl_UniCharToUtf(lowChar, dst);
	}
	src += bytes;
    }
    *dst = '\0';
    return (dst - str);
}

/*
 *----------------------------------------------------------------------







|









|


|
<
|






|
|
|



|


|


|
<
|









|
|
|



|







1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047

1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067

1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
int
Tcl_UtfToTitle(
    char *str)			/* String to convert in place. */
{
    Tcl_UniChar ch = 0;
    int titleChar, lowChar;
    char *src, *dst;
    int len;

    /*
     * Capitalize the first character and then lowercase the rest of the
     * characters until we get to a null.
     */

    src = dst = str;

    if (*src) {
	len = TclUtfToUniChar(src, &ch);
	titleChar = ch;
#if TCL_UTF_MAX <= 4
	if ((ch >= 0xD800) && (len < 3)) {

	    len += TclUtfToUniChar(src + len, &ch);
	    /* Combine surrogates */
	    titleChar = (((titleChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
	}
#endif
	titleChar = Tcl_UniCharToTitle(titleChar);

	if ((len < TclUtfCount(titleChar)) || ((titleChar & 0xF800) == 0xD800)) {
	    memcpy(dst, src, len);
	    dst += len;
	} else {
	    dst += Tcl_UniCharToUtf(titleChar, dst);
	}
	src += len;
    }
    while (*src) {
	len = TclUtfToUniChar(src, &ch);
	lowChar = ch;
#if TCL_UTF_MAX <= 4
	if ((ch >= 0xD800) && (len < 3)) {

	    len += TclUtfToUniChar(src + len, &ch);
	    /* Combine surrogates */
	    lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
	}
#endif
	/* Special exception for Georgian Asomtavruli chars, no titlecase. */
	if ((unsigned)(lowChar - 0x1C90) >= 0x30) {
	    lowChar = Tcl_UniCharToLower(lowChar);
	}

	if ((len < TclUtfCount(lowChar)) || ((lowChar & 0xF800) == 0xD800)) {
	    memcpy(dst, src, len);
	    dst += len;
	} else {
	    dst += Tcl_UniCharToUtf(lowChar, dst);
	}
	src += len;
    }
    *dst = '\0';
    return (dst - str);
}

/*
 *----------------------------------------------------------------------
Changes to generic/tclUtil.c.
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
char
Tcl_Backslash(
    const char *src,		/* Points to the backslash character of a
				 * backslash sequence. */
    int *readPtr)		/* Fill in with number of characters read from
				 * src, unless NULL. */
{
    char buf[TCL_UTF_MAX];
    Tcl_UniChar ch = 0;

    Tcl_UtfBackslash(src, readPtr, buf);
    TclUtfToUniChar(buf, &ch);
    return (char) ch;
}
#endif /* !TCL_NO_DEPRECATED */







|







1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
char
Tcl_Backslash(
    const char *src,		/* Points to the backslash character of a
				 * backslash sequence. */
    int *readPtr)		/* Fill in with number of characters read from
				 * src, unless NULL. */
{
    char buf[TCL_UTF_MAX] = "";
    Tcl_UniChar ch = 0;

    Tcl_UtfBackslash(src, readPtr, buf);
    TclUtfToUniChar(buf, &ch);
    return (char) ch;
}
#endif /* !TCL_NO_DEPRECATED */
Changes to generic/tclZipfs.c.
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
 *	This cache is never cleared.
 *
 *-------------------------------------------------------------------------
 */

#ifdef _WIN32
#define LIBRARY_SIZE	    64

static inline int
WCharToUtf(
    const WCHAR *wSrc,
    char *dst)
{
    char *start = dst;

    while (*wSrc != '\0') {
	dst += Tcl_UniCharToUtf(*wSrc, dst);
	wSrc++;
    }
    *dst = '\0';
    return (int) (dst - start);
}
#endif /* _WIN32 */

Tcl_Obj *
TclZipfs_TclLibrary(void)
{
    Tcl_Obj *vfsInitScript;
    int found;







<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







3151
3152
3153
3154
3155
3156
3157















3158
3159
3160
3161
3162
3163
3164
 *	This cache is never cleared.
 *
 *-------------------------------------------------------------------------
 */

#ifdef _WIN32
#define LIBRARY_SIZE	    64















#endif /* _WIN32 */

Tcl_Obj *
TclZipfs_TclLibrary(void)
{
    Tcl_Obj *vfsInitScript;
    int found;
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
    /*
     * Look for the library file system within the DLL/shared library.  Note
     * that we must mount the zip file and dll before releasing to search.
     */

#if defined(_WIN32)
    hModule = TclWinGetTclInstance();
    if (GetModuleFileNameW(hModule, wName, MAX_PATH) == 0) {
	GetModuleFileNameA(hModule, dllName, MAX_PATH);
    } else {
	WCharToUtf(wName, dllName);
    }

    if (ZipfsAppHookFindTclInit(dllName) == TCL_OK) {
	return Tcl_NewStringObj(zipfs_literal_tcl_library, -1);
    }
#elif /* !_WIN32 && */ defined(CFG_RUNTIME_DLLFILE)
    if (ZipfsAppHookFindTclInit(
	    CFG_RUNTIME_LIBDIR "/" CFG_RUNTIME_DLLFILE) == TCL_OK) {







|
<
<
|
<







3194
3195
3196
3197
3198
3199
3200
3201


3202

3203
3204
3205
3206
3207
3208
3209
    /*
     * Look for the library file system within the DLL/shared library.  Note
     * that we must mount the zip file and dll before releasing to search.
     */

#if defined(_WIN32)
    hModule = TclWinGetTclInstance();
    GetModuleFileNameW(hModule, wName, MAX_PATH);


    WideCharToMultiByte(CP_UTF8, 0, wName, -1, dllName, sizeof(dllName), NULL, NULL);


    if (ZipfsAppHookFindTclInit(dllName) == TCL_OK) {
	return Tcl_NewStringObj(zipfs_literal_tcl_library, -1);
    }
#elif /* !_WIN32 && */ defined(CFG_RUNTIME_DLLFILE)
    if (ZipfsAppHookFindTclInit(
	    CFG_RUNTIME_LIBDIR "/" CFG_RUNTIME_DLLFILE) == TCL_OK) {
Changes to win/tclWin32Dll.c.
568
569
570
571
572
573
574




575
576
577
578
579
580
581
    for (w = (TCHAR *)string; w < wEnd; ) {
	if (!blen && ((*w & 0xFC00) != 0xDC00)) {
	    /* Special case for handling high surrogates. */
	    p += Tcl_UniCharToUtf(-1, p);
	}
	blen = Tcl_UniCharToUtf(*w, p);
	p += blen;




	w++;
    }
    if (!blen) {
	/* Special case for handling high surrogates. */
	p += Tcl_UniCharToUtf(-1, p);
    }
    Tcl_DStringSetLength(dsPtr, oldLength + (p - result));







>
>
>
>







568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
    for (w = (TCHAR *)string; w < wEnd; ) {
	if (!blen && ((*w & 0xFC00) != 0xDC00)) {
	    /* Special case for handling high surrogates. */
	    p += Tcl_UniCharToUtf(-1, p);
	}
	blen = Tcl_UniCharToUtf(*w, p);
	p += blen;
	if ((*w >= 0xD800) && (blen < 3)) {
	    /* Indication that high surrogate is handled */
	    blen = 0;
	}
	w++;
    }
    if (!blen) {
	/* Special case for handling high surrogates. */
	p += Tcl_UniCharToUtf(-1, p);
    }
    Tcl_DStringSetLength(dsPtr, oldLength + (p - result));