Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | Backport [bd94500678e837d7] from 8.7, preventing endless loops in UTF-8 conversions when handling surrogates. Only effective when compiling with -DTCL_UTF_MAX=4|6 (default: 3). Meant for benefit of Androwish. |
---|---|
Downloads: | Tarball | ZIP archive |
Timelines: | family | ancestors | descendants | both | core-8-6-branch |
Files: | files | file ages | folders |
SHA3-256: |
9e1984c250d1a859bc607d362dc55ae9 |
User & Date: | jan.nijtmans 2019-03-02 16:04:59.969 |
References
2019-11-13
| ||
09:21 | • Ticket [d433c0e0ad] TCL_UTF_MAX == 4 problems status still Pending with 3 other changes artifact: 3ed9242a56 user: jan.nijtmans | |
Context
2019-03-02
| ||
16:35 | Fix some "scan.test" test-cases when TCL_UTF_MAX=4. Wrongly resolved merge-conflict in previous che... check-in: 8d1ff82057 user: jan.nijtmans tags: core-8-6-branch | |
16:08 | Merge 8.6 (one forgotten adaptation of surrogate handling, only compiled on Cygwin) check-in: 4f781560c5 user: jan.nijtmans tags: core-8-branch | |
16:04 | Backport [bd94500678e837d7] from 8.7, preventing endless loops in UTF-8 conversions when handling su... check-in: 9e1984c250 user: jan.nijtmans tags: core-8-6-branch | |
2019-03-01
| ||
20:11 | Update all internal tables to Unicode 12.0 check-in: 88da1b0307 user: jan.nijtmans tags: core-8-6-branch | |
2019-02-27
| ||
21:29 | Fix [bd94500678e837d7]: SEGFAULT by conversion of unicode (out of BMP) to byte-array check-in: efe8f3e6b0 user: jan.nijtmans tags: core-8-branch | |
Changes
Changes to generic/tclBinary.c.
︙ | ︙ | |||
1207 1208 1209 1210 1211 1212 1213 | badIndex: errorString = "not enough arguments for all format specifiers"; goto error; badField: { Tcl_UniChar ch = 0; | | | 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 | badIndex: errorString = "not enough arguments for all format specifiers"; goto error; badField: { Tcl_UniChar ch = 0; char buf[TCL_UTF_MAX + 1] = ""; TclUtfToUniChar(errorString, &ch); buf[Tcl_UniCharToUtf(ch, buf)] = '\0'; Tcl_SetObjResult(interp, Tcl_ObjPrintf( "bad field specifier \"%s\"", buf)); return TCL_ERROR; } |
︙ | ︙ | |||
1577 1578 1579 1580 1581 1582 1583 | badIndex: errorString = "not enough arguments for all format specifiers"; goto error; badField: { Tcl_UniChar ch = 0; | | | 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 | badIndex: errorString = "not enough arguments for all format specifiers"; goto error; badField: { Tcl_UniChar ch = 0; char buf[TCL_UTF_MAX + 1] = ""; TclUtfToUniChar(errorString, &ch); buf[Tcl_UniCharToUtf(ch, buf)] = '\0'; Tcl_SetObjResult(interp, Tcl_ObjPrintf( "bad field specifier \"%s\"", buf)); return TCL_ERROR; } |
︙ | ︙ |
Changes to generic/tclCmdMZ.c.
︙ | ︙ | |||
1081 1082 1083 1084 1085 1086 1087 | for ( ; stringPtr < end; stringPtr += len) { int fullchar; len = TclUtfToUniChar(stringPtr, &ch); fullchar = ch; #if TCL_UTF_MAX == 4 | | | | 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 | for ( ; stringPtr < end; stringPtr += len) { int fullchar; len = TclUtfToUniChar(stringPtr, &ch); fullchar = ch; #if TCL_UTF_MAX == 4 if ((ch >= 0xD800) && (len < 3)) { len += TclUtfToUniChar(stringPtr + len, &ch); fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; } #endif /* * Assume Tcl_UniChar is an integral type... */ |
︙ | ︙ | |||
1421 1422 1423 1424 1425 1426 1427 | */ if (TclIsPureByteArray(objv[1])) { unsigned char uch = (unsigned char) ch; Tcl_SetObjResult(interp, Tcl_NewByteArrayObj(&uch, 1)); } else { | | > > > > > | 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 | */ if (TclIsPureByteArray(objv[1])) { unsigned char uch = (unsigned char) ch; Tcl_SetObjResult(interp, Tcl_NewByteArrayObj(&uch, 1)); } else { char buf[TCL_UTF_MAX] = ""; length = Tcl_UniCharToUtf(ch, buf); #if TCL_UTF_MAX > 3 if ((ch >= 0xD800) && (length < 3)) { length += Tcl_UniCharToUtf(-1, buf + length); } #endif Tcl_SetObjResult(interp, Tcl_NewStringObj(buf, length)); } } return TCL_OK; } /* |
︙ | ︙ | |||
1791 1792 1793 1794 1795 1796 1797 | } end = string1 + length1; for (; string1 < end; string1 += length2, failat++) { int fullchar; length2 = TclUtfToUniChar(string1, &ch); fullchar = ch; #if TCL_UTF_MAX == 4 | | | | 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 | } end = string1 + length1; for (; string1 < end; string1 += length2, failat++) { int fullchar; length2 = TclUtfToUniChar(string1, &ch); fullchar = ch; #if TCL_UTF_MAX == 4 if ((ch >= 0xD800) && (length2 < 3)) { length2 += TclUtfToUniChar(string1 + length2, &ch); fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; } #endif if (!chcomp(fullchar)) { result = 0; break; } |
︙ | ︙ | |||
1872 1873 1874 1875 1876 1877 1878 | return TCL_ERROR; } if (objc == 4) { const char *string = TclGetStringFromObj(objv[1], &length2); if ((length2 > 1) && | | | 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 | return TCL_ERROR; } if (objc == 4) { const char *string = TclGetStringFromObj(objv[1], &length2); if ((length2 > 1) && strncmp(string, "-nocase", length2) == 0) { nocase = 1; } else { Tcl_SetObjResult(interp, Tcl_ObjPrintf( "bad option \"%s\": must be -nocase", string)); Tcl_SetErrorCode(interp, "TCL", "LOOKUP", "INDEX", "option", string, NULL); return TCL_ERROR; |
︙ | ︙ | |||
2139 2140 2141 2142 2143 2144 2145 | } if (objc == 4) { int length; const char *string = TclGetStringFromObj(objv[1], &length); if ((length > 1) && | | | 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 | } if (objc == 4) { int length; const char *string = TclGetStringFromObj(objv[1], &length); if ((length > 1) && strncmp(string, "-nocase", length) == 0) { nocase = TCL_MATCH_NOCASE; } else { Tcl_SetObjResult(interp, Tcl_ObjPrintf( "bad option \"%s\": must be -nocase", string)); Tcl_SetErrorCode(interp, "TCL", "LOOKUP", "INDEX", "option", string, NULL); return TCL_ERROR; |
︙ | ︙ | |||
2607 2608 2609 2610 2611 2612 2613 | Tcl_WrongNumArgs(interp, 1, objv, "?-nocase? ?-length int? string1 string2"); return TCL_ERROR; } for (i = 1; i < objc-2; i++) { string2 = TclGetStringFromObj(objv[i], &length2); | | | | 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 | Tcl_WrongNumArgs(interp, 1, objv, "?-nocase? ?-length int? string1 string2"); return TCL_ERROR; } for (i = 1; i < objc-2; i++) { string2 = TclGetStringFromObj(objv[i], &length2); if ((length2 > 1) && !strncmp(string2, "-nocase", length2)) { nocase = 1; } else if ((length2 > 1) && !strncmp(string2, "-length", length2)) { if (i+1 >= objc-2) { goto str_cmp_args; } i++; if (TclGetIntFromObj(interp, objv[i], &reqlength) != TCL_OK) { return TCL_ERROR; } |
︙ | ︙ | |||
2884 2885 2886 2887 2888 2889 2890 | Tcl_WrongNumArgs(interp, 1, objv, "?-nocase? ?-length int? string1 string2"); return TCL_ERROR; } for (i = 1; i < objc-2; i++) { string = TclGetStringFromObj(objv[i], &length); | | | | 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 | Tcl_WrongNumArgs(interp, 1, objv, "?-nocase? ?-length int? string1 string2"); return TCL_ERROR; } for (i = 1; i < objc-2; i++) { string = TclGetStringFromObj(objv[i], &length); if ((length > 1) && !strncmp(string, "-nocase", length)) { *nocase = 1; } else if ((length > 1) && !strncmp(string, "-length", length)) { if (i+1 >= objc-2) { goto str_cmp_args; } i++; if (TclGetIntFromObj(interp, objv[i], reqlength) != TCL_OK) { return TCL_ERROR; } |
︙ | ︙ |
Changes to generic/tclCompCmdsSZ.c.
︙ | ︙ | |||
1492 1493 1494 1495 1496 1497 1498 | PUSH(""); count++; } for (endTokenPtr = tokenPtr + parse.numTokens; tokenPtr < endTokenPtr; tokenPtr = TokenAfter(tokenPtr)) { int length, literal, catchRange, breakJump; | | | 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 | PUSH(""); count++; } for (endTokenPtr = tokenPtr + parse.numTokens; tokenPtr < endTokenPtr; tokenPtr = TokenAfter(tokenPtr)) { int length, literal, catchRange, breakJump; char buf[TCL_UTF_MAX] = ""; JumpFixup startFixup, okFixup, returnFixup, breakFixup; JumpFixup continueFixup, otherFixup, endFixup; switch (tokenPtr->type) { case TCL_TOKEN_TEXT: literal = TclRegisterNewLiteral(envPtr, tokenPtr->start, tokenPtr->size); |
︙ | ︙ |
Changes to generic/tclCompile.c.
︙ | ︙ | |||
1719 1720 1721 1722 1723 1724 1725 | if (tempPtr != NULL) { Tcl_AppendToObj(tempPtr, tokenPtr->start, tokenPtr->size); } break; case TCL_TOKEN_BS: if (tempPtr != NULL) { | | | 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 | if (tempPtr != NULL) { Tcl_AppendToObj(tempPtr, tokenPtr->start, tokenPtr->size); } break; case TCL_TOKEN_BS: if (tempPtr != NULL) { char utfBuf[TCL_UTF_MAX] = ""; int length = TclParseBackslash(tokenPtr->start, tokenPtr->size, NULL, utfBuf); Tcl_AppendToObj(tempPtr, utfBuf, length); } break; |
︙ | ︙ | |||
2333 2334 2335 2336 2337 2338 2339 | * compile. */ int count, /* Number of tokens to consider at tokenPtr. * Must be at least 1. */ CompileEnv *envPtr) /* Holds the resulting instructions. */ { Tcl_DString textBuffer; /* Holds concatenated chars from adjacent * TCL_TOKEN_TEXT, TCL_TOKEN_BS tokens. */ | | | 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 | * compile. */ int count, /* Number of tokens to consider at tokenPtr. * Must be at least 1. */ CompileEnv *envPtr) /* Holds the resulting instructions. */ { Tcl_DString textBuffer; /* Holds concatenated chars from adjacent * TCL_TOKEN_TEXT, TCL_TOKEN_BS tokens. */ char buffer[TCL_UTF_MAX] = ""; int i, numObjsToConcat, length, adjust; unsigned char *entryCodeNext = envPtr->codeNext; #define NUM_STATIC_POS 20 int isLiteral, maxNumCL, numCL; int *clPosition = NULL; int depth = TclGetStackDepth(envPtr); |
︙ | ︙ |
Changes to generic/tclEncoding.c.
︙ | ︙ | |||
2361 2362 2363 2364 2365 2366 2367 | src += 1; dst += Tcl_UniCharToUtf(*chPtr, dst); } else { int len = TclUtfToUniChar(src, chPtr); src += len; dst += Tcl_UniCharToUtf(*chPtr, dst); #if TCL_UTF_MAX == 4 | | | | 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 | src += 1; dst += Tcl_UniCharToUtf(*chPtr, dst); } else { int len = TclUtfToUniChar(src, chPtr); src += len; dst += Tcl_UniCharToUtf(*chPtr, dst); #if TCL_UTF_MAX == 4 if ((*chPtr >= 0xD800) && (len < 3)) { src += TclUtfToUniChar(src + len, chPtr); dst += Tcl_UniCharToUtf(*chPtr, dst); } #endif } } *srcReadPtr = src - srcStart; |
︙ | ︙ | |||
2983 2984 2985 2986 2987 2988 2989 | /* * Check for illegal characters. */ if (ch > 0xff #if TCL_UTF_MAX == 4 | | | | 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 | /* * Check for illegal characters. */ if (ch > 0xff #if TCL_UTF_MAX == 4 || ((ch >= 0xD800) && (len < 3)) #endif ) { if (flags & TCL_ENCODING_STOPONERROR) { result = TCL_CONVERT_UNKNOWN; break; } #if TCL_UTF_MAX == 4 if ((ch >= 0xD800) && (len < 3)) len = 4; #endif /* * Plunge on, using '?' as a fallback character. */ ch = (Tcl_UniChar) '?'; |
︙ | ︙ | |||
3421 3422 3423 3424 3425 3426 3427 | */ state = oldState; result = TCL_CONVERT_NOSPACE; break; } memcpy(dst, subTablePtr->sequence, | | | 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 | */ state = oldState; result = TCL_CONVERT_NOSPACE; break; } memcpy(dst, subTablePtr->sequence, subTablePtr->sequenceLen); dst += subTablePtr->sequenceLen; } } if (tablePrefixBytes[(word >> 8)] != 0) { if (dst + 1 > dstEnd) { result = TCL_CONVERT_NOSPACE; |
︙ | ︙ |
Changes to generic/tclExecute.c.
︙ | ︙ | |||
5511 5512 5513 5514 5515 5516 5517 | } else if (TclIsPureByteArray(valuePtr)) { objResultPtr = Tcl_NewByteArrayObj( Tcl_GetByteArrayFromObj(valuePtr, NULL)+index, 1); } else if (valuePtr->bytes && length == valuePtr->length) { objResultPtr = Tcl_NewStringObj((const char *) valuePtr->bytes+index, 1); } else { | | | 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 | } else if (TclIsPureByteArray(valuePtr)) { objResultPtr = Tcl_NewByteArrayObj( Tcl_GetByteArrayFromObj(valuePtr, NULL)+index, 1); } else if (valuePtr->bytes && length == valuePtr->length) { objResultPtr = Tcl_NewStringObj((const char *) valuePtr->bytes+index, 1); } else { char buf[TCL_UTF_MAX] = ""; Tcl_UniChar ch = Tcl_GetUniChar(valuePtr, index); /* * This could be: Tcl_NewUnicodeObj((const Tcl_UniChar *)&ch, 1) * but creating the object as a string seems to be faster in * practical use. */ |
︙ | ︙ |
Changes to generic/tclParse.c.
︙ | ︙ | |||
840 841 842 843 844 845 846 | * written. At most TCL_UTF_MAX bytes will be * written there. */ { register const char *p = src+1; Tcl_UniChar unichar = 0; int result; int count; | | | 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 | * written. At most TCL_UTF_MAX bytes will be * written there. */ { register const char *p = src+1; Tcl_UniChar unichar = 0; int result; int count; char buf[TCL_UTF_MAX] = ""; if (numBytes == 0) { if (readPtr != NULL) { *readPtr = 0; } return 0; } |
︙ | ︙ | |||
989 990 991 992 993 994 995 | done: if (readPtr != NULL) { *readPtr = count; } count = Tcl_UniCharToUtf(result, dst); #if TCL_UTF_MAX > 3 | | | | 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 | done: if (readPtr != NULL) { *readPtr = count; } count = Tcl_UniCharToUtf(result, dst); #if TCL_UTF_MAX > 3 if ((result >= 0xD800) && (count < 3)) { count += Tcl_UniCharToUtf(-1, dst + count); } #endif return count; } /* *---------------------------------------------------------------------- |
︙ | ︙ | |||
2213 2214 2215 2216 2217 2218 2219 | adjust = 0; result = NULL; for (; count>0 && code==TCL_OK ; count--, tokenPtr++) { Tcl_Obj *appendObj = NULL; const char *append = NULL; int appendByteLength = 0; | | | 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 | adjust = 0; result = NULL; for (; count>0 && code==TCL_OK ; count--, tokenPtr++) { Tcl_Obj *appendObj = NULL; const char *append = NULL; int appendByteLength = 0; char utfCharBytes[TCL_UTF_MAX] = ""; switch (tokenPtr->type) { case TCL_TOKEN_TEXT: append = tokenPtr->start; appendByteLength = tokenPtr->size; break; |
︙ | ︙ |
Changes to generic/tclScan.c.
︙ | ︙ | |||
256 257 258 259 260 261 262 | * required. */ { int gotXpg, gotSequential, value, i, flags; char *end; Tcl_UniChar ch = 0; int objIndex, xpgSize, nspace = numVars; int *nassign = TclStackAlloc(interp, nspace * sizeof(int)); | | | 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 | * required. */ { int gotXpg, gotSequential, value, i, flags; char *end; Tcl_UniChar ch = 0; int objIndex, xpgSize, nspace = numVars; int *nassign = TclStackAlloc(interp, nspace * sizeof(int)); char buf[TCL_UTF_MAX+1] = ""; Tcl_Obj *errorMsg; /* Place to build an error messages. Note that * these are messy operations because we do * not want to use the formatting engine; * we're inside there! */ /* * Initialize an array that records the number of times a variable is |
︙ | ︙ | |||
885 886 887 888 889 890 891 | * Scan a single Unicode character. */ offset = TclUtfToUniChar(string, &sch); i = (int)sch; #if TCL_UTF_MAX == 4 if (!offset) { | | | 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 | * Scan a single Unicode character. */ offset = TclUtfToUniChar(string, &sch); i = (int)sch; #if TCL_UTF_MAX == 4 if (!offset) { offset = TclUtfToUniChar(string, &sch); i = (((i<<10) & 0x0FFC00) + 0x10000) + (sch & 0x3FF); } #endif string += offset; if (!(flags & SCAN_SUPPRESS)) { objPtr = Tcl_NewIntObj(i); Tcl_IncrRefCount(objPtr); |
︙ | ︙ |
Changes to generic/tclStringObj.c.
︙ | ︙ | |||
1998 1999 2000 2001 2002 2003 2004 | int code, length; if (TclGetIntFromObj(interp, segment, &code) != TCL_OK) { goto error; } length = Tcl_UniCharToUtf(code, buf); #if TCL_UTF_MAX > 3 | | | | 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 | int code, length; if (TclGetIntFromObj(interp, segment, &code) != TCL_OK) { goto error; } length = Tcl_UniCharToUtf(code, buf); #if TCL_UTF_MAX > 3 if ((code >= 0xD800) && (length < 3)) { /* Special case for handling high surrogates. */ length += Tcl_UniCharToUtf(-1, buf + length); } #endif segment = Tcl_NewStringObj(buf, length); Tcl_IncrRefCount(segment); allocSegment = 1; break; } |
︙ | ︙ | |||
3172 3173 3174 3175 3176 3177 3178 | if (size > stringPtr->allocated) { GrowStringBuffer(objPtr, size, 1); } copyBytes: dst = objPtr->bytes + origLength; for (i = 0; i < numChars; i++) { | | | 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 | if (size > stringPtr->allocated) { GrowStringBuffer(objPtr, size, 1); } copyBytes: dst = objPtr->bytes + origLength; for (i = 0; i < numChars; i++) { dst += Tcl_UniCharToUtf(unicode[i], dst); } *dst = '\0'; objPtr->length = dst - objPtr->bytes; return numChars; } /* |
︙ | ︙ |
Changes to generic/tclStubInit.c.
︙ | ︙ | |||
283 284 285 286 287 288 289 290 291 292 293 294 295 296 | for (w = (wchar_t *)string; w < wEnd; ) { if (!blen && ((*w & 0xFC00) != 0xDC00)) { /* Special case for handling high surrogates. */ p += Tcl_UniCharToUtf(-1, p); } blen = Tcl_UniCharToUtf(*w, p); p += blen; w++; } if (!blen) { /* Special case for handling high surrogates. */ p += Tcl_UniCharToUtf(-1, p); } Tcl_DStringSetLength(dsPtr, oldLength + (p - result)); | > > > > | 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 | for (w = (wchar_t *)string; w < wEnd; ) { if (!blen && ((*w & 0xFC00) != 0xDC00)) { /* Special case for handling high surrogates. */ p += Tcl_UniCharToUtf(-1, p); } blen = Tcl_UniCharToUtf(*w, p); p += blen; if ((*w >= 0xD800) && (blen < 3)) { /* Indication that high surrogate is handled */ blen = 0; } w++; } if (!blen) { /* Special case for handling high surrogates. */ p += Tcl_UniCharToUtf(-1, p); } Tcl_DStringSetLength(dsPtr, oldLength + (p - result)); |
︙ | ︙ |
Changes to generic/tclUtf.c.
︙ | ︙ | |||
154 155 156 157 158 159 160 | return 2; } if (ch <= 0xFFFF) { #if TCL_UTF_MAX > 3 if ((ch & 0xF800) == 0xD800) { if (ch & 0x0400) { /* Low surrogate */ | | < | | | | | | | | | | | > > > | | 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 | return 2; } if (ch <= 0xFFFF) { #if TCL_UTF_MAX > 3 if ((ch & 0xF800) == 0xD800) { if (ch & 0x0400) { /* Low surrogate */ if (((buf[0] & 0xC0) == 0x80) && ((buf[1] & 0xCF) == 0)) { /* Previous Tcl_UniChar was a high surrogate, so combine */ buf[2] = (char) ((ch & 0x3F) | 0x80); buf[1] |= (char) (((ch >> 6) & 0x0F) | 0x80); return 3; } /* Previous Tcl_UniChar was not a high surrogate, so just output */ } else { /* High surrogate */ ch += 0x40; /* Fill buffer with specific 3-byte (invalid) byte combination, so following low surrogate can recognize it and combine */ buf[2] = (char) ((ch << 4) & 0x30); buf[1] = (char) (((ch >> 2) & 0x3F) | 0x80); buf[0] = (char) (((ch >> 8) & 0x07) | 0xF0); return 1; } } #endif goto three; } #if TCL_UTF_MAX > 3 if (ch <= 0x10FFFF) { buf[3] = (char) ((ch | 0x80) & 0xBF); buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF); buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF); buf[0] = (char) ((ch >> 18) | 0xF0); return 4; } } else if (ch == -1) { if (((buf[0] & 0xC0) == 0x80) && ((buf[1] & 0xCF) == 0) && ((buf[-1] & 0xF8) == 0xF0)) { ch = 0xD7C0 + ((buf[-1] & 0x07) << 8) + ((buf[0] & 0x3F) << 2) + ((buf[1] & 0x30) >> 4); buf[1] = (char) ((ch | 0x80) & 0xBF); buf[0] = (char) (((ch >> 6) | 0x80) & 0xBF); buf[-1] = (char) ((ch >> 12) | 0xE0); return 2; } #endif } ch = 0xFFFD; three: buf[2] = (char) ((ch | 0x80) & 0xBF); |
︙ | ︙ | |||
294 295 296 297 298 299 300 | int Tcl_UtfToUniChar( register const char *src, /* The UTF-8 string. */ register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by * the UTF-8 string. */ { | | > > > > > > > > > > > > > > | | | | < < | < | < < < < | | | | | | | 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 | int Tcl_UtfToUniChar( register const char *src, /* The UTF-8 string. */ register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by * the UTF-8 string. */ { Tcl_UniChar byte; /* * Unroll 1 to 3 (or 4) byte UTF-8 sequences. */ byte = *((unsigned char *) src); if (byte < 0xC0) { /* * Handles properly formed UTF-8 characters between 0x01 and 0x7F. * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid * characters representing themselves. */ #if TCL_UTF_MAX == 4 /* If *chPtr contains a high surrogate (produced by a previous * Tcl_UtfToUniChar() call) and the next 3 bytes are UTF-8 continuation * bytes, then we must produce a follow-up low surrogate. We only * do that if the high surrogate matches the bits we encounter. */ if ((byte >= 0x80) && (((((byte - 0x10) << 2) & 0xFC) | 0xD800) == (*chPtr & 0xFCFC)) && ((src[1] & 0xF0) == (((*chPtr << 4) & 0x30) | 0x80)) && ((src[2] & 0xC0) == 0x80)) { *chPtr = ((src[1] & 0x0F) << 6) + (src[2] & 0x3F) + 0xDC00; return 3; } #endif *chPtr = byte; return 1; } else if (byte < 0xE0) { if ((src[1] & 0xC0) == 0x80) { /* * Two-byte-character lead-byte followed by a trail-byte. */ *chPtr = (((byte & 0x1F) << 6) | (src[1] & 0x3F)); if ((unsigned)(*chPtr - 1) >= (UNICODE_SELF - 1)) { return 2; } } /* * A two-byte-character lead-byte not followed by trail-byte * represents itself. */ } else if (byte < 0xF0) { if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) { /* * Three-byte-character lead byte followed by two trail bytes. */ *chPtr = (((byte & 0x0F) << 12) | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F)); if (*chPtr > 0x7FF) { return 3; } } /* * A three-byte-character lead-byte not followed by two trail-bytes * represents itself. */ } #if TCL_UTF_MAX > 3 else if (byte < 0xF8) { if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) && ((src[3] & 0xC0) == 0x80)) { /* * Four-byte-character lead byte followed by three trail bytes. */ #if TCL_UTF_MAX == 4 Tcl_UniChar high = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2) | ((src[2] & 0x3F) >> 4)) - 0x40; if (high >= 0x400) { /* out of range, < 0x10000 or > 0x10ffff */ } else { /* produce high surrogate, advance source pointer */ *chPtr = 0xD800 + high; return 1; } #else *chPtr = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12) | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)); if ((*chPtr - 0x10000) <= 0xFFFFF) { return 4; } #endif } /* * A four-byte-character lead-byte not followed by two trail-bytes * represents itself. */ } #endif *chPtr = byte; return 1; } /* *--------------------------------------------------------------------------- * * Tcl_UtfToUniCharDString -- |
︙ | ︙ | |||
574 575 576 577 578 579 580 | int len, fullchar; Tcl_UniChar find = 0; while (1) { len = TclUtfToUniChar(src, &find); fullchar = find; #if TCL_UTF_MAX == 4 | | | | 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 | int len, fullchar; Tcl_UniChar find = 0; while (1) { len = TclUtfToUniChar(src, &find); fullchar = find; #if TCL_UTF_MAX == 4 if ((ch >= 0xD800) && (len < 3)) { len += TclUtfToUniChar(src + len, &find); fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000; } #endif if (fullchar == ch) { return src; } if (*src == '\0') { |
︙ | ︙ | |||
622 623 624 625 626 627 628 | const char *last; last = NULL; while (1) { len = TclUtfToUniChar(src, &find); fullchar = find; #if TCL_UTF_MAX == 4 | | | | 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 | const char *last; last = NULL; while (1) { len = TclUtfToUniChar(src, &find); fullchar = find; #if TCL_UTF_MAX == 4 if ((ch >= 0xD800) && (len < 3)) { len += TclUtfToUniChar(src + len, &find); fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000; } #endif if (fullchar == ch) { last = src; } if (*src == '\0') { |
︙ | ︙ | |||
665 666 667 668 669 670 671 | Tcl_UtfNext( const char *src) /* The current location in the string. */ { Tcl_UniChar ch = 0; int len = TclUtfToUniChar(src, &ch); #if TCL_UTF_MAX == 4 | | | | 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 | Tcl_UtfNext( const char *src) /* The current location in the string. */ { Tcl_UniChar ch = 0; int len = TclUtfToUniChar(src, &ch); #if TCL_UTF_MAX == 4 if ((ch >= 0xD800) && (len < 3)) { len += TclUtfToUniChar(src + len, &ch); } #endif return src + len; } /* *--------------------------------------------------------------------------- |
︙ | ︙ | |||
775 776 777 778 779 780 781 | const char * Tcl_UtfAtIndex( register const char *src, /* The UTF-8 string. */ register int index) /* The position of the desired character. */ { Tcl_UniChar ch = 0; | | | | | 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 | const char * Tcl_UtfAtIndex( register const char *src, /* The UTF-8 string. */ register int index) /* The position of the desired character. */ { Tcl_UniChar ch = 0; int len = 0; while (index-- > 0) { len = TclUtfToUniChar(src, &ch); src += len; } #if TCL_UTF_MAX == 4 if ((ch >= 0xD800) && (len < 3)) { /* Index points at character following high Surrogate */ src += TclUtfToUniChar(src, &ch); } #endif return src; } /* |
︙ | ︙ | |||
867 868 869 870 871 872 873 | int Tcl_UtfToUpper( char *str) /* String to convert in place. */ { Tcl_UniChar ch = 0, upChar; char *src, *dst; | | | | | | | | 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 | int Tcl_UtfToUpper( char *str) /* String to convert in place. */ { Tcl_UniChar ch = 0, upChar; char *src, *dst; int len; /* * Iterate over the string until we hit the terminating null. */ src = dst = str; while (*src) { len = TclUtfToUniChar(src, &ch); upChar = Tcl_UniCharToUpper(ch); /* * To keep badly formed Utf strings from getting inflated by the * conversion (thereby causing a segfault), only copy the upper case * char to dst if its size is <= the original char. */ if (len < UtfCount(upChar)) { memcpy(dst, src, len); dst += len; } else { dst += Tcl_UniCharToUtf(upChar, dst); } src += len; } *dst = '\0'; return (dst - str); } /* *---------------------------------------------------------------------- |
︙ | ︙ | |||
920 921 922 923 924 925 926 | int Tcl_UtfToLower( char *str) /* String to convert in place. */ { Tcl_UniChar ch = 0, lowChar; char *src, *dst; | | | | | | | | 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 | int Tcl_UtfToLower( char *str) /* String to convert in place. */ { Tcl_UniChar ch = 0, lowChar; char *src, *dst; int len; /* * Iterate over the string until we hit the terminating null. */ src = dst = str; while (*src) { len = TclUtfToUniChar(src, &ch); lowChar = Tcl_UniCharToLower(ch); /* * To keep badly formed Utf strings from getting inflated by the * conversion (thereby causing a segfault), only copy the lower case * char to dst if its size is <= the original char. */ if (len < UtfCount(lowChar)) { memcpy(dst, src, len); dst += len; } else { dst += Tcl_UniCharToUtf(lowChar, dst); } src += len; } *dst = '\0'; return (dst - str); } /* *---------------------------------------------------------------------- |
︙ | ︙ | |||
974 975 976 977 978 979 980 | int Tcl_UtfToTitle( char *str) /* String to convert in place. */ { Tcl_UniChar ch = 0, titleChar, lowChar; char *src, *dst; | | | | | | | | | | | | | 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 | int Tcl_UtfToTitle( char *str) /* String to convert in place. */ { Tcl_UniChar ch = 0, titleChar, lowChar; char *src, *dst; int len; /* * Capitalize the first character and then lowercase the rest of the * characters until we get to a null. */ src = dst = str; if (*src) { len = TclUtfToUniChar(src, &ch); titleChar = Tcl_UniCharToTitle(ch); if (len < UtfCount(titleChar)) { memcpy(dst, src, len); dst += len; } else { dst += Tcl_UniCharToUtf(titleChar, dst); } src += len; } while (*src) { len = TclUtfToUniChar(src, &ch); lowChar = ch; /* Special exception for Georgian Asomtavruli chars, no titlecase. */ if ((unsigned)(lowChar - 0x1C90) >= 0x30) { lowChar = Tcl_UniCharToLower(lowChar); } if (len < UtfCount(lowChar)) { memcpy(dst, src, len); dst += len; } else { dst += Tcl_UniCharToUtf(lowChar, dst); } src += len; } *dst = '\0'; return (dst - str); } /* *---------------------------------------------------------------------- |
︙ | ︙ |
Changes to generic/tclUtil.c.
︙ | ︙ | |||
1645 1646 1647 1648 1649 1650 1651 | char Tcl_Backslash( const char *src, /* Points to the backslash character of a * backslash sequence. */ int *readPtr) /* Fill in with number of characters read from * src, unless NULL. */ { | | | 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 | char Tcl_Backslash( const char *src, /* Points to the backslash character of a * backslash sequence. */ int *readPtr) /* Fill in with number of characters read from * src, unless NULL. */ { char buf[TCL_UTF_MAX] = ""; Tcl_UniChar ch = 0; Tcl_UtfBackslash(src, readPtr, buf); TclUtfToUniChar(buf, &ch); return (char) ch; } |
︙ | ︙ |
Changes to win/tclWin32Dll.c.
︙ | ︙ | |||
644 645 646 647 648 649 650 651 652 653 654 655 656 657 | for (w = (TCHAR *)string; w < wEnd; ) { if (!blen && ((*w & 0xFC00) != 0xDC00)) { /* Special case for handling high surrogates. */ p += Tcl_UniCharToUtf(-1, p); } blen = Tcl_UniCharToUtf(*w, p); p += blen; w++; } if (!blen) { /* Special case for handling high surrogates. */ p += Tcl_UniCharToUtf(-1, p); } Tcl_DStringSetLength(dsPtr, oldLength + (p - result)); | > > > > | 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 | for (w = (TCHAR *)string; w < wEnd; ) { if (!blen && ((*w & 0xFC00) != 0xDC00)) { /* Special case for handling high surrogates. */ p += Tcl_UniCharToUtf(-1, p); } blen = Tcl_UniCharToUtf(*w, p); p += blen; if ((*w >= 0xD800) && (blen < 3)) { /* Indication that high surrogate is handled */ blen = 0; } w++; } if (!blen) { /* Special case for handling high surrogates. */ p += Tcl_UniCharToUtf(-1, p); } Tcl_DStringSetLength(dsPtr, oldLength + (p - result)); |
︙ | ︙ |