Index: doc/Utf.3 ================================================================== --- doc/Utf.3 +++ doc/Utf.3 @@ -164,15 +164,16 @@ \fBTCL_UTF_MAX\fR is the maximum number of bytes that \fBTcl_UtfToUniChar\fR can consume in a single call. .PP \fBTcl_UniCharToUtf\fR stores the character \fIch\fR as a UTF-8 string in starting at \fIbuf\fR. The return value is the number of bytes stored -in \fIbuf\fR. If ch is a high surrogate (range U+D800 - U+DBFF), then -the return value will be 1 and a single byte in the range 0xF0 - 0xF4 -will be stored. If you still want to produce UTF-8 output for it (even -though knowing it's an illegal code-point on its own), just call -\fBTcl_UniCharToUtf\fR again specifying ch = -1. +in \fIbuf\fR. The character \fIch\fR can be or'ed with the value TCL_COMBINE +to enable special behavior, compatible with Tcl 8.x. Then, if ch is a high +surrogate (range U+D800 - U+DBFF), the return value will be 1 and a single +byte in the range 0xF0 - 0xF4 will be stored. If \fIch\fR is a low surrogate +(range U+DC00 - U+DFFF), an attempt is made to combine the result with +the earlier produced bytes, resulting in a 4-byte UTF-8 byte sequence. .PP \fBTcl_UtfToUniChar\fR reads one UTF-8 character starting at \fIsrc\fR and stores it as a Tcl_UniChar in \fI*chPtr\fR. The return value is the number of bytes read from \fIsrc\fR. The caller must ensure that the source buffer is long enough such that this routine does not run off the Index: generic/tcl.h ================================================================== --- generic/tcl.h +++ generic/tcl.h @@ -833,10 +833,17 @@ #define TCL_EXACT 1 #define TCL_INDEX_NULL_OK 32 #define TCL_INDEX_TEMP_TABLE 64 +/* + * Flags that may be passed to Tcl_UniCharToUtf. + * TCL_COMBINE Combine surrogates + */ + +#define TCL_COMBINE 0x1000000 + /* *---------------------------------------------------------------------------- * Flag values passed to Tcl_RecordAndEval, Tcl_EvalObj, Tcl_EvalObjv. * WARNING: these bit choices must not conflict with the bit choices for * evalFlag bits in tclInt.h! Index: generic/tclCmdMZ.c ================================================================== --- generic/tclCmdMZ.c +++ generic/tclCmdMZ.c @@ -1428,13 +1428,15 @@ Tcl_SetObjResult(interp, Tcl_NewByteArrayObj(&uch, 1)); } else { char buf[4] = ""; end = Tcl_UniCharToUtf(ch, buf); +#if TCL_UTF_MAX < 4 if ((ch >= 0xD800) && (end < 3)) { end += Tcl_UniCharToUtf(-1, buf + end); } +#endif Tcl_SetObjResult(interp, Tcl_NewStringObj(buf, end)); } } return TCL_OK; } Index: generic/tclDecls.h ================================================================== --- generic/tclDecls.h +++ generic/tclDecls.h @@ -4014,10 +4014,18 @@ # define Tcl_UtfToUniCharDString Tcl_UtfToChar16DString # undef Tcl_UtfToUniChar # define Tcl_UtfToUniChar Tcl_UtfToChar16 # undef Tcl_UniCharLen # define Tcl_UniCharLen Tcl_Char16Len +# undef Tcl_UniCharToUtf +# if defined(USE_TCL_STUBS) +# define Tcl_UniCharToUtf(c, p) \ + (tclStubsPtr->tcl_UniCharToUtf((c)|TCL_COMBINE, (p))) +# else +# define Tcl_UniCharToUtf(c, p) \ + ((Tcl_UniCharToUtf)((c)|TCL_COMBINE, (p))) +# endif #if !defined(BUILD_tcl) # undef Tcl_NumUtfChars # define Tcl_NumUtfChars TclNumUtfChars # undef Tcl_GetCharLength # define Tcl_GetCharLength TclGetCharLength Index: generic/tclEncoding.c ================================================================== --- generic/tclEncoding.c +++ generic/tclEncoding.c @@ -2314,11 +2314,10 @@ chbuf[0] = UCHAR(*src++); chbuf[1] = 0; TclUtfToUCS4(chbuf, &ch); } dst += Tcl_UniCharToUtf(ch, dst); } else { - int low; const char *saveSrc = src; size_t len = TclUtfToUCS4(src, &ch); if ((len < 2) && (ch != 0) && !(flags & TCL_ENCODING_NOCOMPLAIN) && (flags & TCL_ENCODING_MODIFIED)) { result = TCL_CONVERT_SYNTAX; @@ -2332,35 +2331,39 @@ *dst++ = 0xED; *dst++ = (char) (((ch >> 16) & 0x0F) | 0xA0); *dst++ = (char) (((ch >> 10) & 0x3F) | 0x80); ch = (ch & 0x0CFF) | 0xDC00; } - goto cesu8; +#if TCL_UTF_MAX < 4 + cesu8: +#endif + *dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF); + *dst++ = (char) (((ch >> 6) | 0x80) & 0xBF); + *dst++ = (char) ((ch | 0x80) & 0xBF); + continue; +#if TCL_UTF_MAX < 4 } else if ((ch | 0x7FF) == 0xDFFF) { /* * A surrogate character is detected, handle especially. */ - low = ch; + int low = ch; len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0; if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) { if (!(flags & TCL_ENCODING_NOCOMPLAIN)) { result = TCL_CONVERT_UNKNOWN; src = saveSrc; break; } - cesu8: - *dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF); - *dst++ = (char) (((ch >> 6) | 0x80) & 0xBF); - *dst++ = (char) ((ch | 0x80) & 0xBF); - continue; + goto cesu8; } src += len; dst += Tcl_UniCharToUtf(ch, dst); ch = low; +#endif } else if (!Tcl_UniCharIsUnicode(ch)) { if (!(flags & TCL_ENCODING_NOCOMPLAIN)) { result = TCL_CONVERT_UNKNOWN; src = saveSrc; break; @@ -2665,11 +2668,11 @@ */ if (ch && ch < 0x80) { *dst++ = (ch & 0xFF); } else { - dst += Tcl_UniCharToUtf(ch, dst); + dst += Tcl_UniCharToUtf(ch | TCL_COMBINE, dst); } src += sizeof(unsigned short); } *srcReadPtr = src - srcStart; Index: generic/tclExecute.c ================================================================== --- generic/tclExecute.c +++ generic/tclExecute.c @@ -5130,13 +5130,15 @@ */ if (ch == -1) { TclNewObj(objResultPtr); } else { slength = Tcl_UniCharToUtf(ch, buf); +#if TCL_UTF_MAX < 4 if ((ch >= 0xD800) && (slength < 3)) { slength += Tcl_UniCharToUtf(-1, buf + slength); } +#endif objResultPtr = Tcl_NewStringObj(buf, slength); } } TRACE_APPEND(("\"%s\"\n", O2S(objResultPtr))); Index: generic/tclParse.c ================================================================== --- generic/tclParse.c +++ generic/tclParse.c @@ -867,10 +867,11 @@ if (count == 2) { /* * No hexdigits -> This is just "u". */ result = 'u'; +#if TCL_UTF_MAX < 4 } else if (((result & 0xFC00) == 0xD800) && (count == 6) && (p[5] == '\\') && (p[6] == 'u') && (numBytes >= 10)) { /* If high surrogate is immediately followed by a low surrogate * escape, combine them into one character. */ int low; @@ -877,22 +878,20 @@ int count2 = ParseHex(p+7, 4, &low); if ((count2 == 4) && ((low & 0xFC00) == 0xDC00)) { result = ((result & 0x3FF)<<10 | (low & 0x3FF)) + 0x10000; count += count2 + 2; } +#endif } break; case 'U': count += ParseHex(p+1, (numBytes > 9) ? 8 : numBytes-2, &result); if (count == 2) { /* * No hexdigits -> This is just "U". */ result = 'U'; - } else if ((result | 0x7FF) == 0xDFFF) { - /* Upper or lower surrogate, not allowed in this syntax. */ - result = 0xFFFD; } break; case '\n': count--; do { @@ -952,14 +951,16 @@ done: if (readPtr != NULL) { *readPtr = count; } count = Tcl_UniCharToUtf(result, dst); +#if TCL_UTF_MAX < 4 if ((result >= 0xD800) && (count < 3)) { /* Special case for handling high surrogates. */ count += Tcl_UniCharToUtf(-1, dst + count); } +#endif return count; } /* *---------------------------------------------------------------------- Index: generic/tclStringObj.c ================================================================== --- generic/tclStringObj.c +++ generic/tclStringObj.c @@ -68,13 +68,18 @@ static void SetUnicodeObj(Tcl_Obj *objPtr, const Tcl_UniChar *unicode, size_t numChars); static size_t UnicodeLength(const Tcl_UniChar *unicode); static void UpdateStringOfString(Tcl_Obj *objPtr); +#if TCL_UTF_MAX > 3 +#define ISCONTINUATION(bytes) (\ + ((bytes)[0] & 0xC0) == 0x80) +#else #define ISCONTINUATION(bytes) (\ ((((bytes)[0] & 0xC0) == 0x80) || (((bytes)[0] == '\xED') \ && (((bytes)[1] & 0xF0) == 0xB0) && (((bytes)[2] & 0xC0) == 0x80)))) +#endif /* * The structure below defines the string Tcl object type by means of * functions that can be invoked by generic object code. @@ -2122,14 +2127,16 @@ if (TclGetIntFromObj(interp, segment, &code) != TCL_OK) { goto error; } length = Tcl_UniCharToUtf(code, buf); +#if TCL_UTF_MAX < 4 if ((code >= 0xD800) && (length < 3)) { /* Special case for handling high surrogates. */ length += Tcl_UniCharToUtf(-1, buf + length); } +#endif segment = Tcl_NewStringObj(buf, length); Tcl_IncrRefCount(segment); allocSegment = 1; break; } @@ -4180,18 +4187,10 @@ numAppendChars = 0; } dst = stringPtr->unicode + numOrigChars; if (numAppendChars-- > 0) { bytes += TclUtfToUniChar(bytes, &unichar); -#if TCL_UTF_MAX > 3 - /* join upper/lower surrogate */ - if (bytes && (stringPtr->unicode[numOrigChars - 1] | 0x3FF) == 0xDBFF && (unichar | 0x3FF) == 0xDFFF) { - stringPtr->numChars--; - unichar = ((stringPtr->unicode[numOrigChars - 1] & 0x3FF) << 10) + (unichar & 0x3FF) + 0x10000; - dst--; - } -#endif *dst++ = unichar; while (numAppendChars-- > 0) { bytes += TclUtfToUniChar(bytes, &unichar); *dst++ = unichar; } Index: generic/tclUtf.c ================================================================== --- generic/tclUtf.c +++ generic/tclUtf.c @@ -206,19 +206,27 @@ * None. * *--------------------------------------------------------------------------- */ +#undef Tcl_UniCharToUtf int Tcl_UniCharToUtf( int ch, /* The Tcl_UniChar to be stored in the - * buffer. */ + * buffer. Can be or'ed with flag TCL_COMBINE */ char *buf) /* Buffer in which the UTF-8 representation of * the Tcl_UniChar is stored. Buffer must be * large enough to hold the UTF-8 character * (at most 4 bytes). */ { +#if TCL_UTF_MAX > 3 + int flags = ch; +#endif + + if (ch >= TCL_COMBINE) { + ch &= (TCL_COMBINE - 1); + } if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) { buf[0] = (char) ch; return 1; } if (ch >= 0) { @@ -226,11 +234,15 @@ buf[1] = (char) ((ch | 0x80) & 0xBF); buf[0] = (char) ((ch >> 6) | 0xC0); return 2; } if (ch <= 0xFFFF) { - if ((ch & 0xF800) == 0xD800) { + if ( +#if TCL_UTF_MAX > 3 + (flags & TCL_COMBINE) && +#endif + ((ch & 0xF800) == 0xD800)) { if (ch & 0x0400) { /* Low surrogate */ if (((buf[0] & 0xC0) == 0x80) && ((buf[1] & 0xCF) == 0)) { /* Previous Tcl_UniChar was a high surrogate, so combine */ buf[2] = (char) ((ch & 0x3F) | 0x80); @@ -375,11 +387,11 @@ for (w = uniStr; w < wEnd; ) { if (!len && ((*w & 0xFC00) != 0xDC00)) { /* Special case for handling high surrogates. */ p += Tcl_UniCharToUtf(-1, p); } - len = Tcl_UniCharToUtf(*w, p); + len = Tcl_UniCharToUtf(*w | TCL_COMBINE, p); p += len; if ((*w >= 0xD800) && (len < 3)) { len = 0; /* Indication that high surrogate was found */ } w++; @@ -1346,11 +1358,11 @@ * To keep badly formed Utf strings from getting inflated by the * conversion (thereby causing a segfault), only copy the upper case * char to dst if its size is <= the original char. */ - if ((len < TclUtfCount(upChar)) || ((upChar & ~0x7FF) == 0xD800)) { + if (len < TclUtfCount(upChar)) { memmove(dst, src, len); dst += len; } else { dst += Tcl_UniCharToUtf(upChar, dst); } @@ -1399,11 +1411,11 @@ * To keep badly formed Utf strings from getting inflated by the * conversion (thereby causing a segfault), only copy the lower case * char to dst if its size is <= the original char. */ - if ((len < TclUtfCount(lowChar)) || ((lowChar & ~0x7FF) == 0xD800)) { + if (len < TclUtfCount(lowChar)) { memmove(dst, src, len); dst += len; } else { dst += Tcl_UniCharToUtf(lowChar, dst); } @@ -1449,11 +1461,11 @@ if (*src) { len = TclUtfToUCS4(src, &ch); titleChar = Tcl_UniCharToTitle(ch); - if ((len < TclUtfCount(titleChar)) || ((titleChar & ~0x7FF) == 0xD800)) { + if (len < TclUtfCount(titleChar)) { memmove(dst, src, len); dst += len; } else { dst += Tcl_UniCharToUtf(titleChar, dst); } @@ -1465,11 +1477,11 @@ /* Special exception for Georgian Asomtavruli chars, no titlecase. */ if ((unsigned)(lowChar - 0x1C90) >= 0x30) { lowChar = Tcl_UniCharToLower(lowChar); } - if ((len < TclUtfCount(lowChar)) || ((lowChar & ~0x7FF) == 0xD800)) { + if (len < TclUtfCount(lowChar)) { memmove(dst, src, len); dst += len; } else { dst += Tcl_UniCharToUtf(lowChar, dst); } Index: tests/encoding.test ================================================================== --- tests/encoding.test +++ tests/encoding.test @@ -38,11 +38,14 @@ testConstraint testencoding [llength [info commands testencoding]] testConstraint testbytestring [llength [info commands testbytestring]] testConstraint teststringbytes [llength [info commands teststringbytes]] testConstraint exec [llength [info commands exec]] testConstraint testgetencpath [llength [info commands testgetencpath]] - +testConstraint fullutf [expr {[format %c 0x010000] ne "\uFFFD"}] +testConstraint utf32 [expr {[testConstraint fullutf] + && [string length [format %c 0x10000]] == 1}] + # TclInitEncodingSubsystem is tested by the rest of this file # TclFinalizeEncodingSubsystem is not currently tested test encoding-1.1 {Tcl_GetEncoding: system encoding} -setup { set old [encoding system] @@ -334,78 +337,78 @@ binary scan [teststringbytes $y] H* z set z } c080 test encoding-15.4 {UtfToUtfProc emoji character input} -body { set x \xED\xA0\xBD\xED\xB8\x82 - set y [encoding convertfrom utf-8 \xED\xA0\xBD\xED\xB8\x82] + set y [encoding convertfrom -nocomplain utf-8 \xED\xA0\xBD\xED\xB8\x82] list [string length $x] $y -} -result "6 😂" +} -result "6 \uD83D\uDE02" test encoding-15.5 {UtfToUtfProc emoji character input} { set x \xF0\x9F\x98\x82 set y [encoding convertfrom utf-8 \xF0\x9F\x98\x82] list [string length $x] $y } "4 😂" -test encoding-15.6 {UtfToUtfProc emoji character output} { +test encoding-15.6 {UtfToUtfProc emoji character output} utf32 { set x \uDE02\uD83D\uDE02\uD83D set y [encoding convertto -nocomplain utf-8 \uDE02\uD83D\uDE02\uD83D] binary scan $y H* z list [string length $y] $z -} {10 edb882f09f9882eda0bd} -test encoding-15.7 {UtfToUtfProc emoji character output} { +} {12 efbfbdefbfbdefbfbdefbfbd} +test encoding-15.7 {UtfToUtfProc emoji character output} utf32 { set x \uDE02\uD83D\uD83D set y [encoding convertto -nocomplain utf-8 \uDE02\uD83D\uD83D] binary scan $y H* z list [string length $x] [string length $y] $z -} {3 9 edb882eda0bdeda0bd} -test encoding-15.8 {UtfToUtfProc emoji character output} { +} {3 9 efbfbdefbfbdefbfbd} +test encoding-15.8 {UtfToUtfProc emoji character output} utf32 { set x \uDE02\uD83Dé set y [encoding convertto -nocomplain utf-8 \uDE02\uD83Dé] binary scan $y H* z list [string length $x] [string length $y] $z -} {3 8 edb882eda0bdc3a9} -test encoding-15.9 {UtfToUtfProc emoji character output} { +} {3 8 efbfbdefbfbdc3a9} +test encoding-15.9 {UtfToUtfProc emoji character output} utf32 { set x \uDE02\uD83DX set y [encoding convertto -nocomplain utf-8 \uDE02\uD83DX] binary scan $y H* z list [string length $x] [string length $y] $z -} {3 7 edb882eda0bd58} -test encoding-15.10 {UtfToUtfProc high surrogate character output} { +} {3 7 efbfbdefbfbd58} +test encoding-15.10 {UtfToUtfProc high surrogate character output} utf32 { set x \uDE02é set y [encoding convertto -nocomplain utf-8 \uDE02é] binary scan $y H* z list [string length $x] [string length $y] $z -} {2 5 edb882c3a9} -test encoding-15.11 {UtfToUtfProc low surrogate character output} { +} {2 5 efbfbdc3a9} +test encoding-15.11 {UtfToUtfProc low surrogate character output} utf32 { set x \uDA02é set y [encoding convertto -nocomplain utf-8 \uDA02é] binary scan $y H* z list [string length $x] [string length $y] $z -} {2 5 eda882c3a9} -test encoding-15.12 {UtfToUtfProc high surrogate character output} { +} {2 5 efbfbdc3a9} +test encoding-15.12 {UtfToUtfProc high surrogate character output} utf32 { set x \uDE02Y set y [encoding convertto -nocomplain utf-8 \uDE02Y] binary scan $y H* z list [string length $x] [string length $y] $z -} {2 4 edb88259} -test encoding-15.13 {UtfToUtfProc low surrogate character output} { +} {2 4 efbfbd59} +test encoding-15.13 {UtfToUtfProc low surrogate character output} utf32 { set x \uDA02Y set y [encoding convertto -nocomplain utf-8 \uDA02Y] binary scan $y H* z list [string length $x] [string length $y] $z -} {2 4 eda88259} -test encoding-15.14 {UtfToUtfProc high surrogate character output} { +} {2 4 efbfbd59} +test encoding-15.14 {UtfToUtfProc high surrogate character output} utf32 { set x \uDE02 set y [encoding convertto -nocomplain utf-8 \uDE02] binary scan $y H* z list [string length $x] [string length $y] $z -} {1 3 edb882} -test encoding-15.15 {UtfToUtfProc low surrogate character output} { +} {1 3 efbfbd} +test encoding-15.15 {UtfToUtfProc low surrogate character output} utf32 { set x \uDA02 set y [encoding convertto -nocomplain utf-8 \uDA02] binary scan $y H* z list [string length $x] [string length $y] $z -} {1 3 eda882} +} {1 3 efbfbd} test encoding-15.16 {UtfToUtfProc: Invalid 4-byte UTF-8, see [ed29806ba]} { set x \xF0\xA0\xA1\xC2 set y [encoding convertfrom -nocomplain utf-8 \xF0\xA0\xA1\xC2] list [string length $x] $y } "4 \xF0\xA0\xA1\xC2" Index: tests/string.test ================================================================== --- tests/string.test +++ tests/string.test @@ -32,10 +32,12 @@ testConstraint testobj [expr {[info commands testobj] ne {}}] testConstraint testindexobj [expr {[info commands testindexobj] ne {}}] testConstraint testevalex [expr {[info commands testevalex] ne {}}] testConstraint utf32 [expr {[string length \U010000] == 1}] testConstraint testbytestring [llength [info commands testbytestring]] +testConstraint utf32 [expr {[testConstraint fullutf] + && [string length [format %c 0x10000]] == 1}] # Used for constraining memory leak tests testConstraint memory [llength [info commands memory]] if {[testConstraint memory]} { proc getbytes {} { @@ -1944,17 +1946,17 @@ } "\uF602Hello world!\uF602" test string-21.22.$noComp {string trimright, unicode} { run {string trimright "\uF602Hello world!\uF602" \uD83D\uDE02} } "\uF602Hello world!\uF602" test string-21.23.$noComp {string trim, unicode} { - run {string trim "\uD83D\uDE02Hello world!\uD83D\uDE02" \uD93D\uDE02} + run {string trim "\uD83D\uDE02Hello world!\uD83D\uDE02" \uD93D} } "\uD83D\uDE02Hello world!\uD83D\uDE02" test string-21.24.$noComp {string trimleft, unicode} { run {string trimleft "\uD83D\uDE02Hello world!\uD83D\uDE02" \uD93D\uDE02} } "\uD83D\uDE02Hello world!\uD83D\uDE02" test string-21.25.$noComp {string trimright, unicode} { - run {string trimright "\uD83D\uDE02Hello world!\uD83D\uDE02" \uD93D\uDE02} + run {string trimright "\uD83D\uDE02Hello world!\uD83D\uDE02" \uD93D} } "\uD83D\uDE02Hello world!\uD83D\uDE02" test string-22.1.$noComp {string wordstart} -body { list [catch {run {string word a}} msg] $msg } -result {1 {unknown or ambiguous subcommand "word": must be cat, compare, equal, first, index, insert, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}} @@ -2114,28 +2116,28 @@ } 030201 test string-24.15.$noComp {string reverse command - pure bytearray} { binary scan [run {tcl::string::reverse [binary format H* 010203]}] H* x set x } 030201 -test string-24.16.$noComp {string reverse command - surrogates} { +test string-24.16.$noComp {string reverse command - surrogates} utf32 { run {string reverse \u0444bulb\uD83D\uDE02} -} \uD83D\uDE02blub\u0444 -test string-24.17.$noComp {string reverse command - surrogates} { +} \uDE02\uD83Dblub\u0444 +test string-24.17.$noComp {string reverse command - surrogates} utf32 { run {string reverse \uD83D\uDE02hello\uD83D\uDE02} -} \uD83D\uDE02olleh\uD83D\uDE02 -test string-24.18.$noComp {string reverse command - surrogates} { +} \uDE02\uD83Dolleh\uDE02\uD83D +test string-24.18.$noComp {string reverse command - surrogates} utf32 { set s \u0444bulb\uD83D\uDE02 # shim shimmery ... string index $s 0 run {string reverse $s} -} \uD83D\uDE02blub\u0444 -test string-24.19.$noComp {string reverse command - surrogates} { +} \uDE02\uD83Dblub\u0444 +test string-24.19.$noComp {string reverse command - surrogates} utf32 { set s \uD83D\uDE02hello\uD83D\uDE02 # shim shimmery ... string index $s 0 run {string reverse $s} -} \uD83D\uDE02olleh\uD83D\uDE02 +} \uDE02\uD83Dolleh\uDE02\uD83D test string-25.1.$noComp {string is list} { run {string is list {a b c}} } 1 test string-25.2.$noComp {string is list} { Index: tests/utf.test ================================================================== --- tests/utf.test +++ tests/utf.test @@ -76,15 +76,15 @@ expr {[format %c 0xDC42] eq [testbytestring \xED\xB1\x82]} } 1 test utf-1.12 {Tcl_UniCharToUtf: 4 byte sequence, high/low surrogate} {pairsTo4bytes testbytestring} { expr {"\uD842\uDC42" eq [testbytestring \xF0\xA0\xA1\x82]} } 1 -test utf-1.13.0 {Tcl_UniCharToUtf: Invalid surrogate} {Uesc ucs2} { +test utf-1.13.0 {Tcl_UniCharToUtf: Invalid surrogate} Uesc { expr {"\UD842" eq "\uD842"} } 1 test utf-1.13.1 {Tcl_UniCharToUtf: Invalid surrogate} {fullutf testbytestring} { - expr {"\UD842" eq [testbytestring \xEF\xBF\xBD]} + expr {"\UD842" eq [testbytestring \xED\xA1\x82]} } 1 test utf-1.14 {Tcl_UniCharToUtf: surrogate pairs from concat} { set lo \uDE02 return \uD83D$lo } \uD83D\uDE02 @@ -1112,11 +1112,11 @@ string toupper 𐐨 } 𐐀 test utf-11.7 {Tcl_UtfToUpper beyond U+FFFF} fullutf { string toupper 𐐨 } 𐐀 -test utf-11.8 {Tcl_UtfToUpper low/high surrogate)} { +test utf-11.8 {Tcl_UtfToUpper low/high surrogate)} utf32 { string toupper \uDC24\uD824 } \uDC24\uD824 test utf-12.1 {Tcl_UtfToLower} { string tolower {} @@ -1131,11 +1131,11 @@ string tolower ǢGH } ǣgh test utf-12.5 {Tcl_UtfToLower Georgian (new in Unicode 11)} { string tolower აᲐ } აა -test utf-12.6 {Tcl_UtfToLower low/high surrogate)} { +test utf-12.6 {Tcl_UtfToLower low/high surrogate)} utf32 { string tolower \uDC24\uD824 } \uDC24\uD824 test utf-12.7 {Tcl_UtfToLower beyond U+FFFF} fullutf { string tolower 𐐀 } 𐐨 @@ -1159,11 +1159,11 @@ string totitle აᲐ } აᲐ test utf-13.6 {Tcl_UtfToTitle Georgian (new in Unicode 11)} { string totitle Აა } Აა -test utf-13.7 {Tcl_UtfToTitle low/high surrogate)} { +test utf-13.7 {Tcl_UtfToTitle low/high surrogate)} utf32 { string totitle \uDC24\uD824 } \uDC24\uD824 test utf-13.8 {Tcl_UtfToTitle beyond U+FFFF} fullutf { string totitle 𐐨𐐀 } 𐐀𐐨