Tcl Source Code

Changes On Branch tip-619
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Changes In Branch tip-619 Excluding Merge-Ins

This is equivalent to a diff from ccbe007452 to 631cce8656

2022-07-04
11:21
TIP #619: New TCL_COMBINE flag for Tcl_UniCharToUtf() check-in: 3bb6f6602c user: jan.nijtmans tags: trunk, main
2022-06-10
20:24
Merge 8.7 check-in: 8c31c163af user: jan.nijtmans tags: trunk, main
13:45
Provide help with solving merge conflicts ;-) check-in: 71597277a7 user: jan.nijtmans tags: dgp-refactor
2022-06-09
20:25
Merge 9.0 Closed-Leaf check-in: 631cce8656 user: jan.nijtmans tags: tip-619
16:31
Merge 9.0 check-in: 475b98506a user: jan.nijtmans tags: tip-626
13:45
Internal bug in Tcl_GetIndexFromObjStruct re-definition (works, as long as sizeof(struct) == sizeof(... check-in: ccbe007452 user: jan.nijtmans tags: trunk, main
07:03
More (internal) int -> size_t, allowing values > 2^31 check-in: 5d828b481d user: jan.nijtmans tags: trunk, main
2022-05-19
15:05
Merge 9.0 check-in: 60d39427c4 user: jan.nijtmans tags: tip-619

Changes to doc/Utf.3.

162
163
164
165
166
167
168
169

170
171
172
173

174
175
176
177
178
179
180
consists of a lead byte followed by some number of trail bytes.
.PP
\fBTCL_UTF_MAX\fR is the maximum number of bytes that \fBTcl_UtfToUniChar\fR
can consume in a single call.
.PP
\fBTcl_UniCharToUtf\fR stores the character \fIch\fR as a UTF-8 string
in starting at \fIbuf\fR.  The return value is the number of bytes stored
in \fIbuf\fR. If ch is a high surrogate (range U+D800 - U+DBFF), then

the return value will be 1 and a single byte in the range 0xF0 - 0xF4
will be stored. If you still want to produce UTF-8 output for it (even
though knowing it's an illegal code-point on its own), just call
\fBTcl_UniCharToUtf\fR again specifying ch = -1.

.PP
\fBTcl_UtfToUniChar\fR reads one UTF-8 character starting at \fIsrc\fR
and stores it as a Tcl_UniChar in \fI*chPtr\fR.  The return value is the
number of bytes read from \fIsrc\fR.  The caller must ensure that the
source buffer is long enough such that this routine does not run off the
end and dereference non-existent or random memory; if the source buffer
is known to be null-terminated, this will not happen.  If the input is







|
>
|
|
<
|
>







162
163
164
165
166
167
168
169
170
171
172

173
174
175
176
177
178
179
180
181
consists of a lead byte followed by some number of trail bytes.
.PP
\fBTCL_UTF_MAX\fR is the maximum number of bytes that \fBTcl_UtfToUniChar\fR
can consume in a single call.
.PP
\fBTcl_UniCharToUtf\fR stores the character \fIch\fR as a UTF-8 string
in starting at \fIbuf\fR.  The return value is the number of bytes stored
in \fIbuf\fR. The character \fIch\fR can be or'ed with the value TCL_COMBINE
to enable special behavior, compatible with Tcl 8.x. Then, if ch is a high
surrogate (range U+D800 - U+DBFF), the return value will be 1 and a single
byte in the range 0xF0 - 0xF4 will be stored. If \fIch\fR is a low surrogate

(range U+DC00 - U+DFFF), an attempt is made to combine the result with
the earlier produced bytes, resulting in a 4-byte UTF-8 byte sequence.
.PP
\fBTcl_UtfToUniChar\fR reads one UTF-8 character starting at \fIsrc\fR
and stores it as a Tcl_UniChar in \fI*chPtr\fR.  The return value is the
number of bytes read from \fIsrc\fR.  The caller must ensure that the
source buffer is long enough such that this routine does not run off the
end and dereference non-existent or random memory; if the source buffer
is known to be null-terminated, this will not happen.  If the input is

Changes to generic/tcl.h.

831
832
833
834
835
836
837







838
839
840
841
842
843
844
 *      a table that will not live long enough to make it worthwhile.
 */

#define TCL_EXACT		1
#define TCL_INDEX_NULL_OK	32
#define TCL_INDEX_TEMP_TABLE	64








/*
 *----------------------------------------------------------------------------
 * Flag values passed to Tcl_RecordAndEval, Tcl_EvalObj, Tcl_EvalObjv.
 * WARNING: these bit choices must not conflict with the bit choices for
 * evalFlag bits in tclInt.h!
 *
 * Meanings:







>
>
>
>
>
>
>







831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
 *      a table that will not live long enough to make it worthwhile.
 */

#define TCL_EXACT		1
#define TCL_INDEX_NULL_OK	32
#define TCL_INDEX_TEMP_TABLE	64

/*
 * Flags that may be passed to Tcl_UniCharToUtf.
 * TCL_COMBINE Combine surrogates
 */

#define TCL_COMBINE		0x1000000

/*
 *----------------------------------------------------------------------------
 * Flag values passed to Tcl_RecordAndEval, Tcl_EvalObj, Tcl_EvalObjv.
 * WARNING: these bit choices must not conflict with the bit choices for
 * evalFlag bits in tclInt.h!
 *
 * Meanings:

Changes to generic/tclCmdMZ.c.

1426
1427
1428
1429
1430
1431
1432

1433
1434
1435

1436
1437
1438
1439
1440
1441
1442
	    unsigned char uch = UCHAR(ch);

	    Tcl_SetObjResult(interp, Tcl_NewByteArrayObj(&uch, 1));
	} else {
	    char buf[4] = "";

	    end = Tcl_UniCharToUtf(ch, buf);

	    if ((ch >= 0xD800) && (end < 3)) {
		end += Tcl_UniCharToUtf(-1, buf + end);
	    }

	    Tcl_SetObjResult(interp, Tcl_NewStringObj(buf, end));
	}
    }
    return TCL_OK;
}

/*







>



>







1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
	    unsigned char uch = UCHAR(ch);

	    Tcl_SetObjResult(interp, Tcl_NewByteArrayObj(&uch, 1));
	} else {
	    char buf[4] = "";

	    end = Tcl_UniCharToUtf(ch, buf);
#if TCL_UTF_MAX < 4
	    if ((ch >= 0xD800) && (end < 3)) {
		end += Tcl_UniCharToUtf(-1, buf + end);
	    }
#endif
	    Tcl_SetObjResult(interp, Tcl_NewStringObj(buf, end));
	}
    }
    return TCL_OK;
}

/*

Changes to generic/tclDecls.h.

4012
4013
4014
4015
4016
4017
4018








4019
4020
4021
4022
4023
4024
4025
#   define Tcl_UniCharToUtfDString Tcl_Char16ToUtfDString
#   undef Tcl_UtfToUniCharDString
#   define Tcl_UtfToUniCharDString Tcl_UtfToChar16DString
#   undef Tcl_UtfToUniChar
#   define Tcl_UtfToUniChar Tcl_UtfToChar16
#   undef Tcl_UniCharLen
#   define Tcl_UniCharLen Tcl_Char16Len








#if !defined(BUILD_tcl)
#   undef Tcl_NumUtfChars
#   define Tcl_NumUtfChars TclNumUtfChars
#   undef Tcl_GetCharLength
#   define Tcl_GetCharLength TclGetCharLength
#   undef Tcl_UtfAtIndex
#   define Tcl_UtfAtIndex TclUtfAtIndex







>
>
>
>
>
>
>
>







4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
#   define Tcl_UniCharToUtfDString Tcl_Char16ToUtfDString
#   undef Tcl_UtfToUniCharDString
#   define Tcl_UtfToUniCharDString Tcl_UtfToChar16DString
#   undef Tcl_UtfToUniChar
#   define Tcl_UtfToUniChar Tcl_UtfToChar16
#   undef Tcl_UniCharLen
#   define Tcl_UniCharLen Tcl_Char16Len
#   undef Tcl_UniCharToUtf
#   if defined(USE_TCL_STUBS)
#	define Tcl_UniCharToUtf(c, p) \
		(tclStubsPtr->tcl_UniCharToUtf((c)|TCL_COMBINE, (p)))
#   else
#	define Tcl_UniCharToUtf(c, p) \
		((Tcl_UniCharToUtf)((c)|TCL_COMBINE, (p)))
#   endif
#if !defined(BUILD_tcl)
#   undef Tcl_NumUtfChars
#   define Tcl_NumUtfChars TclNumUtfChars
#   undef Tcl_GetCharLength
#   define Tcl_GetCharLength TclGetCharLength
#   undef Tcl_UtfAtIndex
#   define Tcl_UtfAtIndex TclUtfAtIndex

Changes to generic/tclEncoding.c.

2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336

2337






2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361

2362
2363
2364
2365
2366
2367
2368
	    } else {
		char chbuf[2];
		chbuf[0] = UCHAR(*src++); chbuf[1] = 0;
		TclUtfToUCS4(chbuf, &ch);
	    }
	    dst += Tcl_UniCharToUtf(ch, dst);
	} else {
	    int low;
	    const char *saveSrc = src;
	    size_t len = TclUtfToUCS4(src, &ch);
	    if ((len < 2) && (ch != 0) && !(flags & TCL_ENCODING_NOCOMPLAIN)
		    && (flags & TCL_ENCODING_MODIFIED)) {
		result = TCL_CONVERT_SYNTAX;
		break;
	    }
	    src += len;
	    if (!(flags & TCL_ENCODING_UTF) && (ch > 0x3FF)) {
		if (ch > 0xFFFF) {
		    /* CESU-8 6-byte sequence for chars > U+FFFF */
		    ch -= 0x10000;
		    *dst++ = 0xED;
		    *dst++ = (char) (((ch >> 16) & 0x0F) | 0xA0);
		    *dst++ = (char) (((ch >> 10) & 0x3F) | 0x80);
		    ch = (ch & 0x0CFF) | 0xDC00;
		}

		goto cesu8;






	    } else if ((ch | 0x7FF) == 0xDFFF) {
		/*
		 * A surrogate character is detected, handle especially.
		 */

		low = ch;
		len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0;

		if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) {

		    if (!(flags & TCL_ENCODING_NOCOMPLAIN)) {
			result = TCL_CONVERT_UNKNOWN;
			src = saveSrc;
			break;
		    }
		cesu8:
		    *dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF);
		    *dst++ = (char) (((ch >> 6) | 0x80) & 0xBF);
		    *dst++ = (char) ((ch | 0x80) & 0xBF);
		    continue;
		}
		src += len;
		dst += Tcl_UniCharToUtf(ch, dst);
		ch = low;

	    } else if (!Tcl_UniCharIsUnicode(ch)) {
		if (!(flags & TCL_ENCODING_NOCOMPLAIN)) {
		    result = TCL_CONVERT_UNKNOWN;
		    src = saveSrc;
		    break;
		}
		if (!(flags & TCL_ENCODING_MODIFIED)) {







<

















>
|
>
>
>
>
>
>





|









|
<
<
<
<




>







2312
2313
2314
2315
2316
2317
2318

2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359




2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
	    } else {
		char chbuf[2];
		chbuf[0] = UCHAR(*src++); chbuf[1] = 0;
		TclUtfToUCS4(chbuf, &ch);
	    }
	    dst += Tcl_UniCharToUtf(ch, dst);
	} else {

	    const char *saveSrc = src;
	    size_t len = TclUtfToUCS4(src, &ch);
	    if ((len < 2) && (ch != 0) && !(flags & TCL_ENCODING_NOCOMPLAIN)
		    && (flags & TCL_ENCODING_MODIFIED)) {
		result = TCL_CONVERT_SYNTAX;
		break;
	    }
	    src += len;
	    if (!(flags & TCL_ENCODING_UTF) && (ch > 0x3FF)) {
		if (ch > 0xFFFF) {
		    /* CESU-8 6-byte sequence for chars > U+FFFF */
		    ch -= 0x10000;
		    *dst++ = 0xED;
		    *dst++ = (char) (((ch >> 16) & 0x0F) | 0xA0);
		    *dst++ = (char) (((ch >> 10) & 0x3F) | 0x80);
		    ch = (ch & 0x0CFF) | 0xDC00;
		}
#if TCL_UTF_MAX < 4
	    cesu8:
#endif
		*dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF);
		*dst++ = (char) (((ch >> 6) | 0x80) & 0xBF);
		*dst++ = (char) ((ch | 0x80) & 0xBF);
		continue;
#if TCL_UTF_MAX < 4
	    } else if ((ch | 0x7FF) == 0xDFFF) {
		/*
		 * A surrogate character is detected, handle especially.
		 */

		int low = ch;
		len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0;

		if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) {

		    if (!(flags & TCL_ENCODING_NOCOMPLAIN)) {
			result = TCL_CONVERT_UNKNOWN;
			src = saveSrc;
			break;
		    }
		    goto cesu8;




		}
		src += len;
		dst += Tcl_UniCharToUtf(ch, dst);
		ch = low;
#endif
	    } else if (!Tcl_UniCharIsUnicode(ch)) {
		if (!(flags & TCL_ENCODING_NOCOMPLAIN)) {
		    result = TCL_CONVERT_UNKNOWN;
		    src = saveSrc;
		    break;
		}
		if (!(flags & TCL_ENCODING_MODIFIED)) {
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
	 * Special case for 1-byte utf chars for speed. Make sure we work with
	 * unsigned short-size data.
	 */

	if (ch && ch < 0x80) {
	    *dst++ = (ch & 0xFF);
	} else {
	    dst += Tcl_UniCharToUtf(ch, dst);
	}
	src += sizeof(unsigned short);
    }

    *srcReadPtr = src - srcStart;
    *dstWrotePtr = dst - dstStart;
    *dstCharsPtr = numChars;







|







2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
	 * Special case for 1-byte utf chars for speed. Make sure we work with
	 * unsigned short-size data.
	 */

	if (ch && ch < 0x80) {
	    *dst++ = (ch & 0xFF);
	} else {
	    dst += Tcl_UniCharToUtf(ch | TCL_COMBINE, dst);
	}
	src += sizeof(unsigned short);
    }

    *srcReadPtr = src - srcStart;
    *dstWrotePtr = dst - dstStart;
    *dstCharsPtr = numChars;

Changes to generic/tclExecute.c.

5128
5129
5130
5131
5132
5133
5134

5135
5136
5137

5138
5139
5140
5141
5142
5143
5144
	     * but creating the object as a string seems to be faster in
	     * practical use.
	     */
	    if (ch == -1) {
		TclNewObj(objResultPtr);
	    } else {
		slength = Tcl_UniCharToUtf(ch, buf);

		if ((ch >= 0xD800) && (slength < 3)) {
		    slength += Tcl_UniCharToUtf(-1, buf + slength);
		}

		objResultPtr = Tcl_NewStringObj(buf, slength);
	    }
	}

	TRACE_APPEND(("\"%s\"\n", O2S(objResultPtr)));
	NEXT_INST_F(1, 2, 1);








>



>







5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
	     * but creating the object as a string seems to be faster in
	     * practical use.
	     */
	    if (ch == -1) {
		TclNewObj(objResultPtr);
	    } else {
		slength = Tcl_UniCharToUtf(ch, buf);
#if TCL_UTF_MAX < 4
		if ((ch >= 0xD800) && (slength < 3)) {
		    slength += Tcl_UniCharToUtf(-1, buf + slength);
		}
#endif
		objResultPtr = Tcl_NewStringObj(buf, slength);
	    }
	}

	TRACE_APPEND(("\"%s\"\n", O2S(objResultPtr)));
	NEXT_INST_F(1, 2, 1);

Changes to generic/tclParse.c.

865
866
867
868
869
870
871

872
873
874
875
876
877
878
879
880
881

882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
    case 'u':
	count += ParseHex(p+1, (numBytes > 5) ? 4 : numBytes-2, &result);
	if (count == 2) {
	    /*
	     * No hexdigits -> This is just "u".
	     */
	    result = 'u';

	} else if (((result & 0xFC00) == 0xD800) && (count == 6)
		    && (p[5] == '\\') && (p[6] == 'u') && (numBytes >= 10)) {
	    /* If high surrogate is immediately followed by a low surrogate
	     * escape, combine them into one character. */
	    int low;
	    int count2 = ParseHex(p+7, 4, &low);
	    if ((count2 == 4) && ((low & 0xFC00) == 0xDC00)) {
		result = ((result & 0x3FF)<<10 | (low & 0x3FF)) + 0x10000;
		count += count2 + 2;
	    }

	}
	break;
    case 'U':
	count += ParseHex(p+1, (numBytes > 9) ? 8 : numBytes-2, &result);
	if (count == 2) {
	    /*
	     * No hexdigits -> This is just "U".
	     */
	    result = 'U';
	} else if ((result | 0x7FF) == 0xDFFF) {
	    /* Upper or lower surrogate, not allowed in this syntax. */
	    result = 0xFFFD;
	}
	break;
    case '\n':
	count--;
	do {
	    p++;
	    count++;







>










>









<
<
<







865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892



893
894
895
896
897
898
899
    case 'u':
	count += ParseHex(p+1, (numBytes > 5) ? 4 : numBytes-2, &result);
	if (count == 2) {
	    /*
	     * No hexdigits -> This is just "u".
	     */
	    result = 'u';
#if TCL_UTF_MAX < 4
	} else if (((result & 0xFC00) == 0xD800) && (count == 6)
		    && (p[5] == '\\') && (p[6] == 'u') && (numBytes >= 10)) {
	    /* If high surrogate is immediately followed by a low surrogate
	     * escape, combine them into one character. */
	    int low;
	    int count2 = ParseHex(p+7, 4, &low);
	    if ((count2 == 4) && ((low & 0xFC00) == 0xDC00)) {
		result = ((result & 0x3FF)<<10 | (low & 0x3FF)) + 0x10000;
		count += count2 + 2;
	    }
#endif
	}
	break;
    case 'U':
	count += ParseHex(p+1, (numBytes > 9) ? 8 : numBytes-2, &result);
	if (count == 2) {
	    /*
	     * No hexdigits -> This is just "U".
	     */
	    result = 'U';



	}
	break;
    case '\n':
	count--;
	do {
	    p++;
	    count++;
950
951
952
953
954
955
956

957
958
959
960

961
962
963
964
965
966
967
    }

  done:
    if (readPtr != NULL) {
	*readPtr = count;
    }
    count = Tcl_UniCharToUtf(result, dst);

    if ((result >= 0xD800) && (count < 3)) {
	/* Special case for handling high surrogates. */
	count += Tcl_UniCharToUtf(-1, dst + count);
    }

    return count;
}

/*
 *----------------------------------------------------------------------
 *
 * ParseComment --







>




>







949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
    }

  done:
    if (readPtr != NULL) {
	*readPtr = count;
    }
    count = Tcl_UniCharToUtf(result, dst);
#if TCL_UTF_MAX < 4
    if ((result >= 0xD800) && (count < 3)) {
	/* Special case for handling high surrogates. */
	count += Tcl_UniCharToUtf(-1, dst + count);
    }
#endif
    return count;
}

/*
 *----------------------------------------------------------------------
 *
 * ParseComment --

Changes to generic/tclStringObj.c.

66
67
68
69
70
71
72




73
74
75

76
77
78
79
80
81
82
static void		GrowUnicodeBuffer(Tcl_Obj *objPtr, size_t needed);
static int		SetStringFromAny(Tcl_Interp *interp, Tcl_Obj *objPtr);
static void		SetUnicodeObj(Tcl_Obj *objPtr,
			    const Tcl_UniChar *unicode, size_t numChars);
static size_t		UnicodeLength(const Tcl_UniChar *unicode);
static void		UpdateStringOfString(Tcl_Obj *objPtr);





#define ISCONTINUATION(bytes) (\
	((((bytes)[0] & 0xC0) == 0x80) || (((bytes)[0] == '\xED') \
	&& (((bytes)[1] & 0xF0) == 0xB0) && (((bytes)[2] & 0xC0) == 0x80))))



/*
 * The structure below defines the string Tcl object type by means of
 * functions that can be invoked by generic object code.
 */








>
>
>
>



>







66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
static void		GrowUnicodeBuffer(Tcl_Obj *objPtr, size_t needed);
static int		SetStringFromAny(Tcl_Interp *interp, Tcl_Obj *objPtr);
static void		SetUnicodeObj(Tcl_Obj *objPtr,
			    const Tcl_UniChar *unicode, size_t numChars);
static size_t		UnicodeLength(const Tcl_UniChar *unicode);
static void		UpdateStringOfString(Tcl_Obj *objPtr);

#if TCL_UTF_MAX > 3
#define ISCONTINUATION(bytes) (\
	((bytes)[0] & 0xC0) == 0x80)
#else
#define ISCONTINUATION(bytes) (\
	((((bytes)[0] & 0xC0) == 0x80) || (((bytes)[0] == '\xED') \
	&& (((bytes)[1] & 0xF0) == 0xB0) && (((bytes)[2] & 0xC0) == 0x80))))
#endif


/*
 * The structure below defines the string Tcl object type by means of
 * functions that can be invoked by generic object code.
 */

2120
2121
2122
2123
2124
2125
2126

2127
2128
2129
2130

2131
2132
2133
2134
2135
2136
2137
	    char buf[4] = "";
	    int code, length;

	    if (TclGetIntFromObj(interp, segment, &code) != TCL_OK) {
		goto error;
	    }
	    length = Tcl_UniCharToUtf(code, buf);

	    if ((code >= 0xD800) && (length < 3)) {
		/* Special case for handling high surrogates. */
		length += Tcl_UniCharToUtf(-1, buf + length);
	    }

	    segment = Tcl_NewStringObj(buf, length);
	    Tcl_IncrRefCount(segment);
	    allocSegment = 1;
	    break;
	}

	case 'u':







>




>







2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
	    char buf[4] = "";
	    int code, length;

	    if (TclGetIntFromObj(interp, segment, &code) != TCL_OK) {
		goto error;
	    }
	    length = Tcl_UniCharToUtf(code, buf);
#if TCL_UTF_MAX < 4
	    if ((code >= 0xD800) && (length < 3)) {
		/* Special case for handling high surrogates. */
		length += Tcl_UniCharToUtf(-1, buf + length);
	    }
#endif
	    segment = Tcl_NewStringObj(buf, length);
	    Tcl_IncrRefCount(segment);
	    allocSegment = 1;
	    break;
	}

	case 'u':
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
	stringPtr->numChars = needed;
    } else {
	numAppendChars = 0;
    }
    dst = stringPtr->unicode + numOrigChars;
    if (numAppendChars-- > 0) {
	bytes += TclUtfToUniChar(bytes, &unichar);
#if TCL_UTF_MAX > 3
	/* join upper/lower surrogate */
	if (bytes && (stringPtr->unicode[numOrigChars - 1] | 0x3FF) == 0xDBFF && (unichar | 0x3FF) == 0xDFFF) {
		stringPtr->numChars--;
		unichar = ((stringPtr->unicode[numOrigChars - 1] & 0x3FF) << 10) + (unichar & 0x3FF) + 0x10000;
		dst--;
	}
#endif
	*dst++ = unichar;
	while (numAppendChars-- > 0) {
	    bytes += TclUtfToUniChar(bytes, &unichar);
	    *dst++ = unichar;
	}
    }
    *dst = 0;







<
<
<
<
<
<
<
<







4185
4186
4187
4188
4189
4190
4191








4192
4193
4194
4195
4196
4197
4198
	stringPtr->numChars = needed;
    } else {
	numAppendChars = 0;
    }
    dst = stringPtr->unicode + numOrigChars;
    if (numAppendChars-- > 0) {
	bytes += TclUtfToUniChar(bytes, &unichar);








	*dst++ = unichar;
	while (numAppendChars-- > 0) {
	    bytes += TclUtfToUniChar(bytes, &unichar);
	    *dst++ = unichar;
	}
    }
    *dst = 0;

Changes to generic/tclUtf.c.

204
205
206
207
208
209
210

211
212
213
214
215
216
217
218
219







220
221
222
223
224
225
226
227
228
229
230




231
232
233
234
235
236
237
238
 *
 * Side effects:
 *	None.
 *
 *---------------------------------------------------------------------------
 */


int
Tcl_UniCharToUtf(
    int ch,			/* The Tcl_UniChar to be stored in the
				 * buffer. */
    char *buf)			/* Buffer in which the UTF-8 representation of
				 * the Tcl_UniChar is stored. Buffer must be
				 * large enough to hold the UTF-8 character
				 * (at most 4 bytes). */
{







    if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) {
	buf[0] = (char) ch;
	return 1;
    }
    if (ch >= 0) {
	if (ch <= 0x7FF) {
	    buf[1] = (char) ((ch | 0x80) & 0xBF);
	    buf[0] = (char) ((ch >> 6) | 0xC0);
	    return 2;
	}
	if (ch <= 0xFFFF) {




	    if ((ch & 0xF800) == 0xD800) {
		if (ch & 0x0400) {
		    /* Low surrogate */
		    if (((buf[0] & 0xC0) == 0x80) && ((buf[1] & 0xCF) == 0)) {
			/* Previous Tcl_UniChar was a high surrogate, so combine */
			buf[2] = (char) ((ch & 0x3F) | 0x80);
			buf[1] |= (char) (((ch >> 6) & 0x0F) | 0x80);
			return 3;







>



|





>
>
>
>
>
>
>











>
>
>
>
|







204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
 *
 * Side effects:
 *	None.
 *
 *---------------------------------------------------------------------------
 */

#undef Tcl_UniCharToUtf
int
Tcl_UniCharToUtf(
    int ch,			/* The Tcl_UniChar to be stored in the
				 * buffer. Can be or'ed with flag TCL_COMBINE */
    char *buf)			/* Buffer in which the UTF-8 representation of
				 * the Tcl_UniChar is stored. Buffer must be
				 * large enough to hold the UTF-8 character
				 * (at most 4 bytes). */
{
#if TCL_UTF_MAX > 3
    int flags = ch;
#endif

    if (ch >= TCL_COMBINE) {
	ch &= (TCL_COMBINE - 1);
    }
    if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) {
	buf[0] = (char) ch;
	return 1;
    }
    if (ch >= 0) {
	if (ch <= 0x7FF) {
	    buf[1] = (char) ((ch | 0x80) & 0xBF);
	    buf[0] = (char) ((ch >> 6) | 0xC0);
	    return 2;
	}
	if (ch <= 0xFFFF) {
	    if (
#if TCL_UTF_MAX > 3
		    (flags & TCL_COMBINE) &&
#endif
		    ((ch & 0xF800) == 0xD800)) {
		if (ch & 0x0400) {
		    /* Low surrogate */
		    if (((buf[0] & 0xC0) == 0x80) && ((buf[1] & 0xCF) == 0)) {
			/* Previous Tcl_UniChar was a high surrogate, so combine */
			buf[2] = (char) ((ch & 0x3F) | 0x80);
			buf[1] |= (char) (((ch >> 6) & 0x0F) | 0x80);
			return 3;
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
    p = string;
    wEnd = uniStr + uniLength;
    for (w = uniStr; w < wEnd; ) {
	if (!len && ((*w & 0xFC00) != 0xDC00)) {
	    /* Special case for handling high surrogates. */
	    p += Tcl_UniCharToUtf(-1, p);
	}
	len = Tcl_UniCharToUtf(*w, p);
	p += len;
	if ((*w >= 0xD800) && (len < 3)) {
	    len = 0; /* Indication that high surrogate was found */
	}
	w++;
    }
    if (!len) {







|







385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
    p = string;
    wEnd = uniStr + uniLength;
    for (w = uniStr; w < wEnd; ) {
	if (!len && ((*w & 0xFC00) != 0xDC00)) {
	    /* Special case for handling high surrogates. */
	    p += Tcl_UniCharToUtf(-1, p);
	}
	len = Tcl_UniCharToUtf(*w | TCL_COMBINE, p);
	p += len;
	if ((*w >= 0xD800) && (len < 3)) {
	    len = 0; /* Indication that high surrogate was found */
	}
	w++;
    }
    if (!len) {
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358

	/*
	 * To keep badly formed Utf strings from getting inflated by the
	 * conversion (thereby causing a segfault), only copy the upper case
	 * char to dst if its size is <= the original char.
	 */

	if ((len < TclUtfCount(upChar)) || ((upChar & ~0x7FF) == 0xD800)) {
	    memmove(dst, src, len);
	    dst += len;
	} else {
	    dst += Tcl_UniCharToUtf(upChar, dst);
	}
	src += len;
    }







|







1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370

	/*
	 * To keep badly formed Utf strings from getting inflated by the
	 * conversion (thereby causing a segfault), only copy the upper case
	 * char to dst if its size is <= the original char.
	 */

	if (len < TclUtfCount(upChar)) {
	    memmove(dst, src, len);
	    dst += len;
	} else {
	    dst += Tcl_UniCharToUtf(upChar, dst);
	}
	src += len;
    }
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411

	/*
	 * To keep badly formed Utf strings from getting inflated by the
	 * conversion (thereby causing a segfault), only copy the lower case
	 * char to dst if its size is <= the original char.
	 */

	if ((len < TclUtfCount(lowChar)) || ((lowChar & ~0x7FF) == 0xD800)) {
	    memmove(dst, src, len);
	    dst += len;
	} else {
	    dst += Tcl_UniCharToUtf(lowChar, dst);
	}
	src += len;
    }







|







1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423

	/*
	 * To keep badly formed Utf strings from getting inflated by the
	 * conversion (thereby causing a segfault), only copy the lower case
	 * char to dst if its size is <= the original char.
	 */

	if (len < TclUtfCount(lowChar)) {
	    memmove(dst, src, len);
	    dst += len;
	} else {
	    dst += Tcl_UniCharToUtf(lowChar, dst);
	}
	src += len;
    }
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477

    src = dst = str;

    if (*src) {
	len = TclUtfToUCS4(src, &ch);
	titleChar = Tcl_UniCharToTitle(ch);

	if ((len < TclUtfCount(titleChar)) || ((titleChar & ~0x7FF) == 0xD800)) {
	    memmove(dst, src, len);
	    dst += len;
	} else {
	    dst += Tcl_UniCharToUtf(titleChar, dst);
	}
	src += len;
    }
    while (*src) {
	len = TclUtfToUCS4(src, &ch);
	lowChar = ch;
	/* Special exception for Georgian Asomtavruli chars, no titlecase. */
	if ((unsigned)(lowChar - 0x1C90) >= 0x30) {
	    lowChar = Tcl_UniCharToLower(lowChar);
	}

	if ((len < TclUtfCount(lowChar)) || ((lowChar & ~0x7FF) == 0xD800)) {
	    memmove(dst, src, len);
	    dst += len;
	} else {
	    dst += Tcl_UniCharToUtf(lowChar, dst);
	}
	src += len;
    }







|















|







1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489

    src = dst = str;

    if (*src) {
	len = TclUtfToUCS4(src, &ch);
	titleChar = Tcl_UniCharToTitle(ch);

	if (len < TclUtfCount(titleChar)) {
	    memmove(dst, src, len);
	    dst += len;
	} else {
	    dst += Tcl_UniCharToUtf(titleChar, dst);
	}
	src += len;
    }
    while (*src) {
	len = TclUtfToUCS4(src, &ch);
	lowChar = ch;
	/* Special exception for Georgian Asomtavruli chars, no titlecase. */
	if ((unsigned)(lowChar - 0x1C90) >= 0x30) {
	    lowChar = Tcl_UniCharToLower(lowChar);
	}

	if (len < TclUtfCount(lowChar)) {
	    memmove(dst, src, len);
	    dst += len;
	} else {
	    dst += Tcl_UniCharToUtf(lowChar, dst);
	}
	src += len;
    }

Changes to tests/encoding.test.

36
37
38
39
40
41
42



43
44
45
46
47
48
49
50

# Some tests require the testencoding command
testConstraint testencoding [llength [info commands testencoding]]
testConstraint testbytestring [llength [info commands testbytestring]]
testConstraint teststringbytes [llength [info commands teststringbytes]]
testConstraint exec [llength [info commands exec]]
testConstraint testgetencpath [llength [info commands testgetencpath]]




# TclInitEncodingSubsystem is tested by the rest of this file
# TclFinalizeEncodingSubsystem is not currently tested

test encoding-1.1 {Tcl_GetEncoding: system encoding} -setup {
    set old [encoding system]
} -constraints {testencoding} -body {
    testencoding create foo [namespace origin toutf] [namespace origin fromutf]







>
>
>
|







36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53

# Some tests require the testencoding command
testConstraint testencoding [llength [info commands testencoding]]
testConstraint testbytestring [llength [info commands testbytestring]]
testConstraint teststringbytes [llength [info commands teststringbytes]]
testConstraint exec [llength [info commands exec]]
testConstraint testgetencpath [llength [info commands testgetencpath]]
testConstraint fullutf [expr {[format %c 0x010000] ne "\uFFFD"}]
testConstraint utf32 [expr {[testConstraint fullutf]
		&& [string length [format %c 0x10000]] == 1}]

# TclInitEncodingSubsystem is tested by the rest of this file
# TclFinalizeEncodingSubsystem is not currently tested

test encoding-1.1 {Tcl_GetEncoding: system encoding} -setup {
    set old [encoding system]
} -constraints {testencoding} -body {
    testencoding create foo [namespace origin toutf] [namespace origin fromutf]
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
test encoding-15.3 {UtfToUtfProc null character input} teststringbytes {
    set y [encoding convertfrom utf-8 [encoding convertto utf-8 \x00]]
    binary scan [teststringbytes $y] H* z
    set z
} c080
test encoding-15.4 {UtfToUtfProc emoji character input} -body {
    set x \xED\xA0\xBD\xED\xB8\x82
    set y [encoding convertfrom utf-8 \xED\xA0\xBD\xED\xB8\x82]
    list [string length $x] $y
} -result "6 😂"
test encoding-15.5 {UtfToUtfProc emoji character input} {
    set x \xF0\x9F\x98\x82
    set y [encoding convertfrom utf-8 \xF0\x9F\x98\x82]
    list [string length $x] $y
} "4 😂"
test encoding-15.6 {UtfToUtfProc emoji character output} {
    set x \uDE02\uD83D\uDE02\uD83D
    set y [encoding convertto -nocomplain utf-8 \uDE02\uD83D\uDE02\uD83D]
    binary scan $y H* z
    list [string length $y] $z
} {10 edb882f09f9882eda0bd}
test encoding-15.7 {UtfToUtfProc emoji character output} {
    set x \uDE02\uD83D\uD83D
    set y [encoding convertto -nocomplain utf-8 \uDE02\uD83D\uD83D]
    binary scan $y H* z
    list [string length $x] [string length $y] $z
} {3 9 edb882eda0bdeda0bd}
test encoding-15.8 {UtfToUtfProc emoji character output} {
    set x \uDE02\uD83Dé
    set y [encoding convertto -nocomplain utf-8 \uDE02\uD83Dé]
    binary scan $y H* z
    list [string length $x] [string length $y] $z
} {3 8 edb882eda0bdc3a9}
test encoding-15.9 {UtfToUtfProc emoji character output} {
    set x \uDE02\uD83DX
    set y [encoding convertto -nocomplain utf-8 \uDE02\uD83DX]
    binary scan $y H* z
    list [string length $x] [string length $y] $z
} {3 7 edb882eda0bd58}
test encoding-15.10 {UtfToUtfProc high surrogate character output} {
    set x \uDE02é
    set y [encoding convertto -nocomplain utf-8 \uDE02é]
    binary scan $y H* z
    list [string length $x] [string length $y] $z
} {2 5 edb882c3a9}
test encoding-15.11 {UtfToUtfProc low surrogate character output} {
    set x \uDA02é
    set y [encoding convertto -nocomplain utf-8 \uDA02é]
    binary scan $y H* z
    list [string length $x] [string length $y] $z
} {2 5 eda882c3a9}
test encoding-15.12 {UtfToUtfProc high surrogate character output} {
    set x \uDE02Y
    set y [encoding convertto -nocomplain utf-8 \uDE02Y]
    binary scan $y H* z
    list [string length $x] [string length $y] $z
} {2 4 edb88259}
test encoding-15.13 {UtfToUtfProc low surrogate character output} {
    set x \uDA02Y
    set y [encoding convertto -nocomplain utf-8 \uDA02Y]
    binary scan $y H* z
    list [string length $x] [string length $y] $z
} {2 4 eda88259}
test encoding-15.14 {UtfToUtfProc high surrogate character output} {
    set x \uDE02
    set y [encoding convertto -nocomplain utf-8 \uDE02]
    binary scan $y H* z
    list [string length $x] [string length $y] $z
} {1 3 edb882}
test encoding-15.15 {UtfToUtfProc low surrogate character output} {
    set x \uDA02
    set y [encoding convertto -nocomplain utf-8 \uDA02]
    binary scan $y H* z
    list [string length $x] [string length $y] $z
} {1 3 eda882}
test encoding-15.16 {UtfToUtfProc: Invalid 4-byte UTF-8, see [ed29806ba]} {
    set x \xF0\xA0\xA1\xC2
    set y [encoding convertfrom -nocomplain utf-8 \xF0\xA0\xA1\xC2]
    list [string length $x] $y
} "4 \xF0\xA0\xA1\xC2"
test encoding-15.17 {UtfToUtfProc emoji character output} {
    set x 😂







|

|





|




|
|




|
|




|
|




|
|




|
|




|
|




|
|




|
|




|
|




|







335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
test encoding-15.3 {UtfToUtfProc null character input} teststringbytes {
    set y [encoding convertfrom utf-8 [encoding convertto utf-8 \x00]]
    binary scan [teststringbytes $y] H* z
    set z
} c080
test encoding-15.4 {UtfToUtfProc emoji character input} -body {
    set x \xED\xA0\xBD\xED\xB8\x82
    set y [encoding convertfrom -nocomplain utf-8 \xED\xA0\xBD\xED\xB8\x82]
    list [string length $x] $y
} -result "6 \uD83D\uDE02"
test encoding-15.5 {UtfToUtfProc emoji character input} {
    set x \xF0\x9F\x98\x82
    set y [encoding convertfrom utf-8 \xF0\x9F\x98\x82]
    list [string length $x] $y
} "4 😂"
test encoding-15.6 {UtfToUtfProc emoji character output} utf32 {
    set x \uDE02\uD83D\uDE02\uD83D
    set y [encoding convertto -nocomplain utf-8 \uDE02\uD83D\uDE02\uD83D]
    binary scan $y H* z
    list [string length $y] $z
} {12 efbfbdefbfbdefbfbdefbfbd}
test encoding-15.7 {UtfToUtfProc emoji character output} utf32 {
    set x \uDE02\uD83D\uD83D
    set y [encoding convertto -nocomplain utf-8 \uDE02\uD83D\uD83D]
    binary scan $y H* z
    list [string length $x] [string length $y] $z
} {3 9 efbfbdefbfbdefbfbd}
test encoding-15.8 {UtfToUtfProc emoji character output} utf32 {
    set x \uDE02\uD83Dé
    set y [encoding convertto -nocomplain utf-8 \uDE02\uD83Dé]
    binary scan $y H* z
    list [string length $x] [string length $y] $z
} {3 8 efbfbdefbfbdc3a9}
test encoding-15.9 {UtfToUtfProc emoji character output} utf32 {
    set x \uDE02\uD83DX
    set y [encoding convertto -nocomplain utf-8 \uDE02\uD83DX]
    binary scan $y H* z
    list [string length $x] [string length $y] $z
} {3 7 efbfbdefbfbd58}
test encoding-15.10 {UtfToUtfProc high surrogate character output} utf32 {
    set x \uDE02é
    set y [encoding convertto -nocomplain utf-8 \uDE02é]
    binary scan $y H* z
    list [string length $x] [string length $y] $z
} {2 5 efbfbdc3a9}
test encoding-15.11 {UtfToUtfProc low surrogate character output} utf32 {
    set x \uDA02é
    set y [encoding convertto -nocomplain utf-8 \uDA02é]
    binary scan $y H* z
    list [string length $x] [string length $y] $z
} {2 5 efbfbdc3a9}
test encoding-15.12 {UtfToUtfProc high surrogate character output} utf32 {
    set x \uDE02Y
    set y [encoding convertto -nocomplain utf-8 \uDE02Y]
    binary scan $y H* z
    list [string length $x] [string length $y] $z
} {2 4 efbfbd59}
test encoding-15.13 {UtfToUtfProc low surrogate character output} utf32 {
    set x \uDA02Y
    set y [encoding convertto -nocomplain utf-8 \uDA02Y]
    binary scan $y H* z
    list [string length $x] [string length $y] $z
} {2 4 efbfbd59}
test encoding-15.14 {UtfToUtfProc high surrogate character output} utf32 {
    set x \uDE02
    set y [encoding convertto -nocomplain utf-8 \uDE02]
    binary scan $y H* z
    list [string length $x] [string length $y] $z
} {1 3 efbfbd}
test encoding-15.15 {UtfToUtfProc low surrogate character output} utf32 {
    set x \uDA02
    set y [encoding convertto -nocomplain utf-8 \uDA02]
    binary scan $y H* z
    list [string length $x] [string length $y] $z
} {1 3 efbfbd}
test encoding-15.16 {UtfToUtfProc: Invalid 4-byte UTF-8, see [ed29806ba]} {
    set x \xF0\xA0\xA1\xC2
    set y [encoding convertfrom -nocomplain utf-8 \xF0\xA0\xA1\xC2]
    list [string length $x] $y
} "4 \xF0\xA0\xA1\xC2"
test encoding-15.17 {UtfToUtfProc emoji character output} {
    set x 😂

Changes to tests/string.test.

30
31
32
33
34
35
36


37
38
39
40
41
42
43
# Some tests require the testobj command

testConstraint testobj [expr {[info commands testobj] ne {}}]
testConstraint testindexobj [expr {[info commands testindexobj] ne {}}]
testConstraint testevalex [expr {[info commands testevalex] ne {}}]
testConstraint utf32 [expr {[string length \U010000] == 1}]
testConstraint testbytestring   [llength [info commands testbytestring]]



# Used for constraining memory leak tests
testConstraint memory [llength [info commands memory]]
if {[testConstraint memory]} {
    proc getbytes {} {
        set lines [split [memory info] \n]
        return [lindex $lines 3 3]







>
>







30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# Some tests require the testobj command

testConstraint testobj [expr {[info commands testobj] ne {}}]
testConstraint testindexobj [expr {[info commands testindexobj] ne {}}]
testConstraint testevalex [expr {[info commands testevalex] ne {}}]
testConstraint utf32 [expr {[string length \U010000] == 1}]
testConstraint testbytestring   [llength [info commands testbytestring]]
testConstraint utf32 [expr {[testConstraint fullutf]
		&& [string length [format %c 0x10000]] == 1}]

# Used for constraining memory leak tests
testConstraint memory [llength [info commands memory]]
if {[testConstraint memory]} {
    proc getbytes {} {
        set lines [split [memory info] \n]
        return [lindex $lines 3 3]
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
test string-21.21.$noComp {string trimleft, unicode} {
    run {string trimleft "\uF602Hello world!\uF602" \uD83D\uDE02}
} "\uF602Hello world!\uF602"
test string-21.22.$noComp {string trimright, unicode} {
    run {string trimright "\uF602Hello world!\uF602" \uD83D\uDE02}
} "\uF602Hello world!\uF602"
test string-21.23.$noComp {string trim, unicode} {
    run {string trim "\uD83D\uDE02Hello world!\uD83D\uDE02" \uD93D\uDE02}
} "\uD83D\uDE02Hello world!\uD83D\uDE02"
test string-21.24.$noComp {string trimleft, unicode} {
    run {string trimleft "\uD83D\uDE02Hello world!\uD83D\uDE02" \uD93D\uDE02}
} "\uD83D\uDE02Hello world!\uD83D\uDE02"
test string-21.25.$noComp {string trimright, unicode} {
    run {string trimright "\uD83D\uDE02Hello world!\uD83D\uDE02" \uD93D\uDE02}
} "\uD83D\uDE02Hello world!\uD83D\uDE02"

test string-22.1.$noComp {string wordstart} -body {
    list [catch {run {string word a}} msg] $msg
} -result {1 {unknown or ambiguous subcommand "word": must be cat, compare, equal, first, index, insert, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}}
test string-22.2.$noComp {string wordstart} -body {
    list [catch {run {string wordstart a}} msg] $msg







|





|







1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
test string-21.21.$noComp {string trimleft, unicode} {
    run {string trimleft "\uF602Hello world!\uF602" \uD83D\uDE02}
} "\uF602Hello world!\uF602"
test string-21.22.$noComp {string trimright, unicode} {
    run {string trimright "\uF602Hello world!\uF602" \uD83D\uDE02}
} "\uF602Hello world!\uF602"
test string-21.23.$noComp {string trim, unicode} {
    run {string trim "\uD83D\uDE02Hello world!\uD83D\uDE02" \uD93D}
} "\uD83D\uDE02Hello world!\uD83D\uDE02"
test string-21.24.$noComp {string trimleft, unicode} {
    run {string trimleft "\uD83D\uDE02Hello world!\uD83D\uDE02" \uD93D\uDE02}
} "\uD83D\uDE02Hello world!\uD83D\uDE02"
test string-21.25.$noComp {string trimright, unicode} {
    run {string trimright "\uD83D\uDE02Hello world!\uD83D\uDE02" \uD93D}
} "\uD83D\uDE02Hello world!\uD83D\uDE02"

test string-22.1.$noComp {string wordstart} -body {
    list [catch {run {string word a}} msg] $msg
} -result {1 {unknown or ambiguous subcommand "word": must be cat, compare, equal, first, index, insert, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}}
test string-22.2.$noComp {string wordstart} -body {
    list [catch {run {string wordstart a}} msg] $msg
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
    binary scan [run {string reverse [binary format H* 010203]}] H* x
    set x
} 030201
test string-24.15.$noComp {string reverse command - pure bytearray} {
    binary scan [run {tcl::string::reverse [binary format H* 010203]}] H* x
    set x
} 030201
test string-24.16.$noComp {string reverse command - surrogates} {
    run {string reverse \u0444bulb\uD83D\uDE02}
} \uD83D\uDE02blub\u0444
test string-24.17.$noComp {string reverse command - surrogates} {
    run {string reverse \uD83D\uDE02hello\uD83D\uDE02}
} \uD83D\uDE02olleh\uD83D\uDE02
test string-24.18.$noComp {string reverse command - surrogates} {
    set s \u0444bulb\uD83D\uDE02
    # shim shimmery ...
    string index $s 0
    run {string reverse $s}
} \uD83D\uDE02blub\u0444
test string-24.19.$noComp {string reverse command - surrogates} {
    set s \uD83D\uDE02hello\uD83D\uDE02
    # shim shimmery ...
    string index $s 0
    run {string reverse $s}
} \uD83D\uDE02olleh\uD83D\uDE02

test string-25.1.$noComp {string is list} {
    run {string is list {a b c}}
} 1
test string-25.2.$noComp {string is list} {
    run {string is list "a \{b c"}
} 0







|

|
|

|
|




|
|




|







2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
    binary scan [run {string reverse [binary format H* 010203]}] H* x
    set x
} 030201
test string-24.15.$noComp {string reverse command - pure bytearray} {
    binary scan [run {tcl::string::reverse [binary format H* 010203]}] H* x
    set x
} 030201
test string-24.16.$noComp {string reverse command - surrogates} utf32 {
    run {string reverse \u0444bulb\uD83D\uDE02}
} \uDE02\uD83Dblub\u0444
test string-24.17.$noComp {string reverse command - surrogates} utf32 {
    run {string reverse \uD83D\uDE02hello\uD83D\uDE02}
} \uDE02\uD83Dolleh\uDE02\uD83D
test string-24.18.$noComp {string reverse command - surrogates} utf32 {
    set s \u0444bulb\uD83D\uDE02
    # shim shimmery ...
    string index $s 0
    run {string reverse $s}
} \uDE02\uD83Dblub\u0444
test string-24.19.$noComp {string reverse command - surrogates} utf32 {
    set s \uD83D\uDE02hello\uD83D\uDE02
    # shim shimmery ...
    string index $s 0
    run {string reverse $s}
} \uDE02\uD83Dolleh\uDE02\uD83D

test string-25.1.$noComp {string is list} {
    run {string is list {a b c}}
} 1
test string-25.2.$noComp {string is list} {
    run {string is list "a \{b c"}
} 0

Changes to tests/utf.test.

74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
} 1
test utf-1.11 {Tcl_UniCharToUtf: 3 byte sequence, low surrogate} testbytestring {
    expr {[format %c 0xDC42] eq [testbytestring \xED\xB1\x82]}
} 1
test utf-1.12 {Tcl_UniCharToUtf: 4 byte sequence, high/low surrogate} {pairsTo4bytes testbytestring} {
    expr {"\uD842\uDC42" eq [testbytestring \xF0\xA0\xA1\x82]}
} 1
test utf-1.13.0 {Tcl_UniCharToUtf: Invalid surrogate} {Uesc ucs2} {
    expr {"\UD842" eq "\uD842"}
} 1
test utf-1.13.1 {Tcl_UniCharToUtf: Invalid surrogate} {fullutf testbytestring} {
    expr {"\UD842" eq [testbytestring \xEF\xBF\xBD]}
} 1
test utf-1.14 {Tcl_UniCharToUtf: surrogate pairs from concat} {
    set lo \uDE02
    return \uD83D$lo
} \uD83D\uDE02
test utf-1.15 {Tcl_UniCharToUtf: surrogate pairs from concat} {
    set hi \uD83D







|



|







74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
} 1
test utf-1.11 {Tcl_UniCharToUtf: 3 byte sequence, low surrogate} testbytestring {
    expr {[format %c 0xDC42] eq [testbytestring \xED\xB1\x82]}
} 1
test utf-1.12 {Tcl_UniCharToUtf: 4 byte sequence, high/low surrogate} {pairsTo4bytes testbytestring} {
    expr {"\uD842\uDC42" eq [testbytestring \xF0\xA0\xA1\x82]}
} 1
test utf-1.13.0 {Tcl_UniCharToUtf: Invalid surrogate} Uesc {
    expr {"\UD842" eq "\uD842"}
} 1
test utf-1.13.1 {Tcl_UniCharToUtf: Invalid surrogate} {fullutf testbytestring} {
    expr {"\UD842" eq [testbytestring \xED\xA1\x82]}
} 1
test utf-1.14 {Tcl_UniCharToUtf: surrogate pairs from concat} {
    set lo \uDE02
    return \uD83D$lo
} \uD83D\uDE02
test utf-1.15 {Tcl_UniCharToUtf: surrogate pairs from concat} {
    set hi \uD83D
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
} ᲐᲐ
test utf-11.6 {Tcl_UtfToUpper beyond U+FFFF} fullutf {
    string toupper 𐐨
} 𐐀
test utf-11.7 {Tcl_UtfToUpper beyond U+FFFF} fullutf {
    string toupper 𐐨
} 𐐀
test utf-11.8 {Tcl_UtfToUpper low/high surrogate)} {
    string toupper \uDC24\uD824
} \uDC24\uD824

test utf-12.1 {Tcl_UtfToLower} {
    string tolower {}
} {}
test utf-12.2 {Tcl_UtfToLower} {
    string tolower ABC
} abc
test utf-12.3 {Tcl_UtfToLower} {
    string tolower ÃGH
} ãgh
test utf-12.4 {Tcl_UtfToLower} {
    string tolower ǢGH
} ǣgh
test utf-12.5 {Tcl_UtfToLower Georgian (new in Unicode 11)} {
    string tolower აᲐ
} აა
test utf-12.6 {Tcl_UtfToLower low/high surrogate)} {
    string tolower \uDC24\uD824
} \uDC24\uD824
test utf-12.7 {Tcl_UtfToLower beyond U+FFFF} fullutf {
    string tolower 𐐀
} 𐐨
test utf-12.8 {Tcl_UtfToLower beyond U+FFFF} fullutf {
    string tolower 𐐀







|


















|







1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
} ᲐᲐ
test utf-11.6 {Tcl_UtfToUpper beyond U+FFFF} fullutf {
    string toupper 𐐨
} 𐐀
test utf-11.7 {Tcl_UtfToUpper beyond U+FFFF} fullutf {
    string toupper 𐐨
} 𐐀
test utf-11.8 {Tcl_UtfToUpper low/high surrogate)} utf32 {
    string toupper \uDC24\uD824
} \uDC24\uD824

test utf-12.1 {Tcl_UtfToLower} {
    string tolower {}
} {}
test utf-12.2 {Tcl_UtfToLower} {
    string tolower ABC
} abc
test utf-12.3 {Tcl_UtfToLower} {
    string tolower ÃGH
} ãgh
test utf-12.4 {Tcl_UtfToLower} {
    string tolower ǢGH
} ǣgh
test utf-12.5 {Tcl_UtfToLower Georgian (new in Unicode 11)} {
    string tolower აᲐ
} აა
test utf-12.6 {Tcl_UtfToLower low/high surrogate)} utf32 {
    string tolower \uDC24\uD824
} \uDC24\uD824
test utf-12.7 {Tcl_UtfToLower beyond U+FFFF} fullutf {
    string tolower 𐐀
} 𐐨
test utf-12.8 {Tcl_UtfToLower beyond U+FFFF} fullutf {
    string tolower 𐐀
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
} Dzab
test utf-13.5 {Tcl_UtfToTitle Georgian (new in Unicode 11)} {
    string totitle აᲐ
} აᲐ
test utf-13.6 {Tcl_UtfToTitle Georgian (new in Unicode 11)} {
    string totitle Აა
} Აა
test utf-13.7 {Tcl_UtfToTitle low/high surrogate)} {
    string totitle \uDC24\uD824
} \uDC24\uD824
test utf-13.8 {Tcl_UtfToTitle beyond U+FFFF} fullutf {
    string totitle 𐐨𐐀
} 𐐀𐐨
test utf-13.9 {Tcl_UtfToTitle beyond U+FFFF} fullutf {
    string totitle 𐐨𐐀







|







1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
} Dzab
test utf-13.5 {Tcl_UtfToTitle Georgian (new in Unicode 11)} {
    string totitle აᲐ
} აᲐ
test utf-13.6 {Tcl_UtfToTitle Georgian (new in Unicode 11)} {
    string totitle Აა
} Აა
test utf-13.7 {Tcl_UtfToTitle low/high surrogate)} utf32 {
    string totitle \uDC24\uD824
} \uDC24\uD824
test utf-13.8 {Tcl_UtfToTitle beyond U+FFFF} fullutf {
    string totitle 𐐨𐐀
} 𐐀𐐨
test utf-13.9 {Tcl_UtfToTitle beyond U+FFFF} fullutf {
    string totitle 𐐨𐐀