Tcl Source Code

Changes On Branch tip-671
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Changes In Branch tip-671 Excluding Merge-Ins

This is equivalent to a diff from 6bdb0864f1 to 0e7c7d2154

2023-09-15
06:53
3 places where TCLFSENCODING is not appropricate (use system-encoding, not utf-8, on Windows) check-in: 8f8ceca803 user: jan.nijtmans tags: tip-657
2023-09-12
14:23
Rebase to tip-657 Leaf check-in: 0e7c7d2154 user: jan.nijtmans tags: tip-671
13:59
Rebase to 9.0 check-in: 6bdb0864f1 user: jan.nijtmans tags: tip-657
12:26
Merge-mark check-in: 5d5ce90d0c user: jan.nijtmans tags: trunk, main
2023-08-30
13:58
Rebase to tip-657 check-in: cb4d02077f user: jan.nijtmans tags: tip-671
13:45
Rebase to 9.0 check-in: 1c4d579fdf user: jan.nijtmans tags: tip-657

Changes to generic/tcl.h.

2034
2035
2036
2037
2038
2039
2040

2041
2042
2043
2044
2045
2046
2047
 * Reserve top byte for profile values (disjoint, not a mask). In case of
 * changes, ensure ENCODING_PROFILE_* macros in tclInt.h are modified if
 * necessary.
 */
#define TCL_ENCODING_PROFILE_STRICT   TCL_ENCODING_STOPONERROR
#define TCL_ENCODING_PROFILE_TCL8     0x01000000
#define TCL_ENCODING_PROFILE_REPLACE  0x02000000


/*
 * The following definitions are the error codes returned by the conversion
 * routines:
 *
 * TCL_OK -			All characters were converted.
 * TCL_CONVERT_NOSPACE -	The output buffer would not have been large







>







2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
 * Reserve top byte for profile values (disjoint, not a mask). In case of
 * changes, ensure ENCODING_PROFILE_* macros in tclInt.h are modified if
 * necessary.
 */
#define TCL_ENCODING_PROFILE_STRICT   TCL_ENCODING_STOPONERROR
#define TCL_ENCODING_PROFILE_TCL8     0x01000000
#define TCL_ENCODING_PROFILE_REPLACE  0x02000000
#define TCL_ENCODING_PROFILE_LOSSLESS 0x03000000

/*
 * The following definitions are the error codes returned by the conversion
 * routines:
 *
 * TCL_OK -			All characters were converted.
 * TCL_CONVERT_NOSPACE -	The output buffer would not have been large

Changes to generic/tclEncoding.c.

79
80
81
82
83
84
85


86
87
88
89
90
91
92
				/* Two dimensional sparse matrix to map
				 * characters from Unicode to the encoding.
				 * Each element of the fromUnicode array
				 * points to an array of 256 shorts. If there
				 * is no corresponding character the encoding,
				 * the value in the matrix is 0x0000.
				 * malloc'd. */


} TableEncodingData;

/*
 * Each of the following structures is the clientData for a dynamically-loaded
 * escape-driven encoding that is itself comprised of other simpler encodings.
 * An example is "iso-2022-jp", which uses escape sequences to switch between
 * ascii, jis0208, jis0212, gb2312, and ksc5601. Note that "escape-driven"







>
>







79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
				/* Two dimensional sparse matrix to map
				 * characters from Unicode to the encoding.
				 * Each element of the fromUnicode array
				 * points to an array of 256 shorts. If there
				 * is no corresponding character the encoding,
				 * the value in the matrix is 0x0000.
				 * malloc'd. */
    int flags;			/* Miscellaneous flags */
#define ENCODING_ASCII_COMPATIBLE 0x1
} TableEncodingData;

/*
 * Each of the following structures is the clientData for a dynamically-loaded
 * escape-driven encoding that is itself comprised of other simpler encodings.
 * An example is "iso-2022-jp", which uses escape sequences to switch between
 * ascii, jis0208, jis0212, gb2312, and ksc5601. Note that "escape-driven"
192
193
194
195
196
197
198

199
200
201
202
203
204
205
206
207
208



209
210
211
212
213
214
215
216
217
 * Names of encoding profiles and corresponding integer values.
 * Keep alphabetical order for error messages.
 */
static struct TclEncodingProfiles {
    const char *name;
    int value;
} encodingProfiles[] = {

    {"replace", TCL_ENCODING_PROFILE_REPLACE},
    {"strict", TCL_ENCODING_PROFILE_STRICT},
    {"tcl8", TCL_ENCODING_PROFILE_TCL8},
};
#define PROFILE_TCL8(flags_)                                           \
    (ENCODING_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_TCL8)

#define PROFILE_REPLACE(flags_)                                        \
    (ENCODING_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_REPLACE)




#define PROFILE_STRICT(flags_)                                         \
    (!PROFILE_TCL8(flags_) && !PROFILE_REPLACE(flags_))

#define UNICODE_REPLACE_CHAR ((Tcl_UniChar)0xFFFD)
#define SURROGATE(c_)      (((c_) & ~0x7FF) == 0xD800)
#define HIGH_SURROGATE(c_) (((c_) & ~0x3FF) == 0xD800)
#define LOW_SURROGATE(c_)  (((c_) & ~0x3FF) == 0xDC00)

/*







>










>
>
>

|







194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
 * Names of encoding profiles and corresponding integer values.
 * Keep alphabetical order for error messages.
 */
static struct TclEncodingProfiles {
    const char *name;
    int value;
} encodingProfiles[] = {
    {"lossless", TCL_ENCODING_PROFILE_LOSSLESS},
    {"replace", TCL_ENCODING_PROFILE_REPLACE},
    {"strict", TCL_ENCODING_PROFILE_STRICT},
    {"tcl8", TCL_ENCODING_PROFILE_TCL8},
};
#define PROFILE_TCL8(flags_)                                           \
    (ENCODING_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_TCL8)

#define PROFILE_REPLACE(flags_)                                        \
    (ENCODING_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_REPLACE)

#define PROFILE_LOSSLESS(flags_)                                       \
    (ENCODING_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_LOSSLESS)

#define PROFILE_STRICT(flags_)                                         \
    (!PROFILE_TCL8(flags_) && !PROFILE_REPLACE(flags_) && !PROFILE_LOSSLESS(flags_))

#define UNICODE_REPLACE_CHAR ((Tcl_UniChar)0xFFFD)
#define SURROGATE(c_)      (((c_) & ~0x7FF) == 0xD800)
#define HIGH_SURROGATE(c_) (((c_) & ~0x3FF) == 0xD800)
#define LOW_SURROGATE(c_)  (((c_) & ~0x3FF) == 0xDC00)

/*
253
254
255
256
257
258
259














260
261
262
263
264
265
266
static Tcl_EncodingConvertProc	Utf16ToUtfProc;
static Tcl_EncodingConvertProc	UtfToUtf16Proc;
static Tcl_EncodingConvertProc	UtfToUcs2Proc;
static Tcl_EncodingConvertProc	UtfToUtfProc;
static Tcl_EncodingConvertProc	Iso88591FromUtfProc;
static Tcl_EncodingConvertProc	Iso88591ToUtfProc;
















/*
 * A Tcl_ObjType for holding a cached Tcl_Encoding in the twoPtrValue.ptr1 field
 * of the internalrep. This should help the lifetime of encodings be more useful.
 * See concerns raised in [Bug 1077262].
 */








>
>
>
>
>
>
>
>
>
>
>
>
>
>







259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
static Tcl_EncodingConvertProc	Utf16ToUtfProc;
static Tcl_EncodingConvertProc	UtfToUtf16Proc;
static Tcl_EncodingConvertProc	UtfToUcs2Proc;
static Tcl_EncodingConvertProc	UtfToUtfProc;
static Tcl_EncodingConvertProc	Iso88591FromUtfProc;
static Tcl_EncodingConvertProc	Iso88591ToUtfProc;

/* Return 1/0 if unich is a lossless wrapper */
static inline int IsLosslessWrapper(Tcl_UniChar unich) {
    return (unich >= 0xDC00 && unich <= 0xDCFF);
}
/* Convert a byte to internal lossless representation */
static inline Tcl_UniChar ToLossless(char ch) {
    /* Only encode if non-ASCII for security reasons. See TIP */
    return 0x80 & ch ? 0xDC00 + UCHAR(ch) : UNICODE_REPLACE_CHAR;
}
/* Convert an internal lossless representation to raw byte */
static inline unsigned char FromLossless(Tcl_UniChar unich) {
    assert(IsLosslessWrapper(unich));
    return (unsigned char)(unich - 0xDC00);
}

/*
 * A Tcl_ObjType for holding a cached Tcl_Encoding in the twoPtrValue.ptr1 field
 * of the internalrep. This should help the lifetime of encodings be more useful.
 * See concerns raised in [Bug 1077262].
 */

285
286
287
288
289
290
291




































292
293
294
295
296
297
298
    do {								\
	const Tcl_ObjInternalRep *irPtr;					\
	irPtr = TclFetchInternalRep ((objPtr), &encodingType);		\
	(encoding) = irPtr ? (Tcl_Encoding)irPtr->twoPtrValue.ptr1 : NULL;		\
    } while (0)






































/*
 *----------------------------------------------------------------------
 *
 * Tcl_GetEncodingFromObj --
 *
 *	Writes to (*encodingPtr) the Tcl_Encoding value of (*objPtr), if
 *	possible, and returns TCL_OK. If no such encoding exists, TCL_ERROR is







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
    do {								\
	const Tcl_ObjInternalRep *irPtr;					\
	irPtr = TclFetchInternalRep ((objPtr), &encodingType);		\
	(encoding) = irPtr ? (Tcl_Encoding)irPtr->twoPtrValue.ptr1 : NULL;		\
    } while (0)


/*
 *------------------------------------------------------------------------
 *
 * ToLosslessUtf8 --
 *
 *    Converts an entire string of bytes to their lossless utf-8 representation.
 *    Caller has to ensure the entire string is to be treated as invalid encoding.
 *
 * Results:
 *    Number of bytes in converted utf-8 output or a negative value if
 *    insufficient space.
 *
 * Side effects:
 *    The dst buffer is filled with the utf-8 lossless representation.
 *
 *------------------------------------------------------------------------
 */
static Tcl_Size
ToLosslessUtf8(
	const char *src, /* Source bytes */
	Tcl_Size srcLen, /* Number of source bytes */
	char *dst,       /* Destination buffer */
	Tcl_Size dstLen) /* Size of destination buffer */
{
    if ((dstLen / 3) < srcLen) {
	return -1; /* No space */
    }
    const char *srcEnd = src + srcLen;
    char *dstStart = dst;
    while (src < srcEnd) {
	dst += Tcl_UniCharToUtf(ToLossless(UCHAR(*src)), dst);
	++src;
    }
    return (dst - dstStart);
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_GetEncodingFromObj --
 *
 *	Writes to (*encodingPtr) the Tcl_Encoding value of (*objPtr), if
 *	possible, and returns TCL_OK. If no such encoding exists, TCL_ERROR is
2104
2105
2106
2107
2108
2109
2110

2111
2112

2113
2114
2115
















2116
2117
2118
2119
2120
2121
2122
	    pageMemPtr++;
	    p += 4;
	}
    }
    TclDecrRefCount(objPtr);

    if (type == ENCODING_DOUBLEBYTE) {

	memset(dataPtr->prefixBytes, 1, sizeof(dataPtr->prefixBytes));
    } else {

	for (hi = 1; hi < 256; hi++) {
	    if (dataPtr->toUnicode[hi] != NULL) {
		dataPtr->prefixBytes[hi] = 1;
















	    }
	}
    }

    /*
     * Invert the toUnicode array to produce the fromUnicode array. Performs a
     * single malloc to get the memory for the array and all the pages needed







>


>



>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
	    pageMemPtr++;
	    p += 4;
	}
    }
    TclDecrRefCount(objPtr);

    if (type == ENCODING_DOUBLEBYTE) {
	/* DBCS never ascii compatible so no need to set dataPtr->flags */
	memset(dataPtr->prefixBytes, 1, sizeof(dataPtr->prefixBytes));
    } else {
	int asciiCompatible = 1;
	for (hi = 1; hi < 256; hi++) {
	    if (dataPtr->toUnicode[hi] != NULL) {
		dataPtr->prefixBytes[hi] = 1;
		if (hi < 128) {
		    /* any byte < 128 is a prefix => not ASCII compatible */
		    asciiCompatible = 0;
		}
	    }
	}
	if (asciiCompatible) {
	    for (lo = 1; lo < 128; ++lo) {
		if (dataPtr->toUnicode[0][lo] != lo) {
		    /* any byte < 128 does not map to itself => not ASCII compatible */
		    asciiCompatible = 0;
		    break;
		}
	    }
	    if (asciiCompatible) {
		dataPtr->flags |= ENCODING_ASCII_COMPATIBLE;
	    }
	}
    }

    /*
     * Invert the toUnicode array to produce the fromUnicode array. Performs a
     * single malloc to get the memory for the array and all the pages needed
2552
2553
2554
2555
2556
2557
2558
2559











2560
2561
2562


2563
2564
2565
2566
2567



2568
2569
2570



2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583

2584
2585
2586
2587

2588

2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599

2600
2601
2602
2603
2604

2605


2606


2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621






2622





2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640

2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651

2652


2653
2654
2655
2656
2657
2658
2659
	    /*
	     * Copy 7bit characters, but skip null-bytes when we are in input
	     * mode, so that they get converted to \xC0\x80.
	     */
	    *dst++ = *src++;
	} else if ((UCHAR(*src) == 0xC0) && (src + 1 < srcEnd) &&
		 (UCHAR(src[1]) == 0x80) &&
		 (!(flags & ENCODING_INPUT) || PROFILE_STRICT(profile) ||











		  PROFILE_REPLACE(profile))) {
	    /* Special sequence \xC0\x80 */
	    if ((PROFILE_STRICT(profile) || PROFILE_REPLACE(profile)) && (flags & ENCODING_INPUT)) {


		if (PROFILE_REPLACE(profile)) {
		   dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
		   src += 2;
		} else {
		   /* PROFILE_STRICT */



		   result = TCL_CONVERT_SYNTAX;
		   break;
		}



	    } else {
		/*
		 * For output convert 0xC080 to a real null.
		 */
		*dst++ = 0;
		src += 2;
	    }

	} else if (!Tcl_UtfCharComplete(src, srcEnd - src)) {
	    /*
	     * Incomplete byte sequence.
		 * Always check before using Tcl_UtfToUniChar. Not doing so can cause it
		 * run beyond the end of the buffer! If we happen on such an incomplete

		 * char its bytes are made to represent themselves unless the user has
		 * explicitly asked to be told.
	     */


	    if (flags & ENCODING_INPUT) {

		/* Incomplete bytes for modified UTF-8 target */
		if (PROFILE_STRICT(profile)) {
		    result = (flags & TCL_ENCODING_CHAR_LIMIT)
			       ? TCL_CONVERT_MULTIBYTE
			       : TCL_CONVERT_SYNTAX;
		    break;
		}
	    }
	    if (PROFILE_REPLACE(profile)) {
		ch = UNICODE_REPLACE_CHAR;
		++src;

	    } else {
		/* TCL_ENCODING_PROFILE_TCL8 */
		char chbuf[2];
		chbuf[0] = UCHAR(*src++); chbuf[1] = 0;
		Tcl_UtfToUniChar(chbuf, &ch);

	    }


	    dst += Tcl_UniCharToUtf(ch, dst);


	} else {
	    int isInvalid = 0;
	    size_t len = Tcl_UtfToUniChar(src, &ch);
	    if (flags & ENCODING_INPUT) {
		if ((len < 2) && (ch != 0)) {
		    isInvalid = 1;
		} else if ((ch > 0xFFFF) && !(flags & ENCODING_UTF)) {
		    isInvalid = 1;
		}
		if (isInvalid) {
		    if (PROFILE_STRICT(profile)) {
			result = TCL_CONVERT_SYNTAX;
			break;
		    } else if (PROFILE_REPLACE(profile)) {
			ch = UNICODE_REPLACE_CHAR;






		    }





		}
	    }

	    const char *saveSrc = src;
	    src += len;
	    if (!(flags & ENCODING_UTF) && !(flags & ENCODING_INPUT) && (ch > 0x3FF)) {
		if (ch > 0xFFFF) {
		    /* CESU-8 6-byte sequence for chars > U+FFFF */
		    ch -= 0x10000;
		    *dst++ = 0xED;
		    *dst++ = (char) (((ch >> 16) & 0x0F) | 0xA0);
		    *dst++ = (char) (((ch >> 10) & 0x3F) | 0x80);
		    ch = (ch & 0x0CFF) | 0xDC00;
		}
		*dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF);
		*dst++ = (char) (((ch >> 6) | 0x80) & 0xBF);
		*dst++ = (char) ((ch | 0x80) & 0xBF);
		continue;

	    } else if (PROFILE_STRICT(profile) &&
		       (!(flags & ENCODING_INPUT)) &&
		       SURROGATE(ch)) {
		result = TCL_CONVERT_UNKNOWN;
		src = saveSrc;
		break;
	    } else if (PROFILE_STRICT(profile) &&
		       (flags & ENCODING_INPUT) &&
		       SURROGATE(ch)) {
		result = TCL_CONVERT_SYNTAX;
		src = saveSrc;

		break;


	    }
	    dst += Tcl_UniCharToUtf(ch, dst);
	}
    }

    *srcReadPtr = src - srcStart;
    *dstWrotePtr = dst - dstStart;







|
>
>
>
>
>
>
>
>
>
>
>
|
<
|
>
>
|



|
>
>
>
|
|
|
>
>
>

<
|
<



<


|
|
|
>
|
|

|
>

>


|
|
|
|





>
|
<



>
|
>
>
|
>
>















>
>
>
>
>
>
|
>
>
>
>
>


















>
|
|
<
|
|
|
|
<
|
<
<
>
|
>
>







2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645

2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663

2664

2665
2666
2667

2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693

2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751

2752
2753
2754
2755

2756


2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
	    /*
	     * Copy 7bit characters, but skip null-bytes when we are in input
	     * mode, so that they get converted to \xC0\x80.
	     */
	    *dst++ = *src++;
	} else if ((UCHAR(*src) == 0xC0) && (src + 1 < srcEnd) &&
		 (UCHAR(src[1]) == 0x80) &&
		 (!(flags & ENCODING_INPUT) || ! PROFILE_TCL8(profile))) {
	    /*
	     * \xC0\x80 is handled specially for either of the following:
	     *   1. We are doing (internal) modified utf-8 to (external)
	     *   conformant utf-8. C080 is valid internal utf-8 so we
	     *   simply output a \0. Note this overrides case 2.
	     *   2. The profile in use is not TCL8, in which case we have to
	     *   to take a profile-dependent action.
	     * Note the remaining case of external->internal with a TCL8
	     * profile is handled in the default if clause later. (TODO - why not here?)
	     */
	    if (flags & ENCODING_INPUT) {
		assert(!PROFILE_TCL8(profile));

		if (PROFILE_STRICT(profile)) {
		   result = TCL_CONVERT_SYNTAX;
		   break;
		} else if (PROFILE_REPLACE(profile)) {
		   dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
		   src += 2;
		} else {
		   assert(PROFILE_LOSSLESS(profile));
		   Tcl_Size len =
		       ToLosslessUtf8(src, 2, dst, dstLen - (dst - dstStart));
		   if (len < 0) {
		       result = TCL_CONVERT_NOSPACE;
		       break;
		   }
		   dst += len;
		   src += 2;
		}
	    } else {

		/* For output convert 0xC080 to a real null. */

		*dst++ = 0;
		src += 2;
	    }

	} else if (!Tcl_UtfCharComplete(src, srcEnd - src)) {
	    /*
	     * Incomplete byte sequence. We need to do this check before the
	     * Tcl_UtfToUniChar checks in the next sibling if clause. Not doing
	     * so can cause it run beyond the end of the buffer! If we
	     * happen on such an incomplete char its bytes are made to
	     * represent themselves unless the user has explicitly asked to
	     * be told.
	     */
	    assert(flags & TCL_ENCODING_END); /* Else break earlier would
	    					 trigger (srcClose compare) */
	    if (flags & ENCODING_INPUT) {
		/* TODO - why is this inside a ENCODING_INPUT check? */
		/* Incomplete bytes for modified UTF-8 target */
		if (PROFILE_STRICT(profile)) {
		   result = (flags & TCL_ENCODING_CHAR_LIMIT)
			      ? TCL_CONVERT_MULTIBYTE
			      : TCL_CONVERT_SYNTAX;
		   break;
		}
	    }
	    if (PROFILE_REPLACE(profile)) {
		ch = UNICODE_REPLACE_CHAR;
		++src;
		dst += Tcl_UniCharToUtf(ch, dst);
	    } else if (PROFILE_TCL8(profile)) {

		char chbuf[2];
		chbuf[0] = UCHAR(*src++); chbuf[1] = 0;
		Tcl_UtfToUniChar(chbuf, &ch);
		dst += Tcl_UniCharToUtf(ch, dst);
	    } else {
		assert(PROFILE_LOSSLESS(profile));
                ch = ToLossless(UCHAR(*src));
		dst += Tcl_UniCharToUtf(ch, dst);
		src += 1;
	    }
	} else {
	    int isInvalid = 0;
	    size_t len = Tcl_UtfToUniChar(src, &ch);
	    if (flags & ENCODING_INPUT) {
		if ((len < 2) && (ch != 0)) {
		    isInvalid = 1;
		} else if ((ch > 0xFFFF) && !(flags & ENCODING_UTF)) {
		    isInvalid = 1;
		}
		if (isInvalid) {
		    if (PROFILE_STRICT(profile)) {
			result = TCL_CONVERT_SYNTAX;
			break;
		    } else if (PROFILE_REPLACE(profile)) {
			ch = UNICODE_REPLACE_CHAR;
		    } else if (PROFILE_LOSSLESS(profile)) {
			Tcl_Size n = ToLosslessUtf8(
			    src, len, dst, dstLen - (dst - dstStart));
			if (n < 0) {
			    result = TCL_CONVERT_NOSPACE;
			    break;
			}
			dst += n;
			src += len;
			continue;
		    }
		    /* else PROFILE_TCL8 - treat as normal char below */
		}
	    }

	    const char *saveSrc = src;
	    src += len;
	    if (!(flags & ENCODING_UTF) && !(flags & ENCODING_INPUT) && (ch > 0x3FF)) {
		if (ch > 0xFFFF) {
		    /* CESU-8 6-byte sequence for chars > U+FFFF */
		    ch -= 0x10000;
		    *dst++ = 0xED;
		    *dst++ = (char) (((ch >> 16) & 0x0F) | 0xA0);
		    *dst++ = (char) (((ch >> 10) & 0x3F) | 0x80);
		    ch = (ch & 0x0CFF) | 0xDC00;
		}
		*dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF);
		*dst++ = (char) (((ch >> 6) | 0x80) & 0xBF);
		*dst++ = (char) ((ch | 0x80) & 0xBF);
		continue;
	    } else if (SURROGATE(ch)) {
		if (PROFILE_STRICT(profile)) {
		    result = (flags & ENCODING_INPUT) ? TCL_CONVERT_SYNTAX

						      : TCL_CONVERT_UNKNOWN;
		    src = saveSrc;
		    break;
		} else if (PROFILE_LOSSLESS(profile)) {

		    if (IsLosslessWrapper(ch)) {


			*dst++ = FromLossless(ch); /* Invalid UTF8 by design! */
			continue;
		    }
		}
	    }
	    dst += Tcl_UniCharToUtf(ch, dst);
	}
    }

    *srcReadPtr = src - srcStart;
    *dstWrotePtr = dst - dstStart;
2742
2743
2744
2745
2746
2747
2748
2749

2750
2751
2752
2753
2754
2755
2756
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_SYNTAX;
		break;
	    }
	} else if (PROFILE_STRICT(flags) && SURROGATE(ch)) {
	    result = TCL_CONVERT_SYNTAX;
	    break;
	} else if (PROFILE_REPLACE(flags) && SURROGATE(ch)) {

	    ch = UNICODE_REPLACE_CHAR;
	}

	/*
	 * Special case for 1-byte utf chars for speed. Make sure we work with
	 * unsigned short-size data.
	 */







|
>







2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_SYNTAX;
		break;
	    }
	} else if (PROFILE_STRICT(flags) && SURROGATE(ch)) {
	    result = TCL_CONVERT_SYNTAX;
	    break;
	} else if ((! PROFILE_TCL8(flags)) && SURROGATE(ch)) {
            /* PROFILE_REPLACE | PROFILE_LOSSLESS */
	    ch = UNICODE_REPLACE_CHAR;
	}

	/*
	 * Special case for 1-byte utf chars for speed. Make sure we work with
	 * unsigned short-size data.
	 */
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
    if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) {
	if (dst > dstEnd) {
	    result = TCL_CONVERT_NOSPACE;
	} else {
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_SYNTAX;
	    } else {
		/* PROFILE_REPLACE or PROFILE_TCL8 */
		result = TCL_OK;
		dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
		numChars++;
		src += bytesLeft; /* Go past truncated code unit */
	    }
	}
    }







|







2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
    if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) {
	if (dst > dstEnd) {
	    result = TCL_CONVERT_NOSPACE;
	} else {
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_SYNTAX;
	    } else {
		/* PROFILE_REPLACE | PROFILE_LOSSLESS | PROFILE_TCL8 */
		result = TCL_OK;
		dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
		numChars++;
		src += bytesLeft; /* Go past truncated code unit */
	    }
	}
    }
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
	    break;
	}
	len = Tcl_UtfToUniChar(src, &ch);
	if (SURROGATE(ch)) {
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_UNKNOWN;
		break;
	    }
	    if (PROFILE_REPLACE(flags)) {
		ch = UNICODE_REPLACE_CHAR;
	    }
	}
	src += len;
	if (flags & TCL_ENCODING_LE) {
	    *dst++ = (ch & 0xFF);
	    *dst++ = ((ch >> 8) & 0xFF);







|
|







2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
	    break;
	}
	len = Tcl_UtfToUniChar(src, &ch);
	if (SURROGATE(ch)) {
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_UNKNOWN;
		break;
	    } else if (! PROFILE_TCL8(flags)) {
                /* PROFILE_REPLACE | PROFILE_LOSSLESS */
		ch = UNICODE_REPLACE_CHAR;
	    }
	}
	src += len;
	if (flags & TCL_ENCODING_LE) {
	    *dst++ = (ch & 0xFF);
	    *dst++ = ((ch >> 8) & 0xFF);
2979
2980
2981
2982
2983
2984
2985
2986



2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018


3019

3020


3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037

3038
3039
3040
3041
3042
3043
3044
3045

3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
	if (HIGH_SURROGATE(prev) && !LOW_SURROGATE(ch)) {
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_SYNTAX;
		src -= 2; /* Go back to beginning of high surrogate */
		dst--; /* Also undo writing a single byte too much */
		numChars--;
		break;
	    } else if (PROFILE_REPLACE(flags)) {



		/*
		 * Previous loop wrote a single byte to mark the high surrogate.
		 * Replace it with the replacement character. Further, restart
		 * current loop iteration since need to recheck destination space
		 * and reset processing of current character.
		 */
		ch = UNICODE_REPLACE_CHAR;
		dst--;
		dst += Tcl_UniCharToUtf(ch, dst);
		src -= 2;
		numChars--;
		continue;
	    } else {
	    /* Bug [10c2c17c32]. If Hi surrogate not followed by Lo surrogate, finish 3-byte UTF-8 */
		dst += Tcl_UniCharToUtf(-1, dst);
	    }
	}

	/*
	 * Special case for 1-byte utf chars for speed. Make sure we work with
	 * unsigned short-size data.
	 */

	if ((unsigned)ch - 1 < 0x7F) {
	    *dst++ = (ch & 0xFF);
	} else if (HIGH_SURROGATE(prev) || HIGH_SURROGATE(ch)) {
	    dst += Tcl_UniCharToUtf(ch | TCL_COMBINE, dst);
	} else if (LOW_SURROGATE(ch) && !PROFILE_TCL8(flags)) {
	    /* Lo surrogate not preceded by Hi surrogate and not tcl8 profile */
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_UNKNOWN;
		break;


	    } else {

		/* PROFILE_REPLACE */


		dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
	    }
	} else {
	    dst += Tcl_UniCharToUtf(ch, dst);
	}
    }

    if (HIGH_SURROGATE(ch)) {
	if (PROFILE_STRICT(flags)) {
	    result = TCL_CONVERT_SYNTAX;
	    src -= 2;
	    dst--;
	    numChars--;
	} else if (PROFILE_REPLACE(flags)) {
	    dst--;
	    dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
	} else {

	    /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */
	    dst += Tcl_UniCharToUtf(-1, dst);
	}
    }

    /*
     * If we had a truncated code unit at the end AND this is the last
     * fragment AND profile is not "strict", stick FFFD in its place.

     */
    if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) {
	if (dst > dstEnd) {
	    result = TCL_CONVERT_NOSPACE;
	} else {
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_SYNTAX;
	    } else {
		/* PROFILE_REPLACE or PROFILE_TCL8 */
		result = TCL_OK;
		dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
		numChars++;
		src++; /* Go past truncated code unit */
	    }
	}
    }







|
>
>
>












<
<
<












|




>
>

>
|
>
>

|











|
<
|

>
|
|





|
>








|







3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110



3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148

3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
	if (HIGH_SURROGATE(prev) && !LOW_SURROGATE(ch)) {
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_SYNTAX;
		src -= 2; /* Go back to beginning of high surrogate */
		dst--; /* Also undo writing a single byte too much */
		numChars--;
		break;
	    } else if (PROFILE_TCL8(flags)) {
                /* Bug [10c2c17c32]. Hi surrogate not followed by Lo: finish 3-byte UTF-8 */
		dst += Tcl_UniCharToUtf(-1, dst);
	    } else {
		/*
		 * Previous loop wrote a single byte to mark the high surrogate.
		 * Replace it with the replacement character. Further, restart
		 * current loop iteration since need to recheck destination space
		 * and reset processing of current character.
		 */
		ch = UNICODE_REPLACE_CHAR;
		dst--;
		dst += Tcl_UniCharToUtf(ch, dst);
		src -= 2;
		numChars--;
		continue;



	    }
	}

	/*
	 * Special case for 1-byte utf chars for speed. Make sure we work with
	 * unsigned short-size data.
	 */

	if ((unsigned)ch - 1 < 0x7F) {
	    *dst++ = (ch & 0xFF);
	} else if (HIGH_SURROGATE(prev) || HIGH_SURROGATE(ch)) {
	    dst += Tcl_UniCharToUtf(ch | TCL_COMBINE, dst);
	} else if (LOW_SURROGATE(ch)) {
	    /* Lo surrogate not preceded by Hi surrogate and not tcl8 profile */
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_UNKNOWN;
		break;
	    } else if (PROFILE_TCL8(flags)) {
                dst += Tcl_UniCharToUtf(ch, dst);
	    } else {
                /*
                 * PROFILE_REPLACE | PROFILE_LOSSLESS. LOSSLESS treated like
                 * REPLACE for UTF16 - see TIP 671
                 */
		dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
            }
	} else {
	    dst += Tcl_UniCharToUtf(ch, dst);
	}
    }

    if (HIGH_SURROGATE(ch)) {
	if (PROFILE_STRICT(flags)) {
	    result = TCL_CONVERT_SYNTAX;
	    src -= 2;
	    dst--;
	    numChars--;
	} else if (PROFILE_TCL8(flags)) {

	    dst += Tcl_UniCharToUtf(-1, dst);
	} else {
            /* PROFILE_REPLACE | PROFILE_LOSSLESS */
	    dst--;
	    dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
	}
    }

    /*
     * If we had a truncated code unit at the end AND this is the last
     * fragment AND profile is not "strict", use the appropriate replacement
     * strategy.
     */
    if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) {
	if (dst > dstEnd) {
	    result = TCL_CONVERT_NOSPACE;
	} else {
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_SYNTAX;
	    } else {
		/* PROFILE_REPLACE | PROFILE_TCL8 | PROFILE_LOSSLESS */
		result = TCL_OK;
		dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
		numChars++;
		src++; /* Go past truncated code unit */
	    }
	}
    }
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
	    break;
	}
	len = Tcl_UtfToUniChar(src, &ch);
	if (SURROGATE(ch)) {
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_UNKNOWN;
		break;
	    }
	    if (PROFILE_REPLACE(flags)) {
		ch = UNICODE_REPLACE_CHAR;
	    }
	}
	src += len;
	if (flags & TCL_ENCODING_LE) {
	    if (ch <= 0xFFFF) {
		*dst++ = (ch & 0xFF);







|
|







3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
	    break;
	}
	len = Tcl_UtfToUniChar(src, &ch);
	if (SURROGATE(ch)) {
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_UNKNOWN;
		break;
	    } else if (! PROFILE_TCL8(flags)) {
                /* PROFILE_REPLACE | PROFILE_LOSSLESS */
		ch = UNICODE_REPLACE_CHAR;
	    }
	}
	src += len;
	if (flags & TCL_ENCODING_LE) {
	    if (ch <= 0xFFFF) {
		*dst++ = (ch & 0xFF);
3247
3248
3249
3250
3251
3252
3253

3254
3255
3256




3257
3258
3259
3260
3261
3262
3263
	if (ch > 0xFFFF) {
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_UNKNOWN;
		break;
	    }
	    ch = UNICODE_REPLACE_CHAR;
	}

	if (PROFILE_STRICT(flags) && SURROGATE(ch)) {
	    result = TCL_CONVERT_SYNTAX;
	    break;




	}

	src += len;

	/*
	 * Need to handle this in a way that won't cause misalignment by
	 * casting dst to a Tcl_UniChar. [Bug 1122671]







>
|
|
|
>
>
>
>







3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
	if (ch > 0xFFFF) {
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_UNKNOWN;
		break;
	    }
	    ch = UNICODE_REPLACE_CHAR;
	}
	if (SURROGATE(ch)) {
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_SYNTAX;
		break;
	    } else if (!PROFILE_TCL8(flags)) {
                /* PROFILE_REPLACE | PROFILE_LOSSLESS */
                ch = UNICODE_REPLACE_CHAR;
            }
	}

	src += len;

	/*
	 * Need to handle this in a way that won't cause misalignment by
	 * casting dst to a Tcl_UniChar. [Bug 1122671]
3345
3346
3347
3348
3349
3350
3351

3352
3353
3354
3355
3356
3357
3358
3359
3360






3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380


3381
3382
3383
3384
3385
3386
3387
	    result = TCL_CONVERT_NOSPACE;
	    break;
	}
	byte = *((unsigned char *) src);
	if (prefixBytes[byte]) {
	    if (src >= srcEnd-1) {
		/* Prefix byte but nothing after it */

		if (!(flags & TCL_ENCODING_END)) {
		    /* More data to come */
		    result = TCL_CONVERT_MULTIBYTE;
		    break;
		} else if (PROFILE_STRICT(flags)) {
		    result = TCL_CONVERT_SYNTAX;
		    break;
		} else if (PROFILE_REPLACE(flags)) {
		    ch = UNICODE_REPLACE_CHAR;






		} else {
		    ch = (Tcl_UniChar)byte;
		}
	    } else {
		ch = toUnicode[byte][*((unsigned char *)++src)];
	    }
	} else {
	    ch = pageZero[byte];
	}
	if ((ch == 0) && (byte != 0)) {
	    /* Prefix+suffix pair is invalid */
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_SYNTAX;
		break;
	    }
	    if (prefixBytes[byte]) {
		src--;
	    }
	    if (PROFILE_REPLACE(flags)) {
		ch = UNICODE_REPLACE_CHAR;


	    } else {
		ch = (Tcl_UniChar)byte;
	    }
	}

	/*
	 * Special case for 1-byte Utf chars for speed.







>









>
>
>
>
>
>




















>
>







3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
	    result = TCL_CONVERT_NOSPACE;
	    break;
	}
	byte = *((unsigned char *) src);
	if (prefixBytes[byte]) {
	    if (src >= srcEnd-1) {
		/* Prefix byte but nothing after it */
		/* Truncated sequence ... */
		if (!(flags & TCL_ENCODING_END)) {
		    /* More data to come */
		    result = TCL_CONVERT_MULTIBYTE;
		    break;
		} else if (PROFILE_STRICT(flags)) {
		    result = TCL_CONVERT_SYNTAX;
		    break;
		} else if (PROFILE_REPLACE(flags)) {
		    ch = UNICODE_REPLACE_CHAR;
		} else if (PROFILE_LOSSLESS(flags)) {
		    if (dataPtr->flags & ENCODING_ASCII_COMPATIBLE) {
			ch = ToLossless(byte);
		    } else {
			ch = UNICODE_REPLACE_CHAR;
		    }
		} else {
		    ch = (Tcl_UniChar)byte;
		}
	    } else {
		ch = toUnicode[byte][*((unsigned char *)++src)];
	    }
	} else {
	    ch = pageZero[byte];
	}
	if ((ch == 0) && (byte != 0)) {
	    /* Prefix+suffix pair is invalid */
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_SYNTAX;
		break;
	    }
	    if (prefixBytes[byte]) {
		src--;
	    }
	    if (PROFILE_REPLACE(flags)) {
		ch = UNICODE_REPLACE_CHAR;
	    } else if (PROFILE_LOSSLESS(flags)) {
		ch = ToLossless(byte);
	    } else {
		ch = (Tcl_UniChar)byte;
	    }
	}

	/*
	 * Special case for 1-byte Utf chars for speed.
3444
3445
3446
3447
3448
3449
3450

3451
3452
3453
3454
3455
3456
3457
				 * output buffer. */
{
    const char *srcStart, *srcEnd, *srcClose;
    const char *dstStart, *dstEnd, *prefixBytes;
    Tcl_UniChar ch = 0;
    int result, len, word, numChars;
    TableEncodingData *dataPtr = (TableEncodingData *)clientData;

    const unsigned short *const *fromUnicode;

    result = TCL_OK;

    prefixBytes = dataPtr->prefixBytes;
    fromUnicode = (const unsigned short *const *) dataPtr->fromUnicode;








>







3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
				 * output buffer. */
{
    const char *srcStart, *srcEnd, *srcClose;
    const char *dstStart, *dstEnd, *prefixBytes;
    Tcl_UniChar ch = 0;
    int result, len, word, numChars;
    TableEncodingData *dataPtr = (TableEncodingData *)clientData;
    int asciiCompatible = dataPtr->flags & ENCODING_ASCII_COMPATIBLE;
    const unsigned short *const *fromUnicode;

    result = TCL_OK;

    prefixBytes = dataPtr->prefixBytes;
    fromUnicode = (const unsigned short *const *) dataPtr->fromUnicode;

3480
3481
3482
3483
3484
3485
3486

3487
3488
3489
3490
3491






3492
3493

3494
3495
3496
3497
3498
3499
3500
3501
	/* Unicode chars > +U0FFFF cannot be represented in any table encoding */
	if (ch & 0xFFFF0000) {
	    word = 0;
	} else {
	word = fromUnicode[(ch >> 8)][ch & 0xFF];
	}


	if ((word == 0) && (ch != 0)) {
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_UNKNOWN;
		break;
	    }






	    word = dataPtr->fallback; /* Both profiles REPLACE and TCL8 */
	}

	if (prefixBytes[(word >> 8)] != 0) {
	    if (dst + 1 > dstEnd) {
		result = TCL_CONVERT_NOSPACE;
		break;
	    }
	    dst[0] = (char) (word >> 8);
	    dst[1] = (char) word;
	    dst += 2;







>





>
>
>
>
>
>
|
|
>
|







3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
	/* Unicode chars > +U0FFFF cannot be represented in any table encoding */
	if (ch & 0xFFFF0000) {
	    word = 0;
	} else {
	word = fromUnicode[(ch >> 8)][ch & 0xFF];
	}

	int isWrappedLossless = 0;
	if ((word == 0) && (ch != 0)) {
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_UNKNOWN;
		break;
	    }
	    if (PROFILE_LOSSLESS(flags) && IsLosslessWrapper(ch) &&
		asciiCompatible) {
		word = FromLossless(ch);
		isWrappedLossless = 1;
	    }
	    else {
		word = dataPtr->fallback; /* Both profiles REPLACE and TCL8 */
	    }
	}
	if (prefixBytes[(word >> 8)] != 0 && !isWrappedLossless) {
	    if (dst + 1 > dstEnd) {
		result = TCL_CONVERT_NOSPACE;
		break;
	    }
	    dst[0] = (char) (word >> 8);
	    dst[1] = (char) word;
	    dst += 2;
3564
3565
3566
3567
3568
3569
3570






3571
3572
3573
3574
3575
3576
3577
    }
    srcStart = src;
    srcEnd = src + srcLen;

    dstStart = dst;
    dstEnd = dst + dstLen - TCL_UTF_MAX;







    result = TCL_OK;
    for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
	Tcl_UniChar ch = 0;

	if (dst > dstEnd) {
	    result = TCL_CONVERT_NOSPACE;
	    break;







>
>
>
>
>
>







3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
    }
    srcStart = src;
    srcEnd = src + srcLen;

    dstStart = dst;
    dstEnd = dst + dstLen - TCL_UTF_MAX;

    /*
     * Note with respect to profiles: all byte values are mapped
     * to Unicode characters on input so there is no question of invalid
     * 8859-1 characters.
     */

    result = TCL_OK;
    for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
	Tcl_UniChar ch = 0;

	if (dst > dstEnd) {
	    result = TCL_CONVERT_NOSPACE;
	    break;
3669
3670
3671
3672
3673
3674
3675
3676

3677
3678
3679
3680

3681
3682
3683
3684
3685
3686
3687
	 */

	if (ch > 0xFF) {
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_UNKNOWN;
		break;
	    }
	    /*

	     * Plunge on, using '?' as a fallback character.
	     */

	    ch = (Tcl_UniChar) '?'; /* Profiles TCL8 and REPLACE */

	}

	if (dst > dstEnd) {
	    result = TCL_CONVERT_NOSPACE;
	    break;
	}
	*(dst++) = (char) ch;







<
>
|
<
|
|
>







3813
3814
3815
3816
3817
3818
3819

3820
3821

3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
	 */

	if (ch > 0xFF) {
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_UNKNOWN;
		break;
	    }

	    if (PROFILE_LOSSLESS(flags) && IsLosslessWrapper(ch)) {
		ch = FromLossless(ch);

	    } else {
		ch = (Tcl_UniChar)'?';
	    }
	}

	if (dst > dstEnd) {
	    result = TCL_CONVERT_NOSPACE;
	    break;
	}
	*(dst++) = (char) ch;
3887
3888
3889
3890
3891
3892
3893
3894


3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
	     * We have a split-up or unrecognized escape sequence. If we
	     * checked all the sequences, then it's a syntax error, otherwise
	     * we need more bytes to determine a match.
	     */

	    if ((checked == dataPtr->numSubTables + 2)
		    || (flags & TCL_ENCODING_END)) {
		if (!PROFILE_STRICT(flags)) {


		    unsigned skip = longest > left ? left : longest;
		    /* Unknown escape sequence */
		    dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
		    src += skip;
		    continue;
		}
		result = TCL_CONVERT_SYNTAX;
	    } else {
		result = TCL_CONVERT_MULTIBYTE;
	    }
	    break;
	}

	if (encodingPtr == NULL) {







|
>
>






<







4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046

4047
4048
4049
4050
4051
4052
4053
	     * We have a split-up or unrecognized escape sequence. If we
	     * checked all the sequences, then it's a syntax error, otherwise
	     * we need more bytes to determine a match.
	     */

	    if ((checked == dataPtr->numSubTables + 2)
		    || (flags & TCL_ENCODING_END)) {
		if (PROFILE_STRICT(flags)) {
		    result = TCL_CONVERT_SYNTAX;
		} else {
		    unsigned skip = longest > left ? left : longest;
		    /* Unknown escape sequence */
		    dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
		    src += skip;
		    continue;
		}

	    } else {
		result = TCL_CONVERT_MULTIBYTE;
	    }
	    break;
	}

	if (encodingPtr == NULL) {
4066
4067
4068
4069
4070
4071
4072

4073
4074
4075
4076
4077
4078
4079
4080
	    if (word == 0) {
		state = oldState;
		if (PROFILE_STRICT(flags)) {
		    result = TCL_CONVERT_UNKNOWN;
		    break;
		}
		encodingPtr = GetTableEncoding(dataPtr, state);

		tableDataPtr = (const TableEncodingData *)encodingPtr->clientData;
		word = tableDataPtr->fallback;
	    }

	    tablePrefixBytes = (const char *) tableDataPtr->prefixBytes;
	    tableFromUnicode = (const unsigned short *const *)
		    tableDataPtr->fromUnicode;








>
|







4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
	    if (word == 0) {
		state = oldState;
		if (PROFILE_STRICT(flags)) {
		    result = TCL_CONVERT_UNKNOWN;
		    break;
		}
		encodingPtr = GetTableEncoding(dataPtr, state);
		tableDataPtr =
		    (const TableEncodingData *)encodingPtr->clientData;
		word = tableDataPtr->fallback;
	    }

	    tablePrefixBytes = (const char *) tableDataPtr->prefixBytes;
	    tableFromUnicode = (const unsigned short *const *)
		    tableDataPtr->fromUnicode;

4471
4472
4473
4474
4475
4476
4477
4478
4479




















































































4480
4481
4482
4483
4484
4485
    objPtr = Tcl_NewListObj(n, NULL);
    for (i = 0; i < n; ++i) {
	Tcl_ListObjAppendElement(
	    interp, objPtr, Tcl_NewStringObj(encodingProfiles[i].name, TCL_INDEX_NONE));
    }
    Tcl_SetObjResult(interp, objPtr);
}

/*




















































































 * Local Variables:
 * mode: c
 * c-basic-offset: 4
 * fill-column: 78
 * End:
 */









>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>






4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
    objPtr = Tcl_NewListObj(n, NULL);
    for (i = 0; i < n; ++i) {
	Tcl_ListObjAppendElement(
	    interp, objPtr, Tcl_NewStringObj(encodingProfiles[i].name, TCL_INDEX_NONE));
    }
    Tcl_SetObjResult(interp, objPtr);
}

/*
 *------------------------------------------------------------------------
 *
 * TclSystemToInternalEncoding --
 *
 *    Converts a string encoded in the system encoding to Tcl's internal UTF8
 *    using the lossless profile.
 *
 * Results:
 *    Tcl_OK / TCL_ERROR
 *
 * Side effects:
 *    On success *dsPtr holds the converted string. On error, *dsPtr is
 *    cleared, an error message is stored in interp (if not NULL), and a
 *    POSIX error code stored in errno.
 *
 *------------------------------------------------------------------------
 */
int
TclSystemToInternalEncoding(
    Tcl_Interp *interp, /* For error messages, may be NULL */
    const char *src,    /* String in system encoding */
    Tcl_Size srcLen,    /* Number of bytes passed in */
    Tcl_DString *dsPtr) /* Pointer to uninitialized or cleared Tcl_DString. */
{
    Tcl_Size errorLoc;
    int ret;
    ret = Tcl_ExternalToUtfDStringEx(interp,
				     NULL,
				     src,
				     srcLen,
				     TCL_ENCODING_PROFILE_LOSSLESS,
				     dsPtr,
				     &errorLoc);
    /* On TCL_OK, caller owns *dsPtr. On failure we have to free it. */
    if (ret != TCL_OK) {
	Tcl_DStringFree(dsPtr);
	ret = TCL_ERROR; /* Map TCL_CONVERT_* to TCL_ERROR */
    }
    return ret;
}

/*
 *------------------------------------------------------------------------
 *
 * TclInternalToSystemEncoding --
 *
 *    Converts a string to the system encoding using the lossless profile
 *
 * Results:
 *    Tcl_OK / TCL_ERROR
 *
 * Side effects:
 *    On success *dsPtr holds the converted string. On error, *dsPtr is
 *    cleared, an error message is stored in interp (if not NULL), and a
 *    POSIX error code stored in errno.
 *
 *------------------------------------------------------------------------
 */
int
TclInternalToSystemEncoding(
    Tcl_Interp *interp, /* For error messages, may be NULL */
    const char *src,    /* String in system encoding */
    Tcl_Size srcLen,    /* Number of bytes passed in */
    Tcl_DString *dsPtr) /* Pointer to uninitialized or cleared Tcl_DString. */
{
    Tcl_Size errorLoc;
    int ret;
    ret = Tcl_UtfToExternalDStringEx(interp,
				     NULL,
				     src,
				     srcLen,
				     TCL_ENCODING_PROFILE_LOSSLESS,
				     dsPtr,
				     &errorLoc);
    /* On TCL_OK, caller owns *dsPtr. On failure we have to free it. */
    if (ret != TCL_OK) {
	Tcl_DStringFree(dsPtr);
	ret = TCL_ERROR; /* Map TCL_CONVERT_* to TCL_ERROR */
    }
    return ret;
}


/*
 * Local Variables:
 * mode: c
 * c-basic-offset: 4
 * fill-column: 78
 * End:
 */

Changes to generic/tclEnv.c.

15
16
17
18
19
20
21

22
23


24
25

26
27
28
29
30
31
32

33



34
35





36
37
38
39
40
41
42

#include "tclInt.h"

TCL_DECLARE_MUTEX(envMutex)	/* To serialize access to environ. */

#if defined(_WIN32)
#  define tenviron _wenviron

#  define tenviron2utfdstr(str, dsPtr) (Tcl_DStringInit(dsPtr), \
		(char *)Tcl_Char16ToUtfDString((const unsigned short *)(str), -1, (dsPtr)))


#  define utf2tenvirondstr(str, dsPtr) (Tcl_DStringInit(dsPtr), \
		(const WCHAR *)Tcl_UtfToChar16DString((str), -1, (dsPtr)))

#  define techar WCHAR
#  ifdef USE_PUTENV
#    define putenv(env) _wputenv((const wchar_t *)env)
#  endif
#else
#  define tenviron environ
#  define tenviron2utfdstr(str, dsPtr) \

		Tcl_ExternalToUtfDString(NULL, str, -1, dsPtr)



#  define utf2tenvirondstr(str, dsPtr) \
		Tcl_UtfToExternalDString(NULL, str, -1, dsPtr)





#  define techar char
#endif


/* MODULE_SCOPE */
size_t TclEnvEpoch = 0;	/* Epoch of the tcl environment
				 * (if changed with tcl-env). */







>
|
|
>
>
|
|
>






|
>
|
>
>
>
|
|
>
>
>
>
>







15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55

#include "tclInt.h"

TCL_DECLARE_MUTEX(envMutex)	/* To serialize access to environ. */

#if defined(_WIN32)
#  define tenviron _wenviron
static inline char *tenviron2utfdstr(const WCHAR *str, Tcl_DString *dsPtr) {
    Tcl_DStringInit(dsPtr);
    return Tcl_Char16ToUtfDString(str, -1, dsPtr);
}
static inline WCHAR *utf2tenvirondstr(const char *str, Tcl_DString *dsPtr) {
    Tcl_DStringInit(dsPtr);
    return Tcl_UtfToChar16DString(str, -1, dsPtr);
}
#  define techar WCHAR
#  ifdef USE_PUTENV
#    define putenv(env) _wputenv((const wchar_t *)env)
#  endif
#else
#  define tenviron environ
static inline char *tenviron2utfdstr(const char *str, Tcl_DString *dsPtr) {
    if (TclSystemToInternalEncoding(NULL,str,-1,dsPtr) == TCL_OK) {
        return Tcl_DStringValue(dsPtr);
    }
    return NULL;
}
static inline char *utf2tenvirondstr(const char *str, Tcl_DString *dsPtr) {
    Tcl_DStringInit(dsPtr);
    if (TclInternalToSystemEncoding(NULL,str,-1,dsPtr) == TCL_OK) {
        return Tcl_DStringValue(dsPtr);
    }
    return NULL;
}
#  define techar char
#endif


/* MODULE_SCOPE */
size_t TclEnvEpoch = 0;	/* Epoch of the tcl environment
				 * (if changed with tcl-env). */

Changes to generic/tclInt.h.

3040
3041
3042
3043
3044
3045
3046




3047
3048
3049
3050
3051
3052
3053
MODULE_SCOPE int
TclEncodingProfileNameToId(Tcl_Interp *interp,
			   const char *profileName,
			   int *profilePtr);
MODULE_SCOPE const char *TclEncodingProfileIdToName(Tcl_Interp *interp,
						    int profileId);
MODULE_SCOPE void TclGetEncodingProfiles(Tcl_Interp *interp);





/*
 * TIP #233 (Virtualized Time)
 * Data for the time hooks, if any.
 */

MODULE_SCOPE Tcl_GetTimeProc *tclGetTimeProcPtr;







>
>
>
>







3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
MODULE_SCOPE int
TclEncodingProfileNameToId(Tcl_Interp *interp,
			   const char *profileName,
			   int *profilePtr);
MODULE_SCOPE const char *TclEncodingProfileIdToName(Tcl_Interp *interp,
						    int profileId);
MODULE_SCOPE void TclGetEncodingProfiles(Tcl_Interp *interp);
MODULE_SCOPE int TclInternalToSystemEncoding(Tcl_Interp *interp,
			const char *src, Tcl_Size srcLen, Tcl_DString *dsPtr);
MODULE_SCOPE int TclSystemToInternalEncoding(Tcl_Interp *interp,
			const char *src, Tcl_Size srcLen, Tcl_DString *dsPtr);

/*
 * TIP #233 (Virtualized Time)
 * Data for the time hooks, if any.
 */

MODULE_SCOPE Tcl_GetTimeProc *tclGetTimeProcPtr;

Changes to generic/tclMain.c.

49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
{
    Tcl_DString ds;

#ifdef UNICODE
    Tcl_DStringInit(&ds);
    Tcl_WCharToUtfDString(string, -1, &ds);
#else
    (void)Tcl_ExternalToUtfDString(NULL, (char *)string, -1, &ds);
#endif
    return Tcl_DStringToObj(&ds);
}

/*
 * Declarations for various library functions and variables (don't want to
 * include tclPort.h here, because people might copy this file out of the Tcl







|







49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
{
    Tcl_DString ds;

#ifdef UNICODE
    Tcl_DStringInit(&ds);
    Tcl_WCharToUtfDString(string, -1, &ds);
#else
    (void) TclSystemToInternalEncoding(NULL, string, -1, &ds);
#endif
    return Tcl_DStringToObj(&ds);
}

/*
 * Declarations for various library functions and variables (don't want to
 * include tclPort.h here, because people might copy this file out of the Tcl

Changes to generic/tclTest.c.

2124
2125
2126
2127
2128
2129
2130

2131
2132
2133
2134
2135
2136
2137
	{"end", TCL_ENCODING_END},
	{"stoponerror", TCL_ENCODING_STOPONERROR},
	{"noterminate", TCL_ENCODING_NO_TERMINATE},
	{"charlimit", TCL_ENCODING_CHAR_LIMIT},
	{"profiletcl8", TCL_ENCODING_PROFILE_TCL8},
	{"profilestrict", TCL_ENCODING_PROFILE_STRICT},
	{"profilereplace", TCL_ENCODING_PROFILE_REPLACE},

	{NULL, 0}
    };
    Tcl_Size i;
    Tcl_WideInt wide;

    if (objc < 7 || objc > 10) {
        Tcl_WrongNumArgs(interp,







>







2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
	{"end", TCL_ENCODING_END},
	{"stoponerror", TCL_ENCODING_STOPONERROR},
	{"noterminate", TCL_ENCODING_NO_TERMINATE},
	{"charlimit", TCL_ENCODING_CHAR_LIMIT},
	{"profiletcl8", TCL_ENCODING_PROFILE_TCL8},
	{"profilestrict", TCL_ENCODING_PROFILE_STRICT},
	{"profilereplace", TCL_ENCODING_PROFILE_REPLACE},
	{"profilelossless", TCL_ENCODING_PROFILE_LOSSLESS},
	{NULL, 0}
    };
    Tcl_Size i;
    Tcl_WideInt wide;

    if (objc < 7 || objc > 10) {
        Tcl_WrongNumArgs(interp,

Changes to generic/tclUtil.c.

4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
	     */

	    Tcl_DString native, newValue;

	    Tcl_MutexLock(&pgvPtr->mutex);
	    epoch = ++pgvPtr->epoch;
	    Tcl_UtfToExternalDStringEx(NULL, pgvPtr->encoding, pgvPtr->value,
		pgvPtr->numBytes, TCL_ENCODING_PROFILE_TCL8, &native, NULL);
	    Tcl_ExternalToUtfDStringEx(NULL, current, Tcl_DStringValue(&native),
		Tcl_DStringLength(&native), TCL_ENCODING_PROFILE_TCL8,
		&newValue, NULL);
	    Tcl_DStringFree(&native);
	    Tcl_Free(pgvPtr->value);
	    pgvPtr->value = (char *)Tcl_Alloc(Tcl_DStringLength(&newValue) + 1);
	    memcpy(pgvPtr->value, Tcl_DStringValue(&newValue),
		    Tcl_DStringLength(&newValue) + 1);
	    Tcl_DStringFree(&newValue);







|

|







4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
	     */

	    Tcl_DString native, newValue;

	    Tcl_MutexLock(&pgvPtr->mutex);
	    epoch = ++pgvPtr->epoch;
	    Tcl_UtfToExternalDStringEx(NULL, pgvPtr->encoding, pgvPtr->value,
		pgvPtr->numBytes, TCL_ENCODING_PROFILE_LOSSLESS, &native, NULL);
	    Tcl_ExternalToUtfDStringEx(NULL, current, Tcl_DStringValue(&native),
		Tcl_DStringLength(&native), TCL_ENCODING_PROFILE_LOSSLESS,
		&newValue, NULL);
	    Tcl_DStringFree(&native);
	    Tcl_Free(pgvPtr->value);
	    pgvPtr->value = (char *)Tcl_Alloc(Tcl_DStringLength(&newValue) + 1);
	    memcpy(pgvPtr->value, Tcl_DStringValue(&newValue),
		    Tcl_DStringLength(&newValue) + 1);
	    Tcl_DStringFree(&newValue);

Changes to macosx/tclMacOSXFCmd.c.

639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
    const char *string;
    int result = TCL_OK;
    Tcl_DString ds;
    Tcl_Encoding encoding = Tcl_GetEncoding(NULL, "macRoman");
    Tcl_Size length;

    string = Tcl_GetStringFromObj(objPtr, &length);
    Tcl_UtfToExternalDStringEx(NULL, encoding, string, length, TCL_ENCODING_PROFILE_TCL8, &ds, NULL);

    if (Tcl_DStringLength(&ds) > 4) {
	if (interp) {
	    Tcl_SetObjResult(interp, Tcl_ObjPrintf(
		    "expected Macintosh OS type but got \"%s\": ", string));
	    Tcl_SetErrorCode(interp, "TCL", "VALUE", "MAC_OSTYPE", NULL);
	}







|







639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
    const char *string;
    int result = TCL_OK;
    Tcl_DString ds;
    Tcl_Encoding encoding = Tcl_GetEncoding(NULL, "macRoman");
    Tcl_Size length;

    string = Tcl_GetStringFromObj(objPtr, &length);
    Tcl_UtfToExternalDStringEx(NULL, encoding, string, length, TCL_ENCODING_PROFILE_LOSSLESS, &ds, NULL);

    if (Tcl_DStringLength(&ds) > 4) {
	if (interp) {
	    Tcl_SetObjResult(interp, Tcl_ObjPrintf(
		    "expected Macintosh OS type but got \"%s\": ", string));
	    Tcl_SetErrorCode(interp, "TCL", "VALUE", "MAC_OSTYPE", NULL);
	}

Changes to tests/cmdAH.test.

642
643
644
645
646
647
648
649





































































































650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
            # Failure expected
            set result $prefix
            incr expected_failidx $prefixLen
        }
        testfailindex cmdAH-4.4.14.$printable.middle convertto $enc $prefix$str$suffix $result $expected_failidx $profile
    }
}






































































































test cmdAH-4.4.xx {convertto -profile strict} -constraints {testbytestring knownBug} -body {
    # TODO - what does testbytestring even test? Invalid UTF8 in the Tcl_Obj bytes field
    encoding convertto -profile strict utf-8 A[testbytestring \x80]B
} -returnCodes error -result {unexpected byte sequence starting at index 1: '\x80'}

#
# encoding names 4.5.*
badnumargs cmdAH-4.5.1 {encoding names} {foo}
test cmdAH-4.5.2 {encoding names should include at least utf-8 and iso8859-1 and at least one more} -body {
    set names [encoding names]
    list [expr {"utf-8" in $names}] [expr {"iso8859-1" in $names}] [expr {[llength $names] > 2}]
} -result {1 1 1}

#
# encoding profiles 4.6.*
badnumargs cmdAH-4.6.1 {encoding profiles} {foo}
test cmdAH-4.6.2 {encoding profiles} -body {
    lsort [encoding profiles]
} -result {replace strict tcl8}

#
# file command

test cmdAH-5.1 {Tcl_FileObjCmd} -returnCodes error -body {
    file
} -result {wrong # args: should be "file subcommand ?arg ...?"}








>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

|
















|







642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
            # Failure expected
            set result $prefix
            incr expected_failidx $prefixLen
        }
        testfailindex cmdAH-4.4.14.$printable.middle convertto $enc $prefix$str$suffix $result $expected_failidx $profile
    }
}

proc ascii_compatible {enc} {
    # All bytes under 128 should map to their ascii values
    for {set i 0} {$i < 128} {incr i} {
        set bin [binary format c $i]
        if {[catch {set ch [encoding convertfrom -profile strict $enc $bin]}]} {
            return 0
        }
        if {$ch ne [encoding convertfrom -profile strict ascii $bin]} {
            return 0
        }
    }
    return 1
}

#
# Roundtrip tests for lossless profile
foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes {
    if {"knownBug" in $ctrl} continue
    if {$profile ne "lossless"} continue
    # There are multiple test cases based on location of invalid bytes
    set bytes [binary decode hex $hex]
    set prefix A
    set suffix B
    set prefix_bytes [encoding convertto $enc $prefix]
    set suffix_bytes [encoding convertto $enc $suffix]
    set prefixLen [string length $prefix_bytes]
    if {![ascii_compatible $enc]} {
        # These do not implement lossless behaviors
        test cmdAH-4.4.15.$hex.solo.$enc "Invalid byte under 128 for lossless profile" -body {
            encoding convertfrom -profile lossless $enc $bytes
        } -result \uFFFD* -match glob
        continue
    }
    if {$ctrl eq {} || "solo" in $ctrl} {
        test cmdAH-4.4.15.$hex.solo.$enc "Lossless roundtrip $hex for $enc" -body {
            set decoded [encoding convertfrom -profile lossless $enc $bytes]
            string equal $bytes [encoding convertto -profile lossless $enc $decoded]
        } -result 1
    }
    if {$ctrl eq {} || "lead" in $ctrl} {
        test cmdAH-4.4.15.$hex.lead.$enc "Lossless roundtrip $hex for $enc" -body {
            set decoded [encoding convertfrom -profile lossless $enc $bytes$suffix_bytes]
            string equal $bytes$suffix_bytes [encoding convertto -profile lossless $enc $decoded]
        } -result 1
    }
    if {$ctrl eq {} || "tail" in $ctrl} {
        test cmdAH-4.4.15.$hex.tail.$enc "Lossless roundtrip $hex for $enc" -body {
            set decoded [encoding convertfrom -profile lossless $enc $prefix_bytes$bytes]
            string equal $prefix_bytes$bytes [encoding convertto -profile lossless $enc $decoded]
        } -result 1
    }
    if {$ctrl eq {} || "middle" in $ctrl} {
        test cmdAH-4.4.15.$hex.middle.$enc "Lossless roundtrip $hex for $enc" -body {
            set decoded [encoding convertfrom -profile lossless $enc $prefix_bytes$bytes$suffix_bytes]
            string equal $prefix_bytes$bytes$suffix_bytes [encoding convertto -profile lossless $enc $decoded]
        } -result 1
    }
}

#
# Non-ascii encoding should not output lossless wrappers
foreach enc [encoding names] {
    if {$enc eq "cesu-8"} {
        test cmdAH-4.4.16.$enc "Lossless output for CESU-8" -body {
            encoding convertto -profile lossless $enc \uDC41
        } -result \xED\xB1\x81
    } elseif {[ascii_compatible $enc]} {
        test cmdAH-4.4.16.$enc "Lossless output for ascii-compatible encodings" -body {
            encoding convertto -profile lossless $enc \uDC41
        } -result A
    } else {
        test cmdAH-4.4.16.$enc "Lossless output for ascii-incompatible encodings" -body {
            encoding convertto -profile lossless $enc \uDC41
        } -result [encoding convertto -profile tcl8 $enc \uFFFD]
    }
}

#
# Invalid bytes < 128 should map to FFFD for lossless profile

# Find an invalid byte within a range for the given encoding
proc find_invalid_byte {enc {lo 0} {hi 127}} {
    for {set i $lo} {$i <= $hi} {incr i} {
        set bin [binary format c $i]
        if {[catch {set ch [encoding convertfrom -profile strict $enc $bin]}]} {
            return $bin
        }
    }
    # All bytes under 128 are valid
    return ""
}
foreach enc [encoding names] {
    set byte [find_invalid_byte $enc]
    if {$byte ne ""} {
        test cmdAH-4.4.17.$enc "Invalid byte under 128 for lossless profile" -body {
            encoding convertfrom -profile lossless $enc $byte
        } -result \uFFFD
    }
}


test cmdAH-4.4.xx {convertto -profile strict} -constraints {testbytestring knownBug} -body {
    # TODO - what does testbytestring even test? Invalid UTF8 in the Tcl_Obj bytes field?
    encoding convertto -profile strict utf-8 A[testbytestring \x80]B
} -returnCodes error -result {unexpected byte sequence starting at index 1: '\x80'}

#
# encoding names 4.5.*
badnumargs cmdAH-4.5.1 {encoding names} {foo}
test cmdAH-4.5.2 {encoding names should include at least utf-8 and iso8859-1 and at least one more} -body {
    set names [encoding names]
    list [expr {"utf-8" in $names}] [expr {"iso8859-1" in $names}] [expr {[llength $names] > 2}]
} -result {1 1 1}

#
# encoding profiles 4.6.*
badnumargs cmdAH-4.6.1 {encoding profiles} {foo}
test cmdAH-4.6.2 {encoding profiles} -body {
    lsort [encoding profiles]
} -result {lossless replace strict tcl8}

#
# file command

test cmdAH-5.1 {Tcl_FileObjCmd} -returnCodes error -body {
    file
} -result {wrong # args: should be "file subcommand ?arg ...?"}

Changes to tests/encoding.test.

809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
test encoding-24.15.tcl8 {Parse invalid utf-8, -profile tcl8} -body {
    encoding convertfrom -profile tcl8 utf-8 "Z\xE0\x80"
} -result Z\xE0\u20AC
test encoding-24.16 {Parse valid or invalid utf-8} -constraints testbytestring -body {
    encoding convertto utf-8 [testbytestring "Z\u4343\x80"]
} -returnCodes 1 -result {expected byte sequence but character 1 was '䍃€' (U+004343)}
test encoding-24.17 {Parse valid or invalid utf-8} -constraints testbytestring -body {
    encoding convertto utf-8 [testbytestring "Z\xE0\x80"]
} -result "Z\xC3\xA0\xE2\x82\xAC"
test encoding-24.18 {Parse valid or invalid utf-8} -constraints testbytestring -body {
    encoding convertto utf-8 [testbytestring "Z\xE0\x80xxxxxx"]
} -result "Z\xC3\xA0\xE2\x82\xACxxxxxx"
test encoding-24.19.1 {Parse valid or invalid utf-8} -body {
    encoding convertto -profile tcl8 utf-8 "ZX\uD800"
} -result ZX\xED\xA0\x80







|







809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
test encoding-24.15.tcl8 {Parse invalid utf-8, -profile tcl8} -body {
    encoding convertfrom -profile tcl8 utf-8 "Z\xE0\x80"
} -result Z\xE0\u20AC
test encoding-24.16 {Parse valid or invalid utf-8} -constraints testbytestring -body {
    encoding convertto utf-8 [testbytestring "Z\u4343\x80"]
} -returnCodes 1 -result {expected byte sequence but character 1 was '䍃€' (U+004343)}
test encoding-24.17 {Parse valid or invalid utf-8} -constraints testbytestring -body {
    encoding convertto -profile tcl8 utf-8 [testbytestring "Z\xE0\x80"]
} -result "Z\xC3\xA0\xE2\x82\xAC"
test encoding-24.18 {Parse valid or invalid utf-8} -constraints testbytestring -body {
    encoding convertto utf-8 [testbytestring "Z\xE0\x80xxxxxx"]
} -result "Z\xC3\xA0\xE2\x82\xACxxxxxx"
test encoding-24.19.1 {Parse valid or invalid utf-8} -body {
    encoding convertto -profile tcl8 utf-8 "ZX\uD800"
} -result ZX\xED\xA0\x80
1160
1161
1162
1163
1164
1165
1166








1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178








1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
    encoding convertfrom -profile tcl8 gb12345 x
} -result x
test encoding-bug-66ffafd309-1-strict {Bug [66ffafd309] - truncated DBCS} -body {
    encoding convertfrom -profile strict gb12345 x
} -result {unexpected byte sequence starting at index 0: '\x78'} -returnCodes error
test encoding-bug-66ffafd309-1-replace {Bug [66ffafd309] - truncated DBCS} -body {
    encoding convertfrom -profile replace gb12345 x








} -result \uFFFD
test encoding-bug-66ffafd309-2-tcl8 {Bug [66ffafd309] - invalid DBCS} -body {
    # Not truncated but invalid
    encoding convertfrom -profile tcl8 jis0208 \x78\x79
} -result \x78\x79
test encoding-bug-66ffafd309-2-strict {Bug [66ffafd309] - invalid DBCS} -body {
    # Not truncated but invalid
    encoding convertfrom -profile strict jis0208 \x78\x79
} -result {unexpected byte sequence starting at index 1: '\x79'} -returnCodes error
test encoding-bug-66ffafd309-2-replace {Bug [66ffafd309] - invalid DBCS} -body {
    # Not truncated but invalid
    encoding convertfrom -profile replace jis0208 \x78\x79








} -result \uFFFD\uFFFD

# cleanup
namespace delete ::tcl::test::encoding
::tcltest::cleanupTests
return

# Local Variables:
# mode: tcl
# End:







>
>
>
>
>
>
>
>












>
>
>
>
>
>
>
>










1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
    encoding convertfrom -profile tcl8 gb12345 x
} -result x
test encoding-bug-66ffafd309-1-strict {Bug [66ffafd309] - truncated DBCS} -body {
    encoding convertfrom -profile strict gb12345 x
} -result {unexpected byte sequence starting at index 0: '\x78'} -returnCodes error
test encoding-bug-66ffafd309-1-replace {Bug [66ffafd309] - truncated DBCS} -body {
    encoding convertfrom -profile replace gb12345 x
} -result \uFFFD
test encoding-bug-66ffafd309-1-lossless-a {Bug [66ffafd309] - truncated DBCS} -body {
    # lossless - byte < 128
    encoding convertfrom -profile lossless gb12345 x
} -result \uFFFD
test encoding-bug-66ffafd309-1-lossless-b {Bug [66ffafd309] - truncated DBCS} -body {
    # lossless - byte > 128
    encoding convertfrom -profile lossless gb12345 \x82
} -result \uFFFD
test encoding-bug-66ffafd309-2-tcl8 {Bug [66ffafd309] - invalid DBCS} -body {
    # Not truncated but invalid
    encoding convertfrom -profile tcl8 jis0208 \x78\x79
} -result \x78\x79
test encoding-bug-66ffafd309-2-strict {Bug [66ffafd309] - invalid DBCS} -body {
    # Not truncated but invalid
    encoding convertfrom -profile strict jis0208 \x78\x79
} -result {unexpected byte sequence starting at index 1: '\x79'} -returnCodes error
test encoding-bug-66ffafd309-2-replace {Bug [66ffafd309] - invalid DBCS} -body {
    # Not truncated but invalid
    encoding convertfrom -profile replace jis0208 \x78\x79
} -result \uFFFD\uFFFD
test encoding-bug-66ffafd309-2-lossless-a {Bug [66ffafd309] - invalid DBCS} -body {
    # Not truncated but invalid. \x78 is invalid prefix
    encoding convertfrom -profile lossless jis0208 \x78\x79
} -result \uFFFD\uFFFD
test encoding-bug-66ffafd309-2-lossless-b {Bug [66ffafd309] - invalid DBCS} -body {
    # Not truncated but invalid. \x21 is valid prefix but FF is not valid suffix
    encoding convertfrom -profile lossless jis0208 \x21\xFF
} -result \uFFFD\uFFFD

# cleanup
namespace delete ::tcl::test::encoding
::tcltest::cleanupTests
return

# Local Variables:
# mode: tcl
# End:

Changes to tests/encodingVectors.tcl.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# This file contains test vectors for verifying various encodings. They are
# stored in a common file so that they can be sourced into the various test
# modules that are dependent on encodings. This file contains statically defined
# test vectors. In addition, it sources the ICU-generated test vectors from
# icuUcmTests.tcl.
#
# Note that sourcing the file will reinitialize any existing encoding test
# vectors.
#

# List of defined encoding profiles
set encProfiles {tcl8 strict replace}
set encDefaultProfile strict; # Should reflect the default from implementation

# encValidStrings - Table of valid strings.
#
# Each row is <ENCODING STR BYTES CTRL COMMENT>
# The pair <ENCODING,STR> should be unique for generated test ids to be unique.
# STR is a string that can be encoded in the encoding ENCODING resulting











|







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# This file contains test vectors for verifying various encodings. They are
# stored in a common file so that they can be sourced into the various test
# modules that are dependent on encodings. This file contains statically defined
# test vectors. In addition, it sources the ICU-generated test vectors from
# icuUcmTests.tcl.
#
# Note that sourcing the file will reinitialize any existing encoding test
# vectors.
#

# List of defined encoding profiles
set encProfiles {tcl8 strict replace lossless}
set encDefaultProfile strict; # Should reflect the default from implementation

# encValidStrings - Table of valid strings.
#
# Each row is <ENCODING STR BYTES CTRL COMMENT>
# The pair <ENCODING,STR> should be unique for generated test ids to be unique.
# STR is a string that can be encoded in the encoding ENCODING resulting
107
108
109
110
111
112
113

114
115
116
117
118
119
120
# ascii - Any byte above 127 is invalid and is mapped
# to the same numeric code point except for the range
# 80-9F which is treated as cp1252.
# This tests the TableToUtfProc code path.
lappend encInvalidBytes {*}{
    ascii 80 tcl8    \u20AC -1 {knownBug} {map to cp1252}
    ascii 80 replace \uFFFD -1 {} {Smallest invalid byte}

    ascii 80 strict  {}      0 {} {Smallest invalid byte}
    ascii 81 tcl8    \u0081 -1 {knownBug} {map to cp1252}
    ascii 82 tcl8    \u201A -1 {knownBug} {map to cp1252}
    ascii 83 tcl8    \u0192 -1 {knownBug} {map to cp1252}
    ascii 84 tcl8    \u201E -1 {knownBug} {map to cp1252}
    ascii 85 tcl8    \u2026 -1 {knownBug} {map to cp1252}
    ascii 86 tcl8    \u2020 -1 {knownBug} {map to cp1252}







>







107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# ascii - Any byte above 127 is invalid and is mapped
# to the same numeric code point except for the range
# 80-9F which is treated as cp1252.
# This tests the TableToUtfProc code path.
lappend encInvalidBytes {*}{
    ascii 80 tcl8    \u20AC -1 {knownBug} {map to cp1252}
    ascii 80 replace \uFFFD -1 {} {Smallest invalid byte}
    ascii 80 lossless \uDC80 -1 {} {Smallest invalid byte}
    ascii 80 strict  {}      0 {} {Smallest invalid byte}
    ascii 81 tcl8    \u0081 -1 {knownBug} {map to cp1252}
    ascii 82 tcl8    \u201A -1 {knownBug} {map to cp1252}
    ascii 83 tcl8    \u0192 -1 {knownBug} {map to cp1252}
    ascii 84 tcl8    \u201E -1 {knownBug} {map to cp1252}
    ascii 85 tcl8    \u2026 -1 {knownBug} {map to cp1252}
    ascii 86 tcl8    \u2020 -1 {knownBug} {map to cp1252}
142
143
144
145
146
147
148

149
150
151
152
153
154
155
    ascii 9C tcl8    \u0153 -1 {knownBug} {map to cp1252}
    ascii 9D tcl8    \u009D -1 {knownBug} {map to cp1252}
    ascii 9E tcl8    \u017E -1 {knownBug} {map to cp1252}
    ascii 9F tcl8    \u0178 -1 {knownBug} {map to cp1252}

    ascii FF tcl8    \u00FF -1 {} {Largest invalid byte}
    ascii FF replace \uFFFD -1 {} {Largest invalid byte}

    ascii FF strict  {}      0 {} {Largest invalid byte}
}

# utf-8 - valid sequences based on Table 3.7 in the Unicode
# standard.
#
# Code Points        First   Second  Third   Fourth Byte







>







143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
    ascii 9C tcl8    \u0153 -1 {knownBug} {map to cp1252}
    ascii 9D tcl8    \u009D -1 {knownBug} {map to cp1252}
    ascii 9E tcl8    \u017E -1 {knownBug} {map to cp1252}
    ascii 9F tcl8    \u0178 -1 {knownBug} {map to cp1252}

    ascii FF tcl8    \u00FF -1 {} {Largest invalid byte}
    ascii FF replace \uFFFD -1 {} {Largest invalid byte}
    ascii FF lossless \uDCFF -1 {} {Largest invalid byte}
    ascii FF strict  {}      0 {} {Largest invalid byte}
}

# utf-8 - valid sequences based on Table 3.7 in the Unicode
# standard.
#
# Code Points        First   Second  Third   Fourth Byte
166
167
168
169
170
171
172

173
174
175
176
177
178
179
# Tests below are based on the "gaps" in the above table. Note ascii test
# values are repeated because internally a different code path is used
# (UtfToUtfProc).
# Note C0, C1, F5:FF are invalid bytes ANYWHERE. Exception is C080
lappend encInvalidBytes {*}{
    utf-8 80 tcl8    \u20AC -1 {} {map to cp1252}
    utf-8 80 replace \uFFFD -1 {} {Smallest invalid byte}

    utf-8 80 strict  {}      0 {} {Smallest invalid byte}
    utf-8 81 tcl8    \u0081 -1 {} {map to cp1252}
    utf-8 82 tcl8    \u201A -1 {} {map to cp1252}
    utf-8 83 tcl8    \u0192 -1 {} {map to cp1252}
    utf-8 84 tcl8    \u201E -1 {} {map to cp1252}
    utf-8 85 tcl8    \u2026 -1 {} {map to cp1252}
    utf-8 86 tcl8    \u2020 -1 {} {map to cp1252}







>







168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
# Tests below are based on the "gaps" in the above table. Note ascii test
# values are repeated because internally a different code path is used
# (UtfToUtfProc).
# Note C0, C1, F5:FF are invalid bytes ANYWHERE. Exception is C080
lappend encInvalidBytes {*}{
    utf-8 80 tcl8    \u20AC -1 {} {map to cp1252}
    utf-8 80 replace \uFFFD -1 {} {Smallest invalid byte}
    utf-8 80 lossless \uDC80 -1 {} {Smallest invalid byte}
    utf-8 80 strict  {}      0 {} {Smallest invalid byte}
    utf-8 81 tcl8    \u0081 -1 {} {map to cp1252}
    utf-8 82 tcl8    \u201A -1 {} {map to cp1252}
    utf-8 83 tcl8    \u0192 -1 {} {map to cp1252}
    utf-8 84 tcl8    \u201E -1 {} {map to cp1252}
    utf-8 85 tcl8    \u2026 -1 {} {map to cp1252}
    utf-8 86 tcl8    \u2020 -1 {} {map to cp1252}
202
203
204
205
206
207
208

209
210
211

212
213

214
215
216

217
218
219

220
221
222

223
224
225
226

227
228
229

230
231
232

233
234
235
236

237
238
239

240
241
242

243
244
245

246
247
248

249
250
251
252

253
254
255

256
257
258

259
260
261

262
263
264

265
266
267

268
269
270

271
272
273

274
275
276
277

278
279
280

281
282
283

284
285
286

287
288
289

290
291
292

293
294
295

296
297
298

299
300
301

302
303
304

305
306
307

308
309
310

311
312
313
314
315
316
317

318
319
320

321
322
323

324
325
326

327
328
329

330
331
332

333
334
335

336
337
338

339
340
341

342
343
344

345
346
347

348
349
350

351
352
353
354

355
356
357

358
359
360

361
362
363

364
365
366

367
368
369

370
371
372

373
374
375

376
377
378

379
380
381

382
383
384

385
386
387

388
389
390

391
392
393

394
395
396
397

398
399
400

401
402
403

404
405
406

407
408
409

410
411
412

413
414
415

416
417
418

419
420
421

422
423
424

425
426
427

428
429
430

431
432
433
434

435
436
437

438
439
440

441
442
443

444
445
446

447
448
449

450
451
452

453
454
455

456
457
458

459
460
461

462
463
464

465
466
467

468
469
470

471
472
473

474
475
476

477
478
479

480
481
482

483
484
485

486
487
488

489
490
491

492
493
494

495
496
497

498
499
500
501

502
503
504

505
506
507

508
509
510

511
512
513

514
515
516

517
518
519

520
521
522

523
524
525

526
527
528

529
530
531

532
533
534
535

536
537
538

539
540
541

542

543

544

545
546
547
548
549
550
551
552
553
554


555
556

557
558
559

560
561
562
563
564


565
566

567
568
569

570
571
572
573
574
575
576
577
578
579

580
581
582

583
584
585

586
587
588

589
590
591

592
593
594

595
596

597
598
599

600
601
602
603
604

605
606
607

608
609
610

611
612
613

614
615
616

617
618
619

620
621

622
623
624

625
626
627





628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654

655
    utf-8 9D tcl8    \u009D -1 {} {map to cp1252}
    utf-8 9E tcl8    \u017E -1 {} {map to cp1252}
    utf-8 9F tcl8    \u0178 -1 {} {map to cp1252}

    utf-8 C0 tcl8    \u00C0 -1 {} {C0 is invalid anywhere}
    utf-8 C0 strict  {}      0 {} {C0 is invalid anywhere}
    utf-8 C0 replace \uFFFD -1 {} {C0 is invalid anywhere}

    utf-8 C080 tcl8    \u0000 -1 {} {C080 -> U+0 in Tcl's internal modified UTF8}
    utf-8 C080 strict  {}      0 {} {C080 -> invalid}
    utf-8 C080 replace \uFFFD -1 {} {C080 -> single replacement char}

    utf-8 C0A2 tcl8    \u00C0\u00A2 -1 {} {websec.github.io - A}
    utf-8 C0A2 replace \uFFFD\uFFFD -1 {} {websec.github.io - A}

    utf-8 C0A2 strict  {}            0 {} {websec.github.io - A}
    utf-8 C0A7 tcl8    \u00C0\u00A7 -1 {} {websec.github.io - double quote}
    utf-8 C0A7 replace \uFFFD\uFFFD -1 {} {websec.github.io - double quote}

    utf-8 C0A7 strict  {}            0 {} {websec.github.io - double quote}
    utf-8 C0AE tcl8    \u00C0\u00AE -1 {} {websec.github.io - full stop}
    utf-8 C0AE replace \uFFFD\uFFFD -1 {} {websec.github.io - full stop}

    utf-8 C0AE strict  {}            0 {} {websec.github.io - full stop}
    utf-8 C0AF tcl8    \u00C0\u00AF -1 {} {websec.github.io - solidus}
    utf-8 C0AF replace \uFFFD\uFFFD -1 {} {websec.github.io - solidus}

    utf-8 C0AF strict  {}            0 {} {websec.github.io - solidus}

    utf-8 C1 tcl8    \u00C1 -1 {} {C1 is invalid everywhere}
    utf-8 C1 replace \uFFFD -1 {} {C1 is invalid everywhere}

    utf-8 C1 strict  {}      0 {} {C1 is invalid everywhere}
    utf-8 C181 tcl8    \u00C1\u0081 -1 {} {websec.github.io - base test (A)}
    utf-8 C181 replace \uFFFD\uFFFD -1 {} {websec.github.io - base test (A)}

    utf-8 C181 strict  {}            0 {} {websec.github.io - base test (A)}
    utf-8 C19C tcl8    \u00C1\u0153 -1 {} {websec.github.io - reverse solidus}
    utf-8 C19C replace \uFFFD\uFFFD -1 {} {websec.github.io - reverse solidus}

    utf-8 C19C strict  {}            0 {} {websec.github.io - reverse solidus}

    utf-8 C2 tcl8      \u00C2     -1 {} {Missing trail byte}
    utf-8 C2 replace   \uFFFD     -1 {} {Missing trail byte}

    utf-8 C2 strict    {}          0 {} {Missing trail byte}
    utf-8 C27F tcl8    \u00C2\x7F -1 {} {Trail byte must be 80:BF}
    utf-8 C27F replace \uFFFD\x7F -1 {} {Trail byte must be 80:BF}

    utf-8 C27F strict  {}          0 {} {Trail byte must be 80:BF}
    utf-8 DF tcl8      \u00DF     -1 {} {Missing trail byte}
    utf-8 DF replace   \uFFFD     -1 {} {Missing trail byte}

    utf-8 DF strict    {}          0 {} {Missing trail byte}
    utf-8 DF7F tcl8    \u00DF\x7F -1 {} {Trail byte must be 80:BF}
    utf-8 DF7F replace \uFFFD\x7F -1 {} {Trail byte must be 80:BF}

    utf-8 DF7F strict  {}          0 {} {Trail byte must be 80:BF}
    utf-8 DFE0A080 tcl8    \u00DF\u0800 -1 {} {Invalid trail byte is start of valid sequence}
    utf-8 DFE0A080 replace \uFFFD\u0800 -1 {} {Invalid trail byte is start of valid sequence}

    utf-8 DFE0A080 strict  {}            0 {} {Invalid trail byte is start of valid sequence}

    utf-8 E0 tcl8      \u00E0     -1 {} {Missing trail byte}
    utf-8 E0 replace   \uFFFD     -1 {} {Missing trail byte}

    utf-8 E0 strict    {}          0 {} {Missing trail byte}
    utf-8 E080 tcl8      \u00E0\u20AC   -1 {} {First trail byte must be A0:BF}
    utf-8 E080 replace   \uFFFD\uFFFD   -1 {} {First trail byte must be A0:BF}

    utf-8 E080 strict    {}              0 {} {First trail byte must be A0:BF}
    utf-8 E0819C tcl8    \u00E0\u0081\u0153 -1 {} {websec.github.io - reverse solidus}
    utf-8 E0819C replace \uFFFD\uFFFD\uFFFD -1 {} {websec.github.io - reverse solidus}

    utf-8 E0819C strict  {}                  0 {} {websec.github.io - reverse solidus}
    utf-8 E09F tcl8      \u00E0\u0178   -1 {} {First trail byte must be A0:BF}
    utf-8 E09F replace   \uFFFD\uFFFD   -1 {} {First trail byte must be A0:BF}

    utf-8 E09F strict    {}              0 {} {First trail byte must be A0:BF}
    utf-8 E0A0 tcl8      \u00E0\u00A0   -1 {} {Missing second trail byte}
    utf-8 E0A0 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 E0A0 strict    {}              0 {} {Missing second trail byte}
    utf-8 E0BF tcl8      \u00E0\u00BF   -1 {} {Missing second trail byte}
    utf-8 E0BF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 E0BF strict    {}              0 {} {Missing second trail byte}
    utf-8 E0A07F tcl8    \u00E0\u00A0\x7F   -1 {}     {Second trail byte must be 80:BF}
    utf-8 E0A07F replace \uFFFD\u7F         -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 E0A07F strict  {}                  0 {}         {Second trail byte must be 80:BF}
    utf-8 E0BF7F tcl8    \u00E0\u00BF\x7F   -1 {}         {Second trail byte must be 80:BF}
    utf-8 E0BF7F replace \uFFFD\u7F         -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 E0BF7F strict  {}                  0 {}         {Second trail byte must be 80:BF}

    utf-8 E1 tcl8      \u00E1     -1 {} {Missing trail byte}
    utf-8 E1 replace   \uFFFD     -1 {} {Missing trail byte}

    utf-8 E1 strict    {}          0 {} {Missing trail byte}
    utf-8 E17F tcl8    \u00E1\x7F -1 {} {Trail byte must be 80:BF}
    utf-8 E17F replace \uFFFD\x7F -1 {} {Trail byte must be 80:BF}

    utf-8 E17F strict  {}          0 {} {Trail byte must be 80:BF}
    utf-8 E181 tcl8      \u00E1\u0081   -1 {} {Missing second trail byte}
    utf-8 E181 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 E181 strict    {}              0 {} {Missing second trail byte}
    utf-8 E1BF tcl8      \u00E1\u00BF   -1 {} {Missing second trail byte}
    utf-8 E1BF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 E1BF strict    {}              0 {} {Missing second trail byte}
    utf-8 E1807F tcl8    \u00E1\u20AC\x7F   -1 {} {Second trail byte must be 80:BF}
    utf-8 E1807F replace \uFFFD\u7F         -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 E1807F strict  {}                  0 {}         {Second trail byte must be 80:BF}
    utf-8 E1BF7F tcl8    \u00E1\u00BF\x7F   -1 {}         {Second trail byte must be 80:BF}
    utf-8 E1BF7F replace \uFFFD\u7F         -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 E1BF7F strict  {}                  0 {}         {Second trail byte must be 80:BF}
    utf-8 EC tcl8      \u00EC     -1 {} {Missing trail byte}
    utf-8 EC replace   \uFFFD     -1 {} {Missing trail byte}

    utf-8 EC strict    {}          0 {} {Missing trail byte}
    utf-8 EC7F tcl8    \u00EC\x7F -1 {} {Trail byte must be 80:BF}
    utf-8 EC7F replace \uFFFD\x7F -1 {} {Trail byte must be 80:BF}

    utf-8 EC7F strict  {}          0 {} {Trail byte must be 80:BF}
    utf-8 EC81 tcl8      \u00EC\u0081   -1 {} {Missing second trail byte}
    utf-8 EC81 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 EC81 strict    {}              0 {} {Missing second trail byte}
    utf-8 ECBF tcl8      \u00EC\u00BF   -1 {} {Missing second trail byte}
    utf-8 ECBF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 ECBF strict    {}              0 {} {Missing second trail byte}
    utf-8 EC807F tcl8    \u00EC\u20AC\x7F   -1 {} {Second trail byte must be 80:BF}
    utf-8 EC807F replace \uFFFD\u7F         -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 EC807F strict  {}                  0 {}         {Second trail byte must be 80:BF}
    utf-8 ECBF7F tcl8    \u00EC\u00BF\x7F   -1 {}         {Second trail byte must be 80:BF}
    utf-8 ECBF7F replace \uFFFD\u7F         -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 ECBF7F strict  {}                  0 {}         {Second trail byte must be 80:BF}

    utf-8 ED tcl8       \u00ED        -1 {} {Missing trail byte}
    utf-8 ED replace    \uFFFD        -1 {} {Missing trail byte}
    utf-8 ED strict     {}             0 {} {Missing trail byte}
    utf-8 ED7F tcl8     \u00ED\u7F    -1 {} {First trail byte must be 80:9F}
    utf-8 ED7F replace  \uFFFD\u7F    -1 {} {First trail byte must be 80:9F}

    utf-8 ED7F strict   {}             0 {} {First trail byte must be 80:9F}
    utf-8 EDA0 tcl8     \u00ED\u00A0  -1 {} {First trail byte must be 80:9F}
    utf-8 EDA0 replace  \uFFFD\uFFFD  -1 {} {First trail byte must be 80:9F}

    utf-8 EDA0 strict   {}             0 {} {First trail byte must be 80:9F}
    utf-8 ED81 tcl8      \u00ED\u0081   -1 {} {Missing second trail byte}
    utf-8 ED81 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 ED81 strict    {}              0 {} {Missing second trail byte}
    utf-8 EDBF tcl8      \u00ED\u00BF   -1 {} {Missing second trail byte}
    utf-8 EDBF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 EDBF strict    {}              0 {} {Missing second trail byte}
    utf-8 ED807F tcl8      \u00ED\u20AC\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 ED807F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 ED807F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 ED9F7F tcl8      \u00ED\u0178\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 ED9F7F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 ED9F7F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 EDA080 tcl8       \uD800          -1 {}  {High surrogate}
    utf-8 EDA080 replace    \uFFFD          -1 {knownBug}  {High surrogate}

    utf-8 EDA080 strict     {}               0 {}  {High surrogate}
    utf-8 EDAFBF tcl8       \uDBFF          -1 {}  {High surrogate}
    utf-8 EDAFBF replace    \uFFFD          -1 {knownBug}  {High surrogate}

    utf-8 EDAFBF strict     {}               0 {}  {High surrogate}
    utf-8 EDB080 tcl8       \uDC00          -1 {}  {Low surrogate}
    utf-8 EDB080 replace    \uFFFD          -1 {knownBug}  {Low surrogate}

    utf-8 EDB080 strict     {}               0 {}  {Low surrogate}
    utf-8 EDBFBF tcl8       \uDFFF          -1 {knownBug}  {Low surrogate}
    utf-8 EDBFBF replace    \uFFFD          -1 {knownBug}  {Low surrogate}

    utf-8 EDBFBF strict     {}               0 {}  {Low surrogate}
    utf-8 EDA080EDB080 tcl8 \U00010000      -1 {knownBug}  {High low surrogate pair}
    utf-8 EDA080EDB080 replace \uFFFD\uFFFD -1 {knownBug}  {High low surrogate pair}

    utf-8 EDA080EDB080 strict {}             0 {}  {High low surrogate pair}
    utf-8 EDAFBFEDBFBF tcl8 \U0010FFFF      -1 {knownBug}  {High low surrogate pair}
    utf-8 EDAFBFEDBFBF replace \uFFFD\uFFFD -1 {knownBug}  {High low surrogate pair}

    utf-8 EDAFBFEDBFBF strict {}             0 {}  {High low surrogate pair}

    utf-8 EE tcl8       \u00EE        -1 {} {Missing trail byte}
    utf-8 EE replace    \uFFFD        -1 {} {Missing trail byte}

    utf-8 EE strict     {}             0 {} {Missing trail byte}
    utf-8 EE7F tcl8     \u00EE\u7F    -1 {} {First trail byte must be 80:BF}
    utf-8 EE7F replace  \uFFFD\u7F    -1 {} {First trail byte must be 80:BF}

    utf-8 EE7F strict   {}             0 {} {First trail byte must be 80:BF}
    utf-8 EED0 tcl8     \u00EE\u00D0  -1 {} {First trail byte must be 80:BF}
    utf-8 EED0 replace  \uFFFD\uFFFD  -1 {} {First trail byte must be 80:BF}

    utf-8 EED0 strict   {}             0 {} {First trail byte must be 80:BF}
    utf-8 EE81 tcl8      \u00EE\u0081   -1 {} {Missing second trail byte}
    utf-8 EE81 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 EE81 strict    {}              0 {} {Missing second trail byte}
    utf-8 EEBF tcl8      \u00EE\u00BF   -1 {} {Missing second trail byte}
    utf-8 EEBF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 EEBF strict    {}              0 {} {Missing second trail byte}
    utf-8 EE807F tcl8      \u00EE\u20AC\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 EE807F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 EE807F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 EEBF7F tcl8      \u00EE\u00BF\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 EEBF7F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 EEBF7F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 EF tcl8       \u00EF        -1 {} {Missing trail byte}
    utf-8 EF replace    \uFFFD        -1 {} {Missing trail byte}

    utf-8 EF strict     {}             0 {} {Missing trail byte}
    utf-8 EF7F tcl8     \u00EF\u7F    -1 {} {First trail byte must be 80:BF}
    utf-8 EF7F replace  \uFFFD\u7F    -1 {} {First trail byte must be 80:BF}

    utf-8 EF7F strict   {}             0 {} {First trail byte must be 80:BF}
    utf-8 EFD0 tcl8     \u00EF\u00D0  -1 {} {First trail byte must be 80:BF}
    utf-8 EFD0 replace  \uFFFD\uFFFD  -1 {} {First trail byte must be 80:BF}

    utf-8 EFD0 strict   {}             0 {} {First trail byte must be 80:BF}
    utf-8 EF81 tcl8      \u00EF\u0081   -1 {} {Missing second trail byte}
    utf-8 EF81 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 EF81 strict    {}              0 {} {Missing second trail byte}
    utf-8 EFBF tcl8      \u00EF\u00BF   -1 {} {Missing second trail byte}
    utf-8 EFBF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 EFBF strict    {}              0 {} {Missing second trail byte}
    utf-8 EF807F tcl8      \u00EF\u20AC\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 EF807F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 EF807F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 EFBF7F tcl8      \u00EF\u00BF\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 EFBF7F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 EFBF7F strict    {}                0 {}  {Second trail byte must be 80:BF}

    utf-8 F0 tcl8       \u00F0        -1 {} {Missing trail byte}
    utf-8 F0 replace    \uFFFD        -1 {} {Missing trail byte}

    utf-8 F0 strict     {}             0 {} {Missing trail byte}
    utf-8 F080 tcl8     \u00F0\u20AC  -1 {} {First trail byte must be 90:BF}
    utf-8 F080 replace  \uFFFD        -1 {knownW3C} {First trail byte must be 90:BF}

    utf-8 F080 strict   {}             0 {} {First trail byte must be 90:BF}
    utf-8 F08F tcl8     \u00F0\u8F    -1 {} {First trail byte must be 90:BF}
    utf-8 F08F replace  \uFFFD        -1 {knownW3C} {First trail byte must be 90:BF}

    utf-8 F08F strict   {}             0 {} {First trail byte must be 90:BF}
    utf-8 F0D0 tcl8     \u00F0\u00D0  -1 {} {First trail byte must be 90:BF}
    utf-8 F0D0 replace  \uFFFD\uFFFD  -1 {} {First trail byte must be 90:BF}

    utf-8 F0D0 strict   {}             0 {} {First trail byte must be 90:BF}
    utf-8 F090 tcl8      \u00F0\u0090   -1 {} {Missing second trail byte}
    utf-8 F090 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 F090 strict    {}              0 {} {Missing second trail byte}
    utf-8 F0BF tcl8      \u00F0\u00BF   -1 {} {Missing second trail byte}
    utf-8 F0BF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 F0BF strict    {}              0 {} {Missing second trail byte}
    utf-8 F0907F tcl8      \u00F0\u0090\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F0907F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 F0907F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 F0BF7F tcl8      \u00F0\u00BF\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F0BF7F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 F0BF7F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 F090BF tcl8      \u00F0\u0090\u00BF   -1 {} {Missing third trail byte}
    utf-8 F090BF replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}

    utf-8 F090BF strict    {}              0 {} {Missing third trail byte}
    utf-8 F0BF81 tcl8      \u00F0\u00BF\u0081   -1 {} {Missing third trail byte}
    utf-8 F0BF81 replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}

    utf-8 F0BF81 strict    {}              0 {} {Missing third trail byte}
    utf-8 F0BF807F tcl8      \u00F0\u00BF\u20AC\x7F   -1 {} {Third trail byte must be 80:BF}
    utf-8 F0BF817F replace   \uFFFD\x7F           -1 {knownW3C} {Third trail byte must be 80:BF}

    utf-8 F0BF817F strict    {}              0 {} {Third trail byte must be 80:BF}
    utf-8 F090BFD0 tcl8      \u00F0\u0090\u00BF\u00D0   -1 {} {Third trail byte must be 80:BF}
    utf-8 F090BFD0 replace   \uFFFD         -1 {knownW3C} {Third trail byte must be 80:BF}

    utf-8 F090BFD0 strict    {}              0 {} {Third trail byte must be 80:BF}

    utf-8 F1 tcl8       \u00F1        -1 {} {Missing trail byte}
    utf-8 F1 replace    \uFFFD        -1 {} {Missing trail byte}

    utf-8 F1 strict     {}             0 {} {Missing trail byte}
    utf-8 F17F tcl8     \u00F1\u7F    -1 {} {First trail byte must be 80:BF}
    utf-8 F17F replace  \uFFFD        -1 {knownW3C} {First trail byte must be 80:BF}

    utf-8 F17F strict   {}             0 {} {First trail byte must be 80:BF}
    utf-8 F1D0 tcl8     \u00F1\u00D0  -1 {} {First trail byte must be 80:BF}
    utf-8 F1D0 replace  \uFFFD\uFFFD  -1 {} {First trail byte must be 80:BF}

    utf-8 F1D0 strict   {}             0 {} {First trail byte must be 80:BF}
    utf-8 F180 tcl8      \u00F1\u20AC   -1 {} {Missing second trail byte}
    utf-8 F180 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 F180 strict    {}              0 {} {Missing second trail byte}
    utf-8 F1BF tcl8      \u00F1\u00BF   -1 {} {Missing second trail byte}
    utf-8 F1BF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 F1BF strict    {}              0 {} {Missing second trail byte}
    utf-8 F1807F tcl8      \u00F1\u20AC\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F1807F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 F1807F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 F1BF7F tcl8      \u00F1\u00BF\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F1BF7F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 F1BF7F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 F180BF tcl8      \u00F1\u20AC\u00BF   -1 {} {Missing third trail byte}
    utf-8 F180BF replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}

    utf-8 F180BF strict    {}              0 {} {Missing third trail byte}
    utf-8 F1BF81 tcl8      \u00F1\u00BF\u0081   -1 {} {Missing third trail byte}
    utf-8 F1BF81 replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}

    utf-8 F1BF81 strict    {}              0 {} {Missing third trail byte}
    utf-8 F1BF807F tcl8      \u00F1\u00BF\u20AC\x7F   -1 {} {Third trail byte must be 80:BF}
    utf-8 F1BF817F replace   \uFFFD\x7F           -1 {knownW3C} {Third trail byte must be 80:BF}

    utf-8 F1BF817F strict    {}              0 {} {Third trail byte must be 80:BF}
    utf-8 F180BFD0 tcl8      \u00F1\u20AC\u00BF\u00D0   -1 {} {Third trail byte must be 80:BF}
    utf-8 F180BFD0 replace   \uFFFD         -1 {knownW3C} {Third trail byte must be 80:BF}

    utf-8 F180BFD0 strict    {}              0 {} {Third trail byte must be 80:BF}
    utf-8 F3 tcl8       \u00F3        -1 {} {Missing trail byte}
    utf-8 F3 replace    \uFFFD        -1 {} {Missing trail byte}

    utf-8 F3 strict     {}             0 {} {Missing trail byte}
    utf-8 F37F tcl8     \u00F3\x7F    -1 {} {First trail byte must be 80:BF}
    utf-8 F37F replace  \uFFFD        -1 {knownW3C} {First trail byte must be 80:BF}

    utf-8 F37F strict   {}             0 {} {First trail byte must be 80:BF}
    utf-8 F3D0 tcl8     \u00F3\u00D0  -1 {} {First trail byte must be 80:BF}
    utf-8 F3D0 replace  \uFFFD\uFFFD  -1 {} {First trail byte must be 80:BF}

    utf-8 F3D0 strict   {}             0 {} {First trail byte must be 80:BF}
    utf-8 F380 tcl8      \u00F3\u20AC   -1 {} {Missing second trail byte}
    utf-8 F380 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 F380 strict    {}              0 {} {Missing second trail byte}
    utf-8 F3BF tcl8      \u00F3\u00BF   -1 {} {Missing second trail byte}
    utf-8 F3BF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 F3BF strict    {}              0 {} {Missing second trail byte}
    utf-8 F3807F tcl8      \u00F3\u20AC\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F3807F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 F3807F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 F3BF7F tcl8      \u00F3\u00BF\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F3BF7F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 F3BF7F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 F380BF tcl8      \u00F3\u20AC\u00BF   -1 {} {Missing third trail byte}
    utf-8 F380BF replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}

    utf-8 F380BF strict    {}              0 {} {Missing third trail byte}
    utf-8 F3BF81 tcl8      \u00F3\u00BF\u0081   -1 {} {Missing third trail byte}
    utf-8 F3BF81 replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}

    utf-8 F3BF81 strict    {}              0 {} {Missing third trail byte}
    utf-8 F3BF807F tcl8      \u00F3\u00BF\u20AC\x7F   -1 {} {Third trail byte must be 80:BF}
    utf-8 F3BF817F replace   \uFFFD\x7F           -1 {knownW3C} {Third trail byte must be 80:BF}

    utf-8 F3BF817F strict    {}              0 {} {Third trail byte must be 80:BF}
    utf-8 F380BFD0 tcl8      \u00F3\u20AC\u00BF\u00D0   -1 {} {Third trail byte must be 80:BF}
    utf-8 F380BFD0 replace   \uFFFD         -1 {knownW3C} {Third trail byte must be 80:BF}

    utf-8 F380BFD0 strict    {}              0 {} {Third trail byte must be 80:BF}

    utf-8 F4 tcl8       \u00F4        -1 {} {Missing trail byte}
    utf-8 F4 replace    \uFFFD        -1 {} {Missing trail byte}

    utf-8 F4 strict     {}             0 {} {Missing trail byte}
    utf-8 F47F tcl8     \u00F4\u7F    -1 {} {First trail byte must be 80:8F}
    utf-8 F47F replace  \uFFFD\u7F    -1 {knownW3C} {First trail byte must be 80:8F}

    utf-8 F47F strict   {}             0 {} {First trail byte must be 80:8F}
    utf-8 F490 tcl8     \u00F4\u0090  -1 {} {First trail byte must be 80:8F}
    utf-8 F490 replace  \uFFFD\uFFFD  -1 {} {First trail byte must be 80:8F}

    utf-8 F490 strict   {}             0 {} {First trail byte must be 80:8F}
    utf-8 F480 tcl8      \u00F4\u20AC   -1 {} {Missing second trail byte}
    utf-8 F480 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 F480 strict    {}              0 {} {Missing second trail byte}
    utf-8 F48F tcl8      \u00F4\u008F   -1 {} {Missing second trail byte}
    utf-8 F48F replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 F48F strict    {}              0 {} {Missing second trail byte}
    utf-8 F4807F tcl8      \u00F4\u20AC\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F4807F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 F4807F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 F48F7F tcl8      \u00F4\u008F\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F48F7F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 F48F7F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 F48081 tcl8      \u00F4\u20AC\u0081   -1 {} {Missing third trail byte}
    utf-8 F48081 replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}

    utf-8 F48081 strict    {}              0 {} {Missing third trail byte}
    utf-8 F48F81 tcl8      \u00F4\u008F\u0081   -1 {} {Missing third trail byte}
    utf-8 F48F81 replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}

    utf-8 F48F81 strict    {}              0 {} {Missing third trail byte}
    utf-8 F481817F tcl8      \u00F4\u0081\u0081\x7F   -1 {} {Third trail byte must be 80:BF}
    utf-8 F480817F replace   \uFFFD\x7F           -1 {knownW3C} {Third trail byte must be 80:BF}

    utf-8 F480817F strict    {}              0 {} {Third trail byte must be 80:BF}
    utf-8 F48FBFD0 tcl8      \u00F4\u008F\u00BF\u00D0   -1 {} {Third trail byte must be 80:BF}
    utf-8 F48FBFD0 replace   \uFFFD         -1 {knownW3C} {Third trail byte must be 80:BF}

    utf-8 F48FBFD0 strict    {}              0 {} {Third trail byte must be 80:BF}

    utf-8 F5 tcl8    \u00F5 -1 {} {F5:FF are invalid everywhere}
    utf-8 F5 replace \uFFFD -1 {} {F5:FF are invalid everywhere}

    utf-8 F5 strict  {}      0 {} {F5:FF are invalid everywhere}
    utf-8 FF tcl8    \u00FF -1 {} {F5:FF are invalid everywhere}
    utf-8 FF replace \uFFFD -1 {} {F5:FF are invalid everywhere}

    utf-8 FF strict  {}      0 {} {F5:FF are invalid everywhere}

    utf-8 C0AFE080BFF0818130 replace \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x30 -1 {} {Unicode Table 3-8}

    utf-8 EDA080EDBFBFEDAF30 replace \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x30 -1 {knownW3C} {Unicode Table 3-9}

    utf-8 F4919293FF4180BF30 replace \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u0041\uFFFD\uFFFD\x30 -1 {} {Unicode Table 3-10}

    utf-8 E180E2F09192F1BF30 replace \uFFFD\uFFFD\uFFFD\uFFFD\x30                         -1 {knownW3C} {Unicode Table 3.11}

}

# utf16-le and utf16-be test cases. Note utf16 cases are automatically generated
# based on these depending on platform endianness. Note truncated tests can only
# happen when the sequence is at the end (including by itself) Thus {solo tail}
# in some cases.
lappend encInvalidBytes {*}{
    utf-16le 41      tcl8      \uFFFD -1 {solo tail} {Truncated}
    utf-16le 41      replace   \uFFFD -1 {solo tail} {Truncated}
    utf-16le 41      strict    {}      0 {solo tail} {Truncated}


    utf-16le 00D8    tcl8      \uD800 -1 {} {Missing low surrogate}
    utf-16le 00D8    replace   \uFFFD -1 {} {Missing low surrogate}

    utf-16le 00D8    strict    {}      0 {knownBug} {Missing low surrogate}
    utf-16le 00DC    tcl8      \uDC00 -1 {} {Missing high surrogate}
    utf-16le 00DC    replace   \uFFFD -1 {} {Missing high surrogate}

    utf-16le 00DC    strict    {}      0 {knownBug} {Missing high surrogate}

    utf-16be 41      tcl8      \uFFFD -1 {solo tail} {Truncated}
    utf-16be 41      replace   \uFFFD -1 {solo tail} {Truncated}
    utf-16be 41      strict    {}      0 {solo tail} {Truncated}


    utf-16be D800    tcl8      \uD800 -1 {} {Missing low surrogate}
    utf-16be D800    replace   \uFFFD -1 {knownBug} {Missing low surrogate}

    utf-16be D800    strict    {}      0 {knownBug} {Missing low surrogate}
    utf-16be DC00    tcl8      \uDC00 -1 {} {Missing high surrogate}
    utf-16be DC00    replace   \uFFFD -1 {knownBug} {Missing high surrogate}

    utf-16be DC00    strict    {}      0 {knownBug} {Missing high surrogate}
}

# utf32-le and utf32-be test cases. Note utf32 cases are automatically generated
# based on these depending on platform endianness. Note truncated tests can only
# happen when the sequence is at the end (including by itself) Thus {solo tail}
# in some cases.
lappend encInvalidBytes {*}{
    utf-32le 41      tcl8      \uFFFD  -1 {solo tail} {Truncated}
    utf-32le 41      replace   \uFFFD  -1 {solo} {Truncated}

    utf-32le 41      strict    {}   0 {solo tail} {Truncated}
    utf-32le 4100    tcl8      \uFFFD  -1 {solo tail} {Truncated}
    utf-32le 4100    replace   \uFFFD  -1 {solo} {Truncated}

    utf-32le 4100    strict    {}   0 {solo tail} {Truncated}
    utf-32le 410000  tcl8      \uFFFD  -1 {solo tail} {Truncated}
    utf-32le 410000  replace   \uFFFD  -1 {solo} {Truncated}

    utf-32le 410000  strict    {}       0 {solo tail} {Truncated}
    utf-32le 00D80000 tcl8     \uD800   -1 {} {High-surrogate}
    utf-32le 00D80000 replace  \uFFFD   -1 {} {High-surrogate}

    utf-32le 00D80000 strict   {}        0 {} {High-surrogate}
    utf-32le 00DC0000 tcl8     \uDC00   -1 {} {Low-surrogate}
    utf-32le 00DC0000 replace  \uFFFD   -1 {} {Low-surrogate}

    utf-32le 00DC0000 strict   {}        0 {} {Low-surrogate}
    utf-32le 00D8000000DC0000 tcl8 \uD800\uDC00    -1 {} {High-low-surrogate-pair}
    utf-32le 00D8000000DC0000 replace \uFFFD\uFFFD -1 {} {High-low-surrogate-pair}

    utf-32le 00D8000000DC0000 strict  {}            0 {} {High-low-surrogate-pair}
    utf-32le 00001100 tcl8 \uFFFD    -1 {} {Out of range}

    utf-32le 00001100 replace \uFFFD -1 {} {Out of range}
    utf-32le 00001100 strict {}       0 {} {Out of range}
    utf-32le FFFFFFFF tcl8 \uFFFD    -1 {} {Out of range}

    utf-32le FFFFFFFF replace \uFFFD -1 {} {Out of range}
    utf-32le FFFFFFFF strict {}       0 {} {Out of range}

    utf-32be 41      tcl8      \uFFFD  -1 {solo tail} {Truncated}
    utf-32be 41      replace   \uFFFD  -1 {solo tail} {Truncated}

    utf-32be 41      strict    {}       0 {solo tail} {Truncated}
    utf-32be 0041    tcl8      \uFFFD  -1 {solo tail} {Truncated}
    utf-32be 0041    replace   \uFFFD  -1 {solo} {Truncated}

    utf-32be 0041    strict    {}   0 {solo tail} {Truncated}
    utf-32be 000041  tcl8      \uFFFD  -1 {solo tail} {Truncated}
    utf-32be 000041  replace   \uFFFD  -1 {solo} {Truncated}

    utf-32be 000041  strict    {}       0 {solo tail} {Truncated}
    utf-32be 0000D800 tcl8     \uD800   -1 {} {High-surrogate}
    utf-32be 0000D800 replace  \uFFFD   -1 {} {High-surrogate}

    utf-32be 0000D800 strict   {}        0 {} {High-surrogate}
    utf-32be 0000DC00 tcl8     \uDC00   -1 {} {Low-surrogate}
    utf-32be 0000DC00 replace  \uFFFD   -1 {} {Low-surrogate}

    utf-32be 0000DC00 strict   {}        0 {} {Low-surrogate}
    utf-32be 0000D8000000DC00 tcl8 \uD800\uDC00    -1 {} {High-low-surrogate-pair}
    utf-32be 0000D8000000DC00 replace \uFFFD\uFFFD -1 {} {High-low-surrogate-pair}

    utf-32be 0000D8000000DC00 strict  {}            0 {} {High-low-surrogate-pair}
    utf-32be 00110000 tcl8 \uFFFD    -1 {} {Out of range}

    utf-32be 00110000 replace \uFFFD -1 {} {Out of range}
    utf-32be 00110000 strict {}       0 {} {Out of range}
    utf-32be FFFFFFFF tcl8 \uFFFD    -1 {} {Out of range}

    utf-32be FFFFFFFF replace \uFFFD -1 {} {Out of range}
    utf-32be FFFFFFFF strict {}       0 {} {Out of range}
}






# Strings that cannot be encoded for specific encoding / profiles
# <ENCODING STRING PROFILE EXPECTEDRESULT EXPECTEDFAILINDEX CTRL COMMENT>
# <ENCODING,STRING,PROFILE> should be unique for test ids to be unique.
# See earlier comments about CTRL field.
#
# Note utf-16, utf-32 missing because they are automatically
# generated based on le/be versions.
# TODO - out of range code point (note cannot be generated by \U notation)
lappend encUnencodableStrings {*}{
    ascii \u00e0 tcl8    3f -1 {} {unencodable}
    ascii \u00e0 strict  {}  0 {} {unencodable}

    iso8859-1 \u0141 tcl8    3f -1 {} unencodable
    iso8859-1 \u0141 strict  {}  0 {} unencodable

    utf-8 \uD800 tcl8    eda080 -1 {} High-surrogate
    utf-8 \uD800 strict  {}      0 {} High-surrogate
    utf-8 \uDC00 tcl8    edb080 -1 {} High-surrogate
    utf-8 \uDC00 strict  {}      0 {} High-surrogate
}


# The icuUcmTests.tcl is generated by the tools/ucm2tests.tcl script
# and generates test vectors for the above tables for various encodings
# based on ICU UCM files.
# TODO - commented out for now as generating a lot of mismatches.

# source [file join [file dirname [info script]] icuUcmTests.tcl]







>



>


>



>



>



>




>



>



>




>



>



>



>



>




>



>



>



>



>



>


|
>


|
>




>



>



>



>


|
>


|
>



>



>



>



>


|
>


|
>





|
|
>



>



>



>


|
>


|
>



>



>



>



>



>



>




>

|
|
>



>



>



>


|
>


|
>



>



>



>



>



>


|
>


|
>




>



>



>



>



>



>



>



>



>



>



>



>




>



>



>



>



>


|
>


|
>



>



>

|

>



>



>



>



>



>



>

|
|
>



>



>



>



>



>




>



>



>



>



>



>



>



>



>



>



>




>



>



>

>

>

>










>
>


>



>





>
>


>



>










>



>



>



>



>


|
>


>
|


>
|




>



>



>



>



>


|
>


>
|


>
|


>
>
>
>
>






<
<


















|
>

205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776


777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
    utf-8 9D tcl8    \u009D -1 {} {map to cp1252}
    utf-8 9E tcl8    \u017E -1 {} {map to cp1252}
    utf-8 9F tcl8    \u0178 -1 {} {map to cp1252}

    utf-8 C0 tcl8    \u00C0 -1 {} {C0 is invalid anywhere}
    utf-8 C0 strict  {}      0 {} {C0 is invalid anywhere}
    utf-8 C0 replace \uFFFD -1 {} {C0 is invalid anywhere}
    utf-8 C0 lossless \uDCC0 -1 {} {C0 is invalid anywhere}
    utf-8 C080 tcl8    \u0000 -1 {} {C080 -> U+0 in Tcl's internal modified UTF8}
    utf-8 C080 strict  {}      0 {} {C080 -> invalid}
    utf-8 C080 replace \uFFFD -1 {} {C080 -> single replacement char}
    utf-8 C080 lossless \uDCC0\uDC80 -1 {} {C080 -> two lossless wrappers}
    utf-8 C0A2 tcl8    \u00C0\u00A2 -1 {} {websec.github.io - A}
    utf-8 C0A2 replace \uFFFD\uFFFD -1 {} {websec.github.io - A}
    utf-8 C0A2 lossless \uDCC0\uDCA2 -1 {} {websec.github.io - A}
    utf-8 C0A2 strict  {}            0 {} {websec.github.io - A}
    utf-8 C0A7 tcl8    \u00C0\u00A7 -1 {} {websec.github.io - double quote}
    utf-8 C0A7 replace \uFFFD\uFFFD -1 {} {websec.github.io - double quote}
    utf-8 C0A7 lossless \uDCC0\uDCA7 -1 {} {websec.github.io - A}
    utf-8 C0A7 strict  {}            0 {} {websec.github.io - double quote}
    utf-8 C0AE tcl8    \u00C0\u00AE -1 {} {websec.github.io - full stop}
    utf-8 C0AE replace \uFFFD\uFFFD -1 {} {websec.github.io - full stop}
    utf-8 C0AE lossless \uDCC0\uDCAE -1 {} {websec.github.io - A}
    utf-8 C0AE strict  {}            0 {} {websec.github.io - full stop}
    utf-8 C0AF tcl8    \u00C0\u00AF -1 {} {websec.github.io - solidus}
    utf-8 C0AF replace \uFFFD\uFFFD -1 {} {websec.github.io - solidus}
    utf-8 C0AF lossless \uDCC0\uDCAF -1 {} {websec.github.io - A}
    utf-8 C0AF strict  {}            0 {} {websec.github.io - solidus}

    utf-8 C1 tcl8    \u00C1 -1 {} {C1 is invalid everywhere}
    utf-8 C1 replace \uFFFD -1 {} {C1 is invalid everywhere}
    utf-8 C1 lossless \uDCC1 -1 {} {C1 is invalid anywhere}
    utf-8 C1 strict  {}      0 {} {C1 is invalid everywhere}
    utf-8 C181 tcl8    \u00C1\u0081 -1 {} {websec.github.io - base test (A)}
    utf-8 C181 replace \uFFFD\uFFFD -1 {} {websec.github.io - base test (A)}
    utf-8 C181 lossless \uDCC1\uDC81 -1 {} {websec.github.io - base test (A)}
    utf-8 C181 strict  {}            0 {} {websec.github.io - base test (A)}
    utf-8 C19C tcl8    \u00C1\u0153 -1 {} {websec.github.io - reverse solidus}
    utf-8 C19C replace \uFFFD\uFFFD -1 {} {websec.github.io - reverse solidus}
    utf-8 C19C lossless \uDCC1\uDC9C -1 {} {websec.github.io - reverse solidus}
    utf-8 C19C strict  {}            0 {} {websec.github.io - reverse solidus}

    utf-8 C2 tcl8      \u00C2     -1 {} {Missing trail byte}
    utf-8 C2 replace   \uFFFD     -1 {} {Missing trail byte}
    utf-8 C2 lossless   \uDCC2     -1 {} {Missing trail byte}
    utf-8 C2 strict    {}          0 {} {Missing trail byte}
    utf-8 C27F tcl8    \u00C2\x7F -1 {} {Trail byte must be 80:BF}
    utf-8 C27F replace \uFFFD\x7F -1 {} {Trail byte must be 80:BF}
    utf-8 C27F lossless \uDCC2\x7F -1 {} {Trail byte must be 80:BF}
    utf-8 C27F strict  {}          0 {} {Trail byte must be 80:BF}
    utf-8 DF tcl8      \u00DF     -1 {} {Missing trail byte}
    utf-8 DF replace   \uFFFD     -1 {} {Missing trail byte}
    utf-8 DF lossless   \uDCDF     -1 {} {Missing trail byte}
    utf-8 DF strict    {}          0 {} {Missing trail byte}
    utf-8 DF7F tcl8    \u00DF\x7F -1 {} {Trail byte must be 80:BF}
    utf-8 DF7F replace \uFFFD\x7F -1 {} {Trail byte must be 80:BF}
    utf-8 DF7F lossless \uDCDF\x7F -1 {} {Trail byte must be 80:BF}
    utf-8 DF7F strict  {}          0 {} {Trail byte must be 80:BF}
    utf-8 DFE0A080 tcl8    \u00DF\u0800 -1 {} {Invalid trail byte is start of valid sequence}
    utf-8 DFE0A080 replace \uFFFD\u0800 -1 {} {Invalid trail byte is start of valid sequence}
    utf-8 DFE0A080 lossless \uDCDF\u0800 -1 {} {Invalid trail byte is start of valid sequence}
    utf-8 DFE0A080 strict  {}            0 {} {Invalid trail byte is start of valid sequence}

    utf-8 E0 tcl8      \u00E0     -1 {} {Missing trail byte}
    utf-8 E0 replace   \uFFFD     -1 {} {Missing trail byte}
    utf-8 E0 lossless  \uDCE0    -1 {} {Missing trail byte}
    utf-8 E0 strict    {}          0 {} {Missing trail byte}
    utf-8 E080 tcl8      \u00E0\u20AC   -1 {} {First trail byte must be A0:BF}
    utf-8 E080 replace   \uFFFD\uFFFD   -1 {} {First trail byte must be A0:BF}
    utf-8 E080 lossless   \uDCE0\uDC80  -1 {} {First trail byte must be A0:BF}
    utf-8 E080 strict    {}              0 {} {First trail byte must be A0:BF}
    utf-8 E0819C tcl8    \u00E0\u0081\u0153 -1 {} {websec.github.io - reverse solidus}
    utf-8 E0819C replace \uFFFD\uFFFD\uFFFD -1 {} {websec.github.io - reverse solidus}
    utf-8 E0819C lossless \uDCE0\uDC81\uDC9C -1 {} {websec.github.io - reverse solidus}
    utf-8 E0819C strict  {}                  0 {} {websec.github.io - reverse solidus}
    utf-8 E09F tcl8      \u00E0\u0178   -1 {} {First trail byte must be A0:BF}
    utf-8 E09F replace   \uFFFD\uFFFD   -1 {} {First trail byte must be A0:BF}
    utf-8 E09F lossless  \uDCE0\uDC9F   -1 {} {First trail byte must be A0:BF}
    utf-8 E09F strict    {}              0 {} {First trail byte must be A0:BF}
    utf-8 E0A0 tcl8      \u00E0\u00A0   -1 {} {Missing second trail byte}
    utf-8 E0A0 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 E0A0 lossless  \uDCE0\uDCA0   -1 {} {Missing second trail byte}
    utf-8 E0A0 strict    {}              0 {} {Missing second trail byte}
    utf-8 E0BF tcl8      \u00E0\u00BF   -1 {} {Missing second trail byte}
    utf-8 E0BF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 E0BF lossless  \uDCE0\uDCBF   -1 {} {Missing second trail byte}
    utf-8 E0BF strict    {}              0 {} {Missing second trail byte}
    utf-8 E0A07F tcl8    \u00E0\u00A0\x7F   -1 {}     {Second trail byte must be 80:BF}
    utf-8 E0A07F replace \uFFFD\x7F         -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 E0A07F lossless \uDCE0\uDCA0\x7F  -1 {} {Second trail byte must be 80:BF}
    utf-8 E0A07F strict  {}                  0 {}         {Second trail byte must be 80:BF}
    utf-8 E0BF7F tcl8    \u00E0\u00BF\x7F   -1 {}         {Second trail byte must be 80:BF}
    utf-8 E0BF7F replace \uFFFD\x7F         -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 E0BF7F lossless \uDCE0\uDCBF\x7F  -1 {} {Second trail byte must be 80:BF}
    utf-8 E0BF7F strict  {}                  0 {}         {Second trail byte must be 80:BF}

    utf-8 E1 tcl8      \u00E1     -1 {} {Missing trail byte}
    utf-8 E1 replace   \uFFFD     -1 {} {Missing trail byte}
    utf-8 E1 lossless  \uDCE1     -1 {} {Missing trail byte}
    utf-8 E1 strict    {}          0 {} {Missing trail byte}
    utf-8 E17F tcl8    \u00E1\x7F -1 {} {Trail byte must be 80:BF}
    utf-8 E17F replace \uFFFD\x7F -1 {} {Trail byte must be 80:BF}
    utf-8 E17F lossless \uDCE1\x7F -1 {} {Trail byte must be 80:BF}
    utf-8 E17F strict  {}          0 {} {Trail byte must be 80:BF}
    utf-8 E181 tcl8      \u00E1\u0081   -1 {} {Missing second trail byte}
    utf-8 E181 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 E181 lossless  \uDCE1\uDC81   -1 {} {Missing second trail byte}
    utf-8 E181 strict    {}              0 {} {Missing second trail byte}
    utf-8 E1BF tcl8      \u00E1\u00BF   -1 {} {Missing second trail byte}
    utf-8 E1BF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 E1BF lossless   \uDCE1\uDCBF  -1 {} {Missing second trail byte}
    utf-8 E1BF strict    {}              0 {} {Missing second trail byte}
    utf-8 E1807F tcl8    \u00E1\u20AC\x7F   -1 {} {Second trail byte must be 80:BF}
    utf-8 E1807F replace \uFFFD\x7F         -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 E1807F lossless \uDCE1\uDC80\x7F  -1 {} {Second trail byte must be 80:BF}
    utf-8 E1807F strict  {}                  0 {}         {Second trail byte must be 80:BF}
    utf-8 E1BF7F tcl8    \u00E1\u00BF\x7F   -1 {}         {Second trail byte must be 80:BF}
    utf-8 E1BF7F replace \uFFFD\x7F         -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 E1BF7F lossless \uDCE1\uDCBF\x7F  -1 {} {Second trail byte must be 80:BF}
    utf-8 E1BF7F strict  {}                  0 {}         {Second trail byte must be 80:BF}
    utf-8 EC tcl8      \u00EC     -1 {} {Missing trail byte}
    utf-8 EC replace   \uFFFD     -1 {} {Missing trail byte}
    utf-8 EC lossless   \uDCEC    -1 {} {Missing trail byte}
    utf-8 EC strict    {}          0 {} {Missing trail byte}
    utf-8 EC7F tcl8    \u00EC\x7F -1 {} {Trail byte must be 80:BF}
    utf-8 EC7F replace \uFFFD\x7F -1 {} {Trail byte must be 80:BF}
    utf-8 EC7F lossless \uDCEC\x7F -1 {} {Trail byte must be 80:BF}
    utf-8 EC7F strict  {}          0 {} {Trail byte must be 80:BF}
    utf-8 EC81 tcl8      \u00EC\u0081   -1 {} {Missing second trail byte}
    utf-8 EC81 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 EC81 lossless   \uDCEC\uDC81  -1 {} {Missing second trail byte}
    utf-8 EC81 strict    {}              0 {} {Missing second trail byte}
    utf-8 ECBF tcl8      \u00EC\u00BF   -1 {} {Missing second trail byte}
    utf-8 ECBF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 ECBF lossless   \uDCEC\uDCBF  -1 {} {Missing second trail byte}
    utf-8 ECBF strict    {}              0 {} {Missing second trail byte}
    utf-8 EC807F tcl8    \u00EC\u20AC\x7F   -1 {} {Second trail byte must be 80:BF}
    utf-8 EC807F replace \uFFFD\x7F         -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 EC807F lossless \uDCEC\uDC80\x7F  -1 {} {Second trail byte must be 80:BF}
    utf-8 EC807F strict  {}                  0 {}         {Second trail byte must be 80:BF}
    utf-8 ECBF7F tcl8    \u00EC\u00BF\x7F   -1 {}         {Second trail byte must be 80:BF}
    utf-8 ECBF7F replace \uFFFD\x7F         -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 ECBF7F lossless \uDCEC\uDCBF\x7F  -1 {} {Second trail byte must be 80:BF}
    utf-8 ECBF7F strict  {}                  0 {}         {Second trail byte must be 80:BF}

    utf-8 ED tcl8       \u00ED        -1 {} {Missing trail byte}
    utf-8 ED replace    \uFFFD        -1 {} {Missing trail byte}
    utf-8 ED strict     {}             0 {} {Missing trail byte}
    utf-8 ED7F tcl8     \u00ED\x7F    -1 {} {First trail byte must be 80:9F}
    utf-8 ED7F replace  \uFFFD\x7F    -1 {} {First trail byte must be 80:9F}
    utf-8 ED7F lossless  \uDCED\x7F    -1 {} {First trail byte must be 80:9F}
    utf-8 ED7F strict   {}             0 {} {First trail byte must be 80:9F}
    utf-8 EDA0 tcl8     \u00ED\u00A0  -1 {} {First trail byte must be 80:9F}
    utf-8 EDA0 replace  \uFFFD\uFFFD  -1 {} {First trail byte must be 80:9F}
    utf-8 EDA0 lossless  \uDCED\uDCA0  -1 {} {First trail byte must be 80:9F}
    utf-8 EDA0 strict   {}             0 {} {First trail byte must be 80:9F}
    utf-8 ED81 tcl8      \u00ED\u0081   -1 {} {Missing second trail byte}
    utf-8 ED81 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 ED81 lossless   \uDCED\uDC81  -1 {} {Missing second trail byte}
    utf-8 ED81 strict    {}              0 {} {Missing second trail byte}
    utf-8 EDBF tcl8      \u00ED\u00BF   -1 {} {Missing second trail byte}
    utf-8 EDBF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 EDBF lossless   \uDCED\uDCBF  -1 {} {Missing second trail byte}
    utf-8 EDBF strict    {}              0 {} {Missing second trail byte}
    utf-8 ED807F tcl8      \u00ED\u20AC\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 ED807F replace   \uFFFD\x7F       -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 ED807F lossless   \uDCED\uDC80\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 ED807F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 ED9F7F tcl8      \u00ED\u0178\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 ED9F7F replace   \uFFFD\x7F       -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 ED9F7F lossless   \uDCED\uDC9F\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 ED9F7F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 EDA080 tcl8       \uD800          -1 {}  {High surrogate}
    utf-8 EDA080 replace    \uFFFD          -1 {knownBug}  {High surrogate}
    utf-8 EDA080 lossless    \uDCED\uDCA0\uED80 -1 {knownBug}  {High surrogate}
    utf-8 EDA080 strict     {}               0 {}  {High surrogate}
    utf-8 EDAFBF tcl8       \uDBFF          -1 {}  {High surrogate}
    utf-8 EDAFBF replace    \uFFFD          -1 {knownBug}  {High surrogate}
    utf-8 EDAFBF lossless    \uDCED\uDCAF\uDCBF -1 {knownBug}  {High surrogate}
    utf-8 EDAFBF strict     {}               0 {}  {High surrogate}
    utf-8 EDB080 tcl8       \uDC00          -1 {}  {Low surrogate}
    utf-8 EDB080 replace    \uFFFD          -1 {knownBug}  {Low surrogate}
    utf-8 EDB080 lossless    \uDCED\uDCB0\uDC80 -1 {knownBug}  {Low surrogate}
    utf-8 EDB080 strict     {}               0 {}  {Low surrogate}
    utf-8 EDBFBF tcl8       \uDFFF          -1 {knownBug}  {Low surrogate}
    utf-8 EDBFBF replace    \uFFFD          -1 {knownBug}  {Low surrogate}
    utf-8 EDBFBF lossless    \uDCED\uDCBF\uDCBF -1 {knownBug}  {Low surrogate}
    utf-8 EDBFBF strict     {}               0 {}  {Low surrogate}
    utf-8 EDA080EDB080 tcl8 \U00010000      -1 {knownBug}  {High low surrogate pair}
    utf-8 EDA080EDB080 replace \uFFFD\uFFFD -1 {knownBug}  {High low surrogate pair}
    utf-8 EDA080EDB080 lossless \uDCED\uDCA0\uDC80\uDCED\uDCB0\uDC80  -1 {knownBug}  {High low surrogate pair}
    utf-8 EDA080EDB080 strict {}             0 {}  {High low surrogate pair}
    utf-8 EDAFBFEDBFBF tcl8 \U0010FFFF      -1 {knownBug}  {High low surrogate pair}
    utf-8 EDAFBFEDBFBF replace \uFFFD\uFFFD -1 {knownBug}  {High low surrogate pair}
    utf-8 EDAFBFEDBFBF lossless \uDCED\uDCAF\uDCBF\uDCED\uDCBF\uDCBF  -1 {knownBug}  {High low surrogate pair}
    utf-8 EDAFBFEDBFBF strict {}             0 {}  {High low surrogate pair}

    utf-8 EE tcl8       \u00EE        -1 {} {Missing trail byte}
    utf-8 EE replace    \uFFFD        -1 {} {Missing trail byte}
    utf-8 EE lossless    \uDCEE       -1 {} {Missing trail byte}
    utf-8 EE strict     {}             0 {} {Missing trail byte}
    utf-8 EE7F tcl8     \u00EE\x7F    -1 {} {First trail byte must be 80:BF}
    utf-8 EE7F replace  \uFFFD\x7F    -1 {} {First trail byte must be 80:BF}
    utf-8 EE7F lossless  \uDCEE\x7F     -1 {} {First trail byte must be 80:BF}
    utf-8 EE7F strict   {}             0 {} {First trail byte must be 80:BF}
    utf-8 EED0 tcl8     \u00EE\u00D0  -1 {} {First trail byte must be 80:BF}
    utf-8 EED0 replace  \uFFFD\uFFFD  -1 {} {First trail byte must be 80:BF}
    utf-8 EED0 lossless  \uDCEE\uDCD0   -1 {} {First trail byte must be 80:BF}
    utf-8 EED0 strict   {}             0 {} {First trail byte must be 80:BF}
    utf-8 EE81 tcl8      \u00EE\u0081   -1 {} {Missing second trail byte}
    utf-8 EE81 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 EE81 lossless   \uDCEE\uDC81  -1 {} {Missing second trail byte}
    utf-8 EE81 strict    {}              0 {} {Missing second trail byte}
    utf-8 EEBF tcl8      \u00EE\u00BF   -1 {} {Missing second trail byte}
    utf-8 EEBF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 EEBF lossless   \uDCEE\uDCBF  -1 {} {Missing second trail byte}
    utf-8 EEBF strict    {}              0 {} {Missing second trail byte}
    utf-8 EE807F tcl8      \u00EE\u20AC\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 EE807F replace   \uFFFD\x7F       -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 EE807F lossless  \uDCEE\uDC80\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 EE807F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 EEBF7F tcl8      \u00EE\u00BF\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 EEBF7F replace   \uFFFD\x7F       -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 EEBF7F lossless  \uDCEE\uDCBF\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 EEBF7F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 EF tcl8       \u00EF        -1 {} {Missing trail byte}
    utf-8 EF replace    \uFFFD        -1 {} {Missing trail byte}
    utf-8 EF lossless    \uDCEF       -1 {} {Missing trail byte}
    utf-8 EF strict     {}             0 {} {Missing trail byte}
    utf-8 EF7F tcl8     \u00EF\u7F    -1 {} {First trail byte must be 80:BF}
    utf-8 EF7F replace  \uFFFD\u7F    -1 {} {First trail byte must be 80:BF}
    utf-8 EF7F lossless  \uDCEF\x7F -1 {} {First trail byte must be 80:BF}
    utf-8 EF7F strict   {}             0 {} {First trail byte must be 80:BF}
    utf-8 EFD0 tcl8     \u00EF\u00D0  -1 {} {First trail byte must be 80:BF}
    utf-8 EFD0 replace  \uFFFD\uFFFD  -1 {} {First trail byte must be 80:BF}
    utf-8 EFD0 lossless  \uDCEF\uDCD0 -1 {} {First trail byte must be 80:BF}
    utf-8 EFD0 strict   {}             0 {} {First trail byte must be 80:BF}
    utf-8 EF81 tcl8      \u00EF\u0081   -1 {} {Missing second trail byte}
    utf-8 EF81 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 EF81 lossless  \uDCEF\uDC81   -1 {} {Missing second trail byte}
    utf-8 EF81 strict    {}              0 {} {Missing second trail byte}
    utf-8 EFBF tcl8      \u00EF\u00BF   -1 {} {Missing second trail byte}
    utf-8 EFBF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 EFBF lossless  \uDCEF\uDCBF   -1 {} {Missing second trail byte}
    utf-8 EFBF strict    {}              0 {} {Missing second trail byte}
    utf-8 EF807F tcl8      \u00EF\u20AC\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 EF807F replace   \uFFFD\x7F       -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 EF807F lossless  \uDCEF\uDC80\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 EF807F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 EFBF7F tcl8      \u00EF\u00BF\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 EFBF7F replace   \uFFFD\x7F       -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 EFBF7F lossless  \uDCEF\uDCBF\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 EFBF7F strict    {}                0 {}  {Second trail byte must be 80:BF}

    utf-8 F0 tcl8       \u00F0        -1 {} {Missing trail byte}
    utf-8 F0 replace    \uFFFD        -1 {} {Missing trail byte}
    utf-8 F0 lossless   \uDCF0       -1 {} {Missing trail byte}
    utf-8 F0 strict     {}             0 {} {Missing trail byte}
    utf-8 F080 tcl8     \u00F0\u20AC  -1 {} {First trail byte must be 90:BF}
    utf-8 F080 replace  \uFFFD        -1 {knownW3C} {First trail byte must be 90:BF}
    utf-8 F080 lossless \uDCF0\uDC80 -1 {} {First trail byte must be 90:BF}
    utf-8 F080 strict   {}             0 {} {First trail byte must be 90:BF}
    utf-8 F08F tcl8     \u00F0\u8F    -1 {} {First trail byte must be 90:BF}
    utf-8 F08F replace  \uFFFD        -1 {knownW3C} {First trail byte must be 90:BF}
    utf-8 F08F lossless \uDCF0\uDC8F -1 {} {First trail byte must be 90:BF}
    utf-8 F08F strict   {}             0 {} {First trail byte must be 90:BF}
    utf-8 F0D0 tcl8     \u00F0\u00D0  -1 {} {First trail byte must be 90:BF}
    utf-8 F0D0 replace  \uFFFD\uFFFD  -1 {} {First trail byte must be 90:BF}
    utf-8 F0D0 lossless \uDCF0\uDCD0 -1 {} {First trail byte must be 90:BF}
    utf-8 F0D0 strict   {}             0 {} {First trail byte must be 90:BF}
    utf-8 F090 tcl8      \u00F0\u0090   -1 {} {Missing second trail byte}
    utf-8 F090 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 F090 lossless  \uDCF0\uDC90   -1 {} {Missing second trail byte}
    utf-8 F090 strict    {}              0 {} {Missing second trail byte}
    utf-8 F0BF tcl8      \u00F0\u00BF   -1 {} {Missing second trail byte}
    utf-8 F0BF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 F0BF lossless   \uDCF0\uDCBF  -1 {} {Missing second trail byte}
    utf-8 F0BF strict    {}              0 {} {Missing second trail byte}
    utf-8 F0907F tcl8      \u00F0\u0090\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F0907F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 F0907F lossless   \uDCF0\uDC90\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F0907F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 F0BF7F tcl8      \u00F0\u00BF\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F0BF7F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 F0BF7F lossless  \uDCF0\uDCBF\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F0BF7F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 F090BF tcl8      \u00F0\u0090\u00BF   -1 {} {Missing third trail byte}
    utf-8 F090BF replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}
    utf-8 F090BF lossless  \uDCF0\uDC90\uDCBF  -1 {} {Missing third trail byte}
    utf-8 F090BF strict    {}              0 {} {Missing third trail byte}
    utf-8 F0BF81 tcl8      \u00F0\u00BF\u0081   -1 {} {Missing third trail byte}
    utf-8 F0BF81 replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}
    utf-8 F0BF81 lossless  \uDCF0\uDCBF\uDC81  -1 {} {Missing third trail byte}
    utf-8 F0BF81 strict    {}              0 {} {Missing third trail byte}
    utf-8 F0BF807F tcl8      \u00F0\u00BF\u20AC\x7F   -1 {} {Third trail byte must be 80:BF}
    utf-8 F0BF817F replace   \uFFFD\x7F           -1 {knownW3C} {Third trail byte must be 80:BF}
    utf-8 F0BF817F lossless   \uDCF0\uDCBF\uDC81\x7F -1 {} {Third trail byte must be 80:BF}
    utf-8 F0BF817F strict    {}              0 {} {Third trail byte must be 80:BF}
    utf-8 F090BFD0 tcl8      \u00F0\u0090\u00BF\u00D0   -1 {} {Third trail byte must be 80:BF}
    utf-8 F090BFD0 replace   \uFFFD         -1 {knownW3C} {Third trail byte must be 80:BF}
    utf-8 F090BFD0 lossless   \uDCF0\uDC90\uDCBF\uDCD0 -1 {} {Third trail byte must be 80:BF}
    utf-8 F090BFD0 strict    {}              0 {} {Third trail byte must be 80:BF}

    utf-8 F1 tcl8       \u00F1        -1 {} {Missing trail byte}
    utf-8 F1 replace    \uFFFD        -1 {} {Missing trail byte}
    utf-8 F1 lossless    \uDCF1       -1 {} {Missing trail byte}
    utf-8 F1 strict     {}             0 {} {Missing trail byte}
    utf-8 F17F tcl8     \u00F1\u7F    -1 {} {First trail byte must be 80:BF}
    utf-8 F17F replace  \uFFFD        -1 {knownW3C} {First trail byte must be 80:BF}
    utf-8 F17F lossless  \uDCF1\x7F -1 {} {First trail byte must be 80:BF}
    utf-8 F17F strict   {}             0 {} {First trail byte must be 80:BF}
    utf-8 F1D0 tcl8     \u00F1\u00D0  -1 {} {First trail byte must be 80:BF}
    utf-8 F1D0 replace  \uFFFD\uFFFD  -1 {} {First trail byte must be 80:BF}
    utf-8 F1D0 lossless  \uDCF1\uDCD0 -1 {} {First trail byte must be 80:BF}
    utf-8 F1D0 strict   {}             0 {} {First trail byte must be 80:BF}
    utf-8 F180 tcl8      \u00F1\u20AC   -1 {} {Missing second trail byte}
    utf-8 F180 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 F180 lossless   \uDCF1\uDC80  -1 {} {Missing second trail byte}
    utf-8 F180 strict    {}              0 {} {Missing second trail byte}
    utf-8 F1BF tcl8      \u00F1\u00BF   -1 {} {Missing second trail byte}
    utf-8 F1BF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 F1BF lossless   \uDCF1\uDCBF  -1 {} {Missing second trail byte}
    utf-8 F1BF strict    {}              0 {} {Missing second trail byte}
    utf-8 F1807F tcl8      \u00F1\u20AC\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F1807F replace   \uFFFD\x7F       -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 F1807F lossless   \uDCF1\uDC80\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F1807F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 F1BF7F tcl8      \u00F1\u00BF\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F1BF7F replace   \uFFFD\x7F       -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 F1BF7F lossless   \uDCF1\uDCBF\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F1BF7F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 F180BF tcl8      \u00F1\u20AC\u00BF   -1 {} {Missing third trail byte}
    utf-8 F180BF replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}
    utf-8 F180BF lossless   \uDCF1\uDC80\uDCBF -1 {} {Missing third trail byte}
    utf-8 F180BF strict    {}              0 {} {Missing third trail byte}
    utf-8 F1BF81 tcl8      \u00F1\u00BF\u0081   -1 {} {Missing third trail byte}
    utf-8 F1BF81 replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}
    utf-8 F1BF81 lossless   \uDCF1\uDCBF\uDC81  -1 {} {Missing third trail byte}
    utf-8 F1BF81 strict    {}              0 {} {Missing third trail byte}
    utf-8 F1BF807F tcl8      \u00F1\u00BF\u20AC\x7F  -1 {} {Third trail byte must be 80:BF}
    utf-8 F1BF817F replace   \uFFFD\x7F           -1 {knownW3C} {Third trail byte must be 80:BF}
    utf-8 F1BF817F lossless   \uDCF1\uDCBF\uDC81\x7F -1 {} {Third trail byte must be 80:BF}
    utf-8 F1BF817F strict    {}              0 {} {Third trail byte must be 80:BF}
    utf-8 F180BFD0 tcl8      \u00F1\u20AC\u00BF\u00D0   -1 {} {Third trail byte must be 80:BF}
    utf-8 F180BFD0 replace   \uFFFD         -1 {knownW3C} {Third trail byte must be 80:BF}
    utf-8 F180BFD0 lossless   \uDCF1\uDC80\uDCBF\uDCD0 -1 {} {Third trail byte must be 80:BF}
    utf-8 F180BFD0 strict    {}              0 {} {Third trail byte must be 80:BF}
    utf-8 F3 tcl8       \u00F3        -1 {} {Missing trail byte}
    utf-8 F3 replace    \uFFFD        -1 {} {Missing trail byte}
    utf-8 F3 lossless    \uDCF3       -1 {} {Missing trail byte}
    utf-8 F3 strict     {}             0 {} {Missing trail byte}
    utf-8 F37F tcl8     \u00F3\x7F    -1 {} {First trail byte must be 80:BF}
    utf-8 F37F replace  \uFFFD        -1 {knownW3C} {First trail byte must be 80:BF}
    utf-8 F37F lossless  \uDCF3\x7F   -1 {} {First trail byte must be 80:BF}
    utf-8 F37F strict   {}             0 {} {First trail byte must be 80:BF}
    utf-8 F3D0 tcl8     \u00F3\u00D0  -1 {} {First trail byte must be 80:BF}
    utf-8 F3D0 replace  \uFFFD\uFFFD  -1 {} {First trail byte must be 80:BF}
    utf-8 F3D0 lossless  \uDCF3\uDCD0 -1 {} {First trail byte must be 80:BF}
    utf-8 F3D0 strict   {}             0 {} {First trail byte must be 80:BF}
    utf-8 F380 tcl8      \u00F3\u20AC   -1 {} {Missing second trail byte}
    utf-8 F380 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 F380 lossless   \uDCF3\uDC80  -1 {} {Missing second trail byte}
    utf-8 F380 strict    {}              0 {} {Missing second trail byte}
    utf-8 F3BF tcl8      \u00F3\u00BF   -1 {} {Missing second trail byte}
    utf-8 F3BF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 F3BF lossless   \uDCF3\uDCBF  -1 {} {Missing second trail byte}
    utf-8 F3BF strict    {}              0 {} {Missing second trail byte}
    utf-8 F3807F tcl8     \u00F3\u20AC\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F3807F replace  \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 F3807F lossless  \uDCF3\uDC80\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F3807F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 F3BF7F tcl8      \u00F3\u00BF\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F3BF7F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 F3BF7F lossless   \uDCF3\uDCBF\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F3BF7F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 F380BF tcl8      \u00F3\u20AC\u00BF   -1 {} {Missing third trail byte}
    utf-8 F380BF replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}
    utf-8 F380BF lossless   \uDCF3\uDC80\uDCBF -1 {} {Missing third trail byte}
    utf-8 F380BF strict    {}              0 {} {Missing third trail byte}
    utf-8 F3BF81 tcl8      \u00F3\u00BF\u0081   -1 {} {Missing third trail byte}
    utf-8 F3BF81 replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}
    utf-8 F3BF81 lossless   \uDCF3\uDCBF\uDC81 -1 {} {Missing third trail byte}
    utf-8 F3BF81 strict    {}              0 {} {Missing third trail byte}
    utf-8 F3BF807F tcl8      \u00F3\u00BF\u20AC\x7F   -1 {} {Third trail byte must be 80:BF}
    utf-8 F3BF817F replace   \uFFFD\x7F           -1 {knownW3C} {Third trail byte must be 80:BF}
    utf-8 F3BF817F lossless   \uDCF3\uDCBF\uDC81\x7F -1 {} {Third trail byte must be 80:BF}
    utf-8 F3BF817F strict    {}              0 {} {Third trail byte must be 80:BF}
    utf-8 F380BFD0 tcl8      \u00F3\u20AC\u00BF\u00D0   -1 {} {Third trail byte must be 80:BF}
    utf-8 F380BFD0 replace   \uFFFD         -1 {knownW3C} {Third trail byte must be 80:BF}
    utf-8 F380BFD0 lossless   \uDCF3\uDC80\uDCBF\uDCD0 -1 {} {Third trail byte must be 80:BF}
    utf-8 F380BFD0 strict    {}              0 {} {Third trail byte must be 80:BF}

    utf-8 F4 tcl8       \u00F4        -1 {} {Missing trail byte}
    utf-8 F4 replace    \uFFFD        -1 {} {Missing trail byte}
    utf-8 F4 lossless    \uDCF4       -1 {} {Missing trail byte}
    utf-8 F4 strict     {}             0 {} {Missing trail byte}
    utf-8 F47F tcl8     \u00F4\u7F    -1 {} {First trail byte must be 80:8F}
    utf-8 F47F replace  \uFFFD\u7F    -1 {knownW3C} {First trail byte must be 80:8F}
    utf-8 F47F lossless  \uDCF4\x7F -1 {} {First trail byte must be 80:8F}
    utf-8 F47F strict   {}             0 {} {First trail byte must be 80:8F}
    utf-8 F490 tcl8     \u00F4\u0090  -1 {} {First trail byte must be 80:8F}
    utf-8 F490 replace  \uFFFD\uFFFD  -1 {} {First trail byte must be 80:8F}
    utf-8 F490 lossless  \uDCF4\uDC90 -1 {} {First trail byte must be 80:8F}
    utf-8 F490 strict   {}             0 {} {First trail byte must be 80:8F}
    utf-8 F480 tcl8      \u00F4\u20AC   -1 {} {Missing second trail byte}
    utf-8 F480 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 F480 lossless   \uDCF4\uDC80  -1 {} {Missing second trail byte}
    utf-8 F480 strict    {}              0 {} {Missing second trail byte}
    utf-8 F48F tcl8      \u00F4\u008F   -1 {} {Missing second trail byte}
    utf-8 F48F replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 F48F lossless   \uDCF4\uDC8F  -1 {} {Missing second trail byte}
    utf-8 F48F strict    {}              0 {} {Missing second trail byte}
    utf-8 F4807F tcl8      \u00F4\u20AC\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F4807F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 F4807F lossless   \uDCF4\uDC80\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F4807F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 F48F7F tcl8      \u00F4\u008F\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F48F7F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 F48F7F lossless   \uDCF4\uDC8F\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F48F7F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 F48081 tcl8      \u00F4\u20AC\u0081   -1 {} {Missing third trail byte}
    utf-8 F48081 replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}
    utf-8 F48081 lossless   \uDCF4\uDC80\uDC81  -1 {} {Missing third trail byte}
    utf-8 F48081 strict    {}              0 {} {Missing third trail byte}
    utf-8 F48F81 tcl8      \u00F4\u008F\u0081   -1 {} {Missing third trail byte}
    utf-8 F48F81 replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}
    utf-8 F48F81 lossless   \uDCF4\uDC8F\uDC81  -1 {} {Missing third trail byte}
    utf-8 F48F81 strict    {}              0 {} {Missing third trail byte}
    utf-8 F481817F tcl8      \u00F4\u0081\u0081\x7F   -1 {} {Third trail byte must be 80:BF}
    utf-8 F480817F replace   \uFFFD\x7F           -1 {knownW3C} {Third trail byte must be 80:BF}
    utf-8 F480817F lossless   \uDCF4\uDC80\uDC81\x7F -1 {} {Third trail byte must be 80:BF}
    utf-8 F480817F strict    {}              0 {} {Third trail byte must be 80:BF}
    utf-8 F48FBFD0 tcl8      \u00F4\u008F\u00BF\u00D0   -1 {} {Third trail byte must be 80:BF}
    utf-8 F48FBFD0 replace   \uFFFD         -1 {knownW3C} {Third trail byte must be 80:BF}
    utf-8 F48FBFD0 lossless   \uDCF4\uDC8F\uDCBF\uDCD0 -1 {} {Third trail byte must be 80:BF}
    utf-8 F48FBFD0 strict    {}              0 {} {Third trail byte must be 80:BF}

    utf-8 F5 tcl8    \u00F5 -1 {} {F5:FF are invalid everywhere}
    utf-8 F5 replace \uFFFD -1 {} {F5:FF are invalid everywhere}
    utf-8 F5 lossless \uDCF5 -1 {} {F5:FF are invalid everywhere}
    utf-8 F5 strict  {}      0 {} {F5:FF are invalid everywhere}
    utf-8 FF tcl8    \u00FF -1 {} {F5:FF are invalid everywhere}
    utf-8 FF replace \uFFFD -1 {} {F5:FF are invalid everywhere}
    utf-8 FF lossless \uDCFF -1 {} {F5:FF are invalid everywhere}
    utf-8 FF strict  {}      0 {} {F5:FF are invalid everywhere}

    utf-8 C0AFE080BFF0818130 replace \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x30 -1 {} {Unicode Table 3-8}
    utf-8 C0AFE080BFF0818130 lossless \uDCC0\uDCAF\uDCE0\uDC80\uDCBF\uDCF0\uDC81\uDC81\x30 -1 {} {Unicode Table 3-8}
    utf-8 EDA080EDBFBFEDAF30 replace \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x30 -1 {knownW3C} {Unicode Table 3-9}
    utf-8 EDA080EDBFBFEDAF30 lossless \uD800\uDFFF\uDCED\uDCAF0 -1 {} {Unicode Table 3-9 - TODO assumes surrogates permitted in utf-8 lossless}
    utf-8 F4919293FF4180BF30 replace \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u0041\uFFFD\uFFFD\x30 -1 {} {Unicode Table 3-10}
    utf-8 F4919293FF4180BF30 lossless \uDCF4\uDC91\uDC92\uDC93\uDCFF\u0041\uDC80\uDCBF\x30 -1 {} {Unicode Table 3-10}
    utf-8 E180E2F09192F1BF30 replace \uFFFD\uFFFD\uFFFD\uFFFD\x30                         -1 {knownW3C} {Unicode Table 3.11}
    utf-8 E180E2F09192F1BF30 lossless \uDCE1\uDC80\uDCE2\uDCF0\uDC91\uDC92\uDCF1\uDCBF\x30 -1 {} {Unicode Table 3.11}
}

# utf16-le and utf16-be test cases. Note utf16 cases are automatically generated
# based on these depending on platform endianness. Note truncated tests can only
# happen when the sequence is at the end (including by itself) Thus {solo tail}
# in some cases.
lappend encInvalidBytes {*}{
    utf-16le 41      tcl8      \uFFFD -1 {solo tail} {Truncated}
    utf-16le 41      replace   \uFFFD -1 {solo tail} {Truncated}
    utf-16le 41      strict    {}      0 {solo tail} {Truncated}
    utf-16le 41      lossless  \uFFFD -1 {solo tail} {Truncated - byte < 0x80}
    utf-16le 80      lossless  \uFFFD -1 {solo tail} {Truncated - byte >= 0x80}
    utf-16le 00D8    tcl8      \uD800 -1 {} {Missing low surrogate}
    utf-16le 00D8    replace   \uFFFD -1 {} {Missing low surrogate}
    utf-16le 00D8    lossless  \uFFFD -1 {} {Missing low surrogate}
    utf-16le 00D8    strict    {}      0 {knownBug} {Missing low surrogate}
    utf-16le 00DC    tcl8      \uDC00 -1 {} {Missing high surrogate}
    utf-16le 00DC    replace   \uFFFD -1 {} {Missing high surrogate}
    utf-16le 00DC    lossless  \uFFFD -1 {} {Missing high surrogate}
    utf-16le 00DC    strict    {}      0 {knownBug} {Missing high surrogate}

    utf-16be 41      tcl8      \uFFFD -1 {solo tail} {Truncated}
    utf-16be 41      replace   \uFFFD -1 {solo tail} {Truncated}
    utf-16be 41      strict    {}      0 {solo tail} {Truncated}
    utf-16be 41      lossless  \uFFFD -1 {solo tail} {Truncated - byte < 0x80}
    utf-16be 80      lossless  \uFFFD -1 {solo tail} {Truncated - byte >= 0x80}
    utf-16be D800    tcl8      \uD800 -1 {} {Missing low surrogate}
    utf-16be D800    replace   \uFFFD -1 {knownBug} {Missing low surrogate}
    utf-16be D800    lossless  \uFFFD -1 {knownBug} {Missing low surrogate}
    utf-16be D800    strict    {}      0 {knownBug} {Missing low surrogate}
    utf-16be DC00    tcl8      \uDC00 -1 {} {Missing high surrogate}
    utf-16be DC00    replace   \uFFFD -1 {knownBug} {Missing high surrogate}
    utf-16be DC00    lossless  \uFFFD -1 {knownBug} {Missing high surrogate}
    utf-16be DC00    strict    {}      0 {knownBug} {Missing high surrogate}
}

# utf32-le and utf32-be test cases. Note utf32 cases are automatically generated
# based on these depending on platform endianness. Note truncated tests can only
# happen when the sequence is at the end (including by itself) Thus {solo tail}
# in some cases.
lappend encInvalidBytes {*}{
    utf-32le 41      tcl8      \uFFFD  -1 {solo tail} {Truncated}
    utf-32le 41      replace   \uFFFD  -1 {solo} {Truncated}
    utf-32le 41      lossless  \uFFFD  -1 {solo} {Truncated}
    utf-32le 41      strict    {}   0 {solo tail} {Truncated}
    utf-32le 4100    tcl8      \uFFFD  -1 {solo tail} {Truncated}
    utf-32le 4100    replace   \uFFFD  -1 {solo} {Truncated}
    utf-32le 4100    lossless  \uFFFD  -1 {solo} {Truncated}
    utf-32le 4100    strict    {}   0 {solo tail} {Truncated}
    utf-32le 410000  tcl8      \uFFFD  -1 {solo tail} {Truncated}
    utf-32le 410000  replace   \uFFFD  -1 {solo} {Truncated}
    utf-32le 410000  lossless  \uFFFD  -1 {solo} {Truncated}
    utf-32le 410000  strict    {}       0 {solo tail} {Truncated}
    utf-32le 00D80000 tcl8     \uD800   -1 {} {High-surrogate}
    utf-32le 00D80000 replace  \uFFFD   -1 {} {High-surrogate}
    utf-32le 00D80000 lossless \uFFFD   -1 {} {High-surrogate}
    utf-32le 00D80000 strict   {}        0 {} {High-surrogate}
    utf-32le 00DC0000 tcl8     \uDC00   -1 {} {Low-surrogate}
    utf-32le 00DC0000 replace  \uFFFD   -1 {} {Low-surrogate}
    utf-32le 00DC0000 lossless \uFFFD   -1 {} {Low-surrogate}
    utf-32le 00DC0000 strict   {}        0 {} {Low-surrogate}
    utf-32le 00D8000000DC0000 tcl8 \uD800\uDC00    -1 {} {High-low-surrogate-pair}
    utf-32le 00D8000000DC0000 replace  \uFFFD\uFFFD -1 {} {High-low-surrogate-pair}
    utf-32le 00D8000000DC0000 lossless \uFFFD\uFFFD -1 {} {High-low-surrogate-pair}
    utf-32le 00D8000000DC0000 strict  {}            0 {} {High-low-surrogate-pair}
    utf-32le 00001100 tcl8 \uFFFD    -1 {} {Out of range}
    utf-32le 00001100 replace  \uFFFD -1 {} {Out of range}
    utf-32le 00001100 lossless \uFFFD -1 {} {Out of range}
    utf-32le 00001100 strict {}       0 {} {Out of range}
    utf-32le FFFFFFFF tcl8 \uFFFD    -1 {} {Out of range}
    utf-32le FFFFFFFF replace  \uFFFD -1 {} {Out of range}
    utf-32le FFFFFFFF lossless \uFFFD -1 {} {Out of range}
    utf-32le FFFFFFFF strict {}       0 {} {Out of range}

    utf-32be 41      tcl8      \uFFFD  -1 {solo tail} {Truncated}
    utf-32be 41      replace   \uFFFD  -1 {solo tail} {Truncated}
    utf-32be 41      lossless  \uFFFD  -1 {solo tail} {Truncated}
    utf-32be 41      strict    {}       0 {solo tail} {Truncated}
    utf-32be 0041    tcl8      \uFFFD  -1 {solo tail} {Truncated}
    utf-32be 0041    replace   \uFFFD  -1 {solo} {Truncated}
    utf-32be 0041    lossless  \uFFFD  -1 {solo} {Truncated}
    utf-32be 0041    strict    {}   0 {solo tail} {Truncated}
    utf-32be 000041  tcl8      \uFFFD  -1 {solo tail} {Truncated}
    utf-32be 000041  replace   \uFFFD  -1 {solo} {Truncated}
    utf-32be 000041  lossless  \uFFFD  -1 {solo} {Truncated}
    utf-32be 000041  strict    {}       0 {solo tail} {Truncated}
    utf-32be 0000D800 tcl8     \uD800   -1 {} {High-surrogate}
    utf-32be 0000D800 replace  \uFFFD   -1 {} {High-surrogate}
    utf-32be 0000D800 lossless \uFFFD   -1 {} {High-surrogate}
    utf-32be 0000D800 strict   {}        0 {} {High-surrogate}
    utf-32be 0000DC00 tcl8     \uDC00   -1 {} {Low-surrogate}
    utf-32be 0000DC00 replace  \uFFFD   -1 {} {Low-surrogate}
    utf-32be 0000DC00 lossless \uFFFD   -1 {} {Low-surrogate}
    utf-32be 0000DC00 strict   {}        0 {} {Low-surrogate}
    utf-32be 0000D8000000DC00 tcl8 \uD800\uDC00    -1 {} {High-low-surrogate-pair}
    utf-32be 0000D8000000DC00 replace  \uFFFD\uFFFD -1 {} {High-low-surrogate-pair}
    utf-32be 0000D8000000DC00 lossless \uFFFD\uFFFD -1 {} {High-low-surrogate-pair}
    utf-32be 0000D8000000DC00 strict  {}            0 {} {High-low-surrogate-pair}
    utf-32be 00110000 tcl8 \uFFFD    -1 {} {Out of range}
    utf-32be 00110000 replace  \uFFFD -1 {} {Out of range}
    utf-32be 00110000 lossless \uFFFD -1 {} {Out of range}
    utf-32be 00110000 strict {}       0 {} {Out of range}
    utf-32be FFFFFFFF tcl8 \uFFFD    -1 {} {Out of range}
    utf-32be FFFFFFFF replace  \uFFFD -1 {} {Out of range}
    utf-32be FFFFFFFF lossless \uFFFD -1 {} {Out of range}
    utf-32be FFFFFFFF strict {}       0 {} {Out of range}
}

# escape tables - TODO
# This tests the EscapeToUtf code path.
lappend encInvalidBytes {*}{
}

# Strings that cannot be encoded for specific encoding / profiles
# <ENCODING STRING PROFILE EXPECTEDRESULT EXPECTEDFAILINDEX CTRL COMMENT>
# <ENCODING,STRING,PROFILE> should be unique for test ids to be unique.
# See earlier comments about CTRL field.
#


# TODO - out of range code point (note cannot be generated by \U notation)
lappend encUnencodableStrings {*}{
    ascii \u00e0 tcl8    3f -1 {} {unencodable}
    ascii \u00e0 strict  {}  0 {} {unencodable}

    iso8859-1 \u0141 tcl8    3f -1 {} unencodable
    iso8859-1 \u0141 strict  {}  0 {} unencodable

    utf-8 \uD800 tcl8    eda080 -1 {} High-surrogate
    utf-8 \uD800 strict  {}      0 {} High-surrogate
    utf-8 \uDC00 tcl8    edb080 -1 {} High-surrogate
    utf-8 \uDC00 strict  {}      0 {} High-surrogate
}


# The icuUcmTests.tcl is generated by the tools/ucm2tests.tcl script
# and generates test vectors for the above tables for various encodings
# based on ICU UCM files.
# TODO - commented out for now as generating a lot of mismatches, mainly
# due to Tcl using ? for replacement char and ICU often using ^Z.
# source [file join [file dirname [info script]] icuUcmTests.tcl]

Changes to tests/env.test.

14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
if {"::tcltest" ni [namespace children]} {
    package require tcltest 2.5
    namespace import -force ::tcltest::*
}

source [file join [file dirname [info script]] tcltests.tcl]

testConstraint utf8system [string equal [encoding system] utf-8]
if {[llength [auto_execok bash]]} {
    testConstraint haveBash 1
}

# [exec] is required here to see the actual environment received by child
# processes.
proc getenv {} {







<







14
15
16
17
18
19
20

21
22
23
24
25
26
27
if {"::tcltest" ni [namespace children]} {
    package require tcltest 2.5
    namespace import -force ::tcltest::*
}

source [file join [file dirname [info script]] tcltests.tcl]


if {[llength [auto_execok bash]]} {
    testConstraint haveBash 1
}

# [exec] is required here to see the actual environment received by child
# processes.
proc getenv {} {
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527




528
529
530
531










































532
533
534
535
536
537
538
    set result [gets $pipe]
    close $pipe
    if {$result ne $::env(USERPROFILE)} {
	list ERROR $result ne $::env(USERPROFILE)
    }
} -result {}

test env-10.0 {
    Unequal environment strings test should test unequal
} -constraints {unix haveBash utf8system knownBug} -setup {
    set tclScript [makeFile {
        puts [string equal $env(XX) $env(YY)]
    } tclScript]
    set shellCode {
        export XX=$'\351'
        export YY=$'\303\251'
    }
    append shellCode "[info nameofexecutable] $tclScript\n"
    set shScript [makeFile $shellCode shScript]




} -body {
    exec {*}[auto_execok bash] $shScript
} -result 0













































# cleanup
rename getenv {}
rename envrestore {}
rename envprep {}
rename encodingrestore {}







|
|
|









>
>
>
>




>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
    set result [gets $pipe]
    close $pipe
    if {$result ne $::env(USERPROFILE)} {
	list ERROR $result ne $::env(USERPROFILE)
    }
} -result {}

test env-10.1 {
    Unequal environment strings test should test unequal (failed pre-TIP 671)
} -constraints {unix haveBash} -setup {
    set tclScript [makeFile {
        puts [string equal $env(XX) $env(YY)]
    } tclScript]
    set shellCode {
        export XX=$'\351'
        export YY=$'\303\251'
    }
    append shellCode "[info nameofexecutable] $tclScript\n"
    set shScript [makeFile $shellCode shScript]
    set oldEnc [encoding system]
    encoding system utf-8
} -cleanup {
    encoding system $oldEnc
} -body {
    exec {*}[auto_execok bash] $shScript
} -result 0

test env-10.2 {
    Read invalidly encoded bytes in environment value - TIP 671
} -constraints {unix haveBash} -setup {
    set tclScript [makeFile {
	puts [format {%04x} [scan $env(XX) %c]]
    } tclScript]
    # Note following requires bash, not sh! Dunno the equivalent in sh
    set shellCode {
        export XX=$'\xe9'
    }
    append shellCode "[info nameofexecutable] $tclScript\n"
    set shScript [makeFile $shellCode shScript]
    set oldEnc [encoding system]
    encoding system utf-8
} -cleanup {
    encoding system $oldEnc
} -body {
    exec {*}[auto_execok bash] $shScript
} -result dce9

test env-10.3 {
    Write invalidly encoded bytes to environment value - TIP 671
} -constraints {unix haveBash} -setup {
    set tclScript [makeFile {
        set env(YY) A$env(XX)B
        set line [lindex [split [exec {*}[auto_execok bash] -c {echo $YY | od -t x1}] \n] 0]
        puts [lrange $line 1 3]
    } tclScript]
    # Note following requires bash, not sh! Dunno the equivalent in sh
    set shellCode {
        export XX=$'\xe9'
    }
    append shellCode "[info nameofexecutable] $tclScript\n"
    set shScript [makeFile $shellCode shScript]
    set oldEnc [encoding system]
    encoding system utf-8
} -cleanup {
    encoding system $oldEnc
} -body {
    exec {*}[auto_execok bash] $shScript
} -result {41 e9 42}



# cleanup
rename getenv {}
rename envrestore {}
rename envprep {}
rename encodingrestore {}

Changes to tests/ioCmd.test.

367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
    fconfigure $console -blah blih
} -returnCodes error -result [expectedOpts "-blah" {-inputmode}]
# TODO: Test parsing of serial channel options (nonPortable, since requires an
# open channel to work with).

test iocmd-8.23 {fconfigure -profile badprofile} -body {
    fconfigure stdin -profile froboz
} -returnCodes error -result {bad profile name "froboz": must be replace, strict, or tcl8}

test iocmd-9.1 {eof command} {
    list [catch {eof} msg] $msg $::errorCode
} {1 {wrong # args: should be "eof channelId"} {TCL WRONGARGS}}
test iocmd-9.2 {eof command} {
    list [catch {eof a b} msg] $msg $::errorCode
} {1 {wrong # args: should be "eof channelId"} {TCL WRONGARGS}}







|







367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
    fconfigure $console -blah blih
} -returnCodes error -result [expectedOpts "-blah" {-inputmode}]
# TODO: Test parsing of serial channel options (nonPortable, since requires an
# open channel to work with).

test iocmd-8.23 {fconfigure -profile badprofile} -body {
    fconfigure stdin -profile froboz
} -returnCodes error -result {bad profile name "froboz": must be lossless, replace, strict, or tcl8}

test iocmd-9.1 {eof command} {
    list [catch {eof} msg] $msg $::errorCode
} {1 {wrong # args: should be "eof channelId"} {TCL WRONGARGS}}
test iocmd-9.2 {eof command} {
    list [catch {eof a b} msg] $msg $::errorCode
} {1 {wrong # args: should be "eof channelId"} {TCL WRONGARGS}}

Changes to tests/main.test.

1
2
3
4
5
6
7
8
9
10
11
12




13
14
15
16
17
18
19
# This file contains a collection of tests for generic/tclMain.c.

if {"::tcltest" ni [namespace children]} {
    package require tcltest 2.5
    namespace import -force ::tcltest::*
}

namespace eval ::tcl::test::main {
    namespace import ::tcltest::*

    # Is [exec] defined?
    testConstraint exec [llength [info commands exec]]





    # Is the tcl::test package loaded?
    testConstraint tcl::test [expr {
	[llength [package provide tcl::test]]
	&& [package vsatisfies [package provide tcl::test] 8.5-]}]

    # Procedure to simulate interactive typing of commands, line by line












>
>
>
>







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# This file contains a collection of tests for generic/tclMain.c.

if {"::tcltest" ni [namespace children]} {
    package require tcltest 2.5
    namespace import -force ::tcltest::*
}

namespace eval ::tcl::test::main {
    namespace import ::tcltest::*

    # Is [exec] defined?
    testConstraint exec [llength [info commands exec]]

    if {[llength [auto_execok bash]]} {
        testConstraint haveBash 1
    }

    # Is the tcl::test package loaded?
    testConstraint tcl::test [expr {
	[llength [package provide tcl::test]]
	&& [package vsatisfies [package provide tcl::test] 8.5-]}]

    # Procedure to simulate interactive typing of commands, line by line
1279
1280
1281
1282
1283
1284
1285



















1286
1287
1288
1289
1290
1291
1292
1293
		set tcl_interactive 1} >& result
	set f [open result]
	read $f
    } -cleanup {
	close $f
	file delete result
    } -result "1\nfoo\n"




















    cd [workingDirectory]

    cleanupTests
}

namespace delete ::tcl::test::main
return







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>








1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
		set tcl_interactive 1} >& result
	set f [open result]
	read $f
    } -cleanup {
	close $f
	file delete result
    } -result "1\nfoo\n"

    test Tcl_Main-10.1 {
        Invalidly encoded bytes in arguments - TIP 671
    } -constraints {unix haveBash} -setup {
        set tclScript [makeFile {
            lassign $::argv a b
            # a should be \udce9 (lossless map ofe9) b should be \ue9
            puts [list [format %x [scan $a %c]] [format %x [scan $b %c]]]
        } tclScript]
        # Note following requires bash, not sh! Dunno the equivalent in sh
        set shellCode [string cat "[info nameofexecutable] $tclScript" { $'\351'} { $'\303\251'} \n]
        set shScript [makeFile $shellCode shScript]
        set oldEnc [encoding system]
        encoding system utf-8
    } -cleanup {
        encoding system $oldEnc
    } -body {
        exec {*}[auto_execok bash] $shScript
    } -result {dce9 e9}

    cd [workingDirectory]

    cleanupTests
}

namespace delete ::tcl::test::main
return

Changes to unix/tclUnixChan.c.

1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
    if (len==0 || (len>1 && strncmp(optionName, "-xchar", len)==0)) {
	Tcl_DString ds;

	valid = 1;
	tcgetattr(fsPtr->fileState.fd, &iostate);
	Tcl_DStringInit(&ds);

	Tcl_ExternalToUtfDStringEx(NULL, NULL, (char *) &iostate.c_cc[VSTART], 1, TCL_ENCODING_PROFILE_TCL8, &ds, NULL);
	Tcl_DStringAppendElement(dsPtr, Tcl_DStringValue(&ds));
	TclDStringClear(&ds);

	Tcl_ExternalToUtfDStringEx(NULL, NULL, (char *) &iostate.c_cc[VSTOP], 1, TCL_ENCODING_PROFILE_TCL8, &ds, NULL);
	Tcl_DStringAppendElement(dsPtr, Tcl_DStringValue(&ds));
	Tcl_DStringFree(&ds);
    }
    if (len == 0) {
	Tcl_DStringEndSublist(dsPtr);
    }








|



|







1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
    if (len==0 || (len>1 && strncmp(optionName, "-xchar", len)==0)) {
	Tcl_DString ds;

	valid = 1;
	tcgetattr(fsPtr->fileState.fd, &iostate);
	Tcl_DStringInit(&ds);

	TclSystemToInternalEncoding(NULL, (char *) &iostate.c_cc[VSTART], 1, &ds);
	Tcl_DStringAppendElement(dsPtr, Tcl_DStringValue(&ds));
	TclDStringClear(&ds);

	TclSystemToInternalEncoding(NULL, (char *) &iostate.c_cc[VSTOP], 1, &ds);
	Tcl_DStringAppendElement(dsPtr, Tcl_DStringValue(&ds));
	Tcl_DStringFree(&ds);
    }
    if (len == 0) {
	Tcl_DStringEndSublist(dsPtr);
    }

Changes to unix/tclUnixFile.c.

151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#ifdef DJGPP
    if (name[1] == ':')
#else
    if (name[0] == '/')
#endif
    {
	encoding = Tcl_GetEncoding(NULL, NULL);
	Tcl_ExternalToUtfDStringEx(NULL, encoding, name, TCL_INDEX_NONE, TCL_ENCODING_PROFILE_TCL8, &utfName, NULL);
	TclSetObjNameOfExecutable(
		Tcl_NewStringObj(Tcl_DStringValue(&utfName), TCL_INDEX_NONE), encoding);
	Tcl_DStringFree(&utfName);
	goto done;
    }

    if (TclpGetCwd(NULL, &cwd) == NULL) {







|







151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#ifdef DJGPP
    if (name[1] == ':')
#else
    if (name[0] == '/')
#endif
    {
	encoding = Tcl_GetEncoding(NULL, NULL);
	Tcl_ExternalToUtfDStringEx(NULL, encoding, name, TCL_INDEX_NONE, TCL_ENCODING_PROFILE_LOSSLESS, &utfName, NULL);
	TclSetObjNameOfExecutable(
		Tcl_NewStringObj(Tcl_DStringValue(&utfName), TCL_INDEX_NONE), encoding);
	Tcl_DStringFree(&utfName);
	goto done;
    }

    if (TclpGetCwd(NULL, &cwd) == NULL) {
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
    }

    Tcl_DStringInit(&nameString);
    Tcl_DStringAppend(&nameString, name, TCL_INDEX_NONE);

    Tcl_DStringFree(&buffer);
    Tcl_UtfToExternalDStringEx(NULL, NULL, Tcl_DStringValue(&cwd),
	    Tcl_DStringLength(&cwd), TCL_ENCODING_PROFILE_TCL8, &buffer, NULL);
    if (Tcl_DStringValue(&cwd)[Tcl_DStringLength(&cwd) -1] != '/') {
	TclDStringAppendLiteral(&buffer, "/");
    }
    Tcl_DStringFree(&cwd);
    TclDStringAppendDString(&buffer, &nameString);
    Tcl_DStringFree(&nameString);

    encoding = Tcl_GetEncoding(NULL, NULL);
    Tcl_ExternalToUtfDStringEx(NULL, encoding, Tcl_DStringValue(&buffer), TCL_INDEX_NONE,
	    TCL_ENCODING_PROFILE_TCL8, &utfName, NULL);
    TclSetObjNameOfExecutable(
	    Tcl_NewStringObj(Tcl_DStringValue(&utfName), TCL_INDEX_NONE), encoding);
    Tcl_DStringFree(&utfName);

  done:
    Tcl_DStringFree(&buffer);
}







|









|







179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
    }

    Tcl_DStringInit(&nameString);
    Tcl_DStringAppend(&nameString, name, TCL_INDEX_NONE);

    Tcl_DStringFree(&buffer);
    Tcl_UtfToExternalDStringEx(NULL, NULL, Tcl_DStringValue(&cwd),
	    Tcl_DStringLength(&cwd), TCL_ENCODING_PROFILE_LOSSLESS, &buffer, NULL);
    if (Tcl_DStringValue(&cwd)[Tcl_DStringLength(&cwd) -1] != '/') {
	TclDStringAppendLiteral(&buffer, "/");
    }
    Tcl_DStringFree(&cwd);
    TclDStringAppendDString(&buffer, &nameString);
    Tcl_DStringFree(&nameString);

    encoding = Tcl_GetEncoding(NULL, NULL);
    Tcl_ExternalToUtfDStringEx(NULL, encoding, Tcl_DStringValue(&buffer), TCL_INDEX_NONE,
	    TCL_ENCODING_PROFILE_LOSSLESS, &utfName, NULL);
    TclSetObjNameOfExecutable(
	    Tcl_NewStringObj(Tcl_DStringValue(&utfName), TCL_INDEX_NONE), encoding);
    Tcl_DStringFree(&utfName);

  done:
    Tcl_DStringFree(&buffer);
}
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
	    }
	}

	/*
	 * Now open the directory for reading and iterate over the contents.
	 */

	if (Tcl_UtfToExternalDStringEx(interp, NULL, dirName, TCL_INDEX_NONE, 0, &ds, NULL) != TCL_OK) {
	    Tcl_DStringFree(&dsOrig);
	    Tcl_DStringFree(&ds);
	    Tcl_DecrRefCount(fileNamePtr);
	    return TCL_ERROR;
	}
	native = Tcl_DStringValue(&ds);








|







304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
	    }
	}

	/*
	 * Now open the directory for reading and iterate over the contents.
	 */

	if (Tcl_UtfToExternalDStringEx(interp, NULL, dirName, TCL_INDEX_NONE, TCL_ENCODING_PROFILE_LOSSLESS, &ds, NULL) != TCL_OK) {
	    Tcl_DStringFree(&dsOrig);
	    Tcl_DStringFree(&ds);
	    Tcl_DecrRefCount(fileNamePtr);
	    return TCL_ERROR;
	}
	native = Tcl_DStringValue(&ds);

375
376
377
378
379
380
381
382
383
384
385
386
387
388
389

	    /*
	     * Now check to see if the file matches, according to both type
	     * and pattern. If so, add the file to the result.
	     */

	    if (Tcl_ExternalToUtfDStringEx(interp, NULL, entryPtr->d_name, TCL_INDEX_NONE,
		    0, &utfDs, NULL) != TCL_OK) {
		matchResult = -1;
		break;
	    }
	    utfname = Tcl_DStringValue(&utfDs);
	    if (Tcl_StringCaseMatch(utfname, pattern, 0)) {
		int typeOk = 1;








|







375
376
377
378
379
380
381
382
383
384
385
386
387
388
389

	    /*
	     * Now check to see if the file matches, according to both type
	     * and pattern. If so, add the file to the result.
	     */

	    if (Tcl_ExternalToUtfDStringEx(interp, NULL, entryPtr->d_name, TCL_INDEX_NONE,
		    TCL_ENCODING_PROFILE_LOSSLESS, &utfDs, NULL) != TCL_OK) {
		matchResult = -1;
		break;
	    }
	    utfname = Tcl_DStringValue(&utfDs);
	    if (Tcl_StringCaseMatch(utfname, pattern, 0)) {
		int typeOk = 1;

607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
    Tcl_DString *bufferPtr)	/* Uninitialized or free DString filled with
				 * name of user's home directory. */
{
    struct passwd *pwPtr;
    Tcl_DString ds;
    const char *native;

    if (Tcl_UtfToExternalDStringEx(NULL, NULL, name, TCL_INDEX_NONE, 0, &ds, NULL) != TCL_OK) {
	Tcl_DStringFree(&ds);
	return NULL;
    }
    native = Tcl_DStringValue(&ds);

    pwPtr = TclpGetPwNam(native);			/* INTL: Native. */
    Tcl_DStringFree(&ds);

    if (pwPtr == NULL) {
	return NULL;
    }
    if (Tcl_ExternalToUtfDStringEx(NULL, NULL, pwPtr->pw_dir, TCL_INDEX_NONE, 0, bufferPtr, NULL) != TCL_OK) {
	return NULL;
    } else {
	return Tcl_DStringValue(bufferPtr);
    }
}

/*







|











|







607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
    Tcl_DString *bufferPtr)	/* Uninitialized or free DString filled with
				 * name of user's home directory. */
{
    struct passwd *pwPtr;
    Tcl_DString ds;
    const char *native;

    if (Tcl_UtfToExternalDStringEx(NULL, NULL, name, TCL_INDEX_NONE, TCL_ENCODING_PROFILE_LOSSLESS, &ds, NULL) != TCL_OK) {
	Tcl_DStringFree(&ds);
	return NULL;
    }
    native = Tcl_DStringValue(&ds);

    pwPtr = TclpGetPwNam(native);			/* INTL: Native. */
    Tcl_DStringFree(&ds);

    if (pwPtr == NULL) {
	return NULL;
    }
    if (Tcl_ExternalToUtfDStringEx(NULL, NULL, pwPtr->pw_dir, TCL_INDEX_NONE, TCL_ENCODING_PROFILE_LOSSLESS, bufferPtr, NULL) != TCL_OK) {
	return NULL;
    } else {
	return Tcl_DStringValue(bufferPtr);
    }
}

/*
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
	if (interp != NULL) {
	    Tcl_SetObjResult(interp, Tcl_ObjPrintf(
		    "error getting working directory name: %s",
		    Tcl_PosixError(interp)));
	}
	return NULL;
    }
    if (Tcl_ExternalToUtfDStringEx(interp, NULL, buffer, TCL_INDEX_NONE, 0, bufferPtr, NULL) != TCL_OK) {
	return NULL;
    }
    return Tcl_DStringValue(bufferPtr);
}

/*
 *---------------------------------------------------------------------------







|







801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
	if (interp != NULL) {
	    Tcl_SetObjResult(interp, Tcl_ObjPrintf(
		    "error getting working directory name: %s",
		    Tcl_PosixError(interp)));
	}
	return NULL;
    }
    if (Tcl_ExternalToUtfDStringEx(interp, NULL, buffer, TCL_INDEX_NONE, TCL_ENCODING_PROFILE_LOSSLESS, bufferPtr, NULL) != TCL_OK) {
	return NULL;
    }
    return Tcl_DStringValue(bufferPtr);
}

/*
 *---------------------------------------------------------------------------
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
{
#ifndef DJGPP
    char link[MAXPATHLEN];
    Tcl_Size length;
    const char *native;
    Tcl_DString ds;

    if (Tcl_UtfToExternalDStringEx(NULL, NULL, path, TCL_INDEX_NONE, 0, &ds, NULL) != TCL_OK) {
	Tcl_DStringFree(&ds);
	return NULL;
    }
    native = Tcl_DStringValue(&ds);
    length = readlink(native, link, sizeof(link));	/* INTL: Native. */
    Tcl_DStringFree(&ds);

    if (length < 0) {
	return NULL;
    }

    if (Tcl_ExternalToUtfDStringEx(NULL, NULL, link, length, 0, linkPtr, NULL) == TCL_OK) {
	return Tcl_DStringValue(linkPtr);
    }
#endif /* !DJGPP */

    return NULL;
}








|











|







839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
{
#ifndef DJGPP
    char link[MAXPATHLEN];
    Tcl_Size length;
    const char *native;
    Tcl_DString ds;

    if (Tcl_UtfToExternalDStringEx(NULL, NULL, path, TCL_INDEX_NONE, TCL_ENCODING_PROFILE_LOSSLESS, &ds, NULL) != TCL_OK) {
	Tcl_DStringFree(&ds);
	return NULL;
    }
    native = Tcl_DStringValue(&ds);
    length = readlink(native, link, sizeof(link));	/* INTL: Native. */
    Tcl_DStringFree(&ds);

    if (length < 0) {
	return NULL;
    }

    if (Tcl_ExternalToUtfDStringEx(NULL, NULL, link, length, TCL_ENCODING_PROFILE_LOSSLESS, linkPtr, NULL) == TCL_OK) {
	return Tcl_DStringValue(linkPtr);
    }
#endif /* !DJGPP */

    return NULL;
}

986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
	     */

	    transPtr = Tcl_FSGetTranslatedPath(NULL, toPtr);
	    if (transPtr == NULL) {
		return NULL;
	    }
	    target = Tcl_GetStringFromObj(transPtr, &length);
	    if (Tcl_UtfToExternalDStringEx(NULL, NULL, target, length, 0, &ds, NULL) != TCL_OK) {
		Tcl_DStringFree(&ds);
		return NULL;
	    }
	    target = Tcl_DStringValue(&ds);
	    Tcl_DecrRefCount(transPtr);

	    if (symlink(target, src) != 0) {







|







986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
	     */

	    transPtr = Tcl_FSGetTranslatedPath(NULL, toPtr);
	    if (transPtr == NULL) {
		return NULL;
	    }
	    target = Tcl_GetStringFromObj(transPtr, &length);
	    if (Tcl_UtfToExternalDStringEx(NULL, NULL, target, length, TCL_ENCODING_PROFILE_LOSSLESS, &ds, NULL) != TCL_OK) {
		Tcl_DStringFree(&ds);
		return NULL;
	    }
	    target = Tcl_DStringValue(&ds);
	    Tcl_DecrRefCount(transPtr);

	    if (symlink(target, src) != 0) {
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
	Tcl_DecrRefCount(transPtr);

	length = readlink((const char *)Tcl_FSGetNativePath(pathPtr), link, sizeof(link));
	if (length < 0) {
	    return NULL;
	}

	if (Tcl_ExternalToUtfDStringEx(NULL, NULL, link, (size_t)length, 0, &ds, NULL) != TCL_OK) {
	    return NULL;
	}
	linkPtr = Tcl_DStringToObj(&ds);
	Tcl_IncrRefCount(linkPtr);
	return linkPtr;
    }
}







|







1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
	Tcl_DecrRefCount(transPtr);

	length = readlink((const char *)Tcl_FSGetNativePath(pathPtr), link, sizeof(link));
	if (length < 0) {
	    return NULL;
	}

	if (Tcl_ExternalToUtfDStringEx(NULL, NULL, link, (size_t)length, TCL_ENCODING_PROFILE_LOSSLESS, &ds, NULL) != TCL_OK) {
	    return NULL;
	}
	linkPtr = Tcl_DStringToObj(&ds);
	Tcl_IncrRefCount(linkPtr);
	return linkPtr;
    }
}
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106

Tcl_Obj *
TclpNativeToNormalized(
    void *clientData)
{
    Tcl_DString ds;

    Tcl_ExternalToUtfDStringEx(NULL, NULL, (const char *) clientData, TCL_INDEX_NONE, TCL_ENCODING_PROFILE_TCL8, &ds, NULL);
    return Tcl_DStringToObj(&ds);
}

/*
 *---------------------------------------------------------------------------
 *
 * TclNativeCreateNativeRep --







|







1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106

Tcl_Obj *
TclpNativeToNormalized(
    void *clientData)
{
    Tcl_DString ds;

    Tcl_ExternalToUtfDStringEx(NULL, NULL, (const char *) clientData, TCL_INDEX_NONE, TCL_ENCODING_PROFILE_LOSSLESS, &ds, NULL);
    return Tcl_DStringToObj(&ds);
}

/*
 *---------------------------------------------------------------------------
 *
 * TclNativeCreateNativeRep --
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
	if (validPathPtr == NULL) {
	    return NULL;
	}
	Tcl_IncrRefCount(validPathPtr);
    }

    str = Tcl_GetStringFromObj(validPathPtr, &len);
    if (Tcl_UtfToExternalDStringEx(NULL, NULL, str, len, 0, &ds, NULL) != TCL_OK) {
	Tcl_DecrRefCount(validPathPtr);
	Tcl_DStringFree(&ds);
	return NULL;
    }
    len = Tcl_DStringLength(&ds) + sizeof(char);
    if (strlen(Tcl_DStringValue(&ds)) < len - sizeof(char)) {
	/* See bug [3118489]: NUL in filenames */







|







1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
	if (validPathPtr == NULL) {
	    return NULL;
	}
	Tcl_IncrRefCount(validPathPtr);
    }

    str = Tcl_GetStringFromObj(validPathPtr, &len);
    if (Tcl_UtfToExternalDStringEx(NULL, NULL, str, len, TCL_ENCODING_PROFILE_LOSSLESS, &ds, NULL) != TCL_OK) {
	Tcl_DecrRefCount(validPathPtr);
	Tcl_DStringFree(&ds);
	return NULL;
    }
    len = Tcl_DStringLength(&ds) + sizeof(char);
    if (strlen(Tcl_DStringValue(&ds)) < len - sizeof(char)) {
	/* See bug [3118489]: NUL in filenames */

Changes to unix/tclUnixInit.c.

469
470
471
472
473
474
475
476
477




478
479
480
481
482
483
484
     * Look for the library relative to the TCL_LIBRARY env variable. If the
     * last dirname in the TCL_LIBRARY path does not match the last dirname in
     * the installLib variable, use the last dir name of installLib in
     * addition to the original TCL_LIBRARY path.
     */

    str = getenv("TCL_LIBRARY");			/* INTL: Native. */
    Tcl_ExternalToUtfDStringEx(NULL, NULL, str, TCL_INDEX_NONE, TCL_ENCODING_PROFILE_TCL8, &buffer, NULL);
    str = Tcl_DStringValue(&buffer);





    if ((str != NULL) && (str[0] != '\0')) {
	Tcl_DString ds;
	Tcl_Size pathc;
	const char **pathv;
	char installLib[LIBRARY_SIZE];








|
|
>
>
>
>







469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
     * Look for the library relative to the TCL_LIBRARY env variable. If the
     * last dirname in the TCL_LIBRARY path does not match the last dirname in
     * the installLib variable, use the last dir name of installLib in
     * addition to the original TCL_LIBRARY path.
     */

    str = getenv("TCL_LIBRARY");			/* INTL: Native. */
    if (TclSystemToInternalEncoding(NULL, str, -1, &buffer) == TCL_OK) {
        str = Tcl_DStringValue(&buffer);
    } else {
        /* Note buffer is initialized even on error so can be cleared later */
        str = NULL;
    }

    if ((str != NULL) && (str[0] != '\0')) {
	Tcl_DString ds;
	Tcl_Size pathc;
	const char **pathv;
	char installLib[LIBRARY_SIZE];

Changes to win/tclWinSock.c.

370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
	Tcl_DString inDs;

	Tcl_DStringInit(&inDs);
	Tcl_DStringSetLength(&inDs, 256);
	if (gethostname(Tcl_DStringValue(&inDs),
		Tcl_DStringLength(&inDs)) == 0) {
	    Tcl_ExternalToUtfDStringEx(NULL, NULL, Tcl_DStringValue(&inDs),
		    TCL_INDEX_NONE, TCL_ENCODING_PROFILE_TCL8, &ds, NULL);
	}
	Tcl_DStringFree(&inDs);
    }

    *encodingPtr = Tcl_GetEncoding(NULL, "utf-8");
    *lengthPtr = Tcl_DStringLength(&ds);
    *valuePtr = (char *)Tcl_Alloc(*lengthPtr + 1);







|







370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
	Tcl_DString inDs;

	Tcl_DStringInit(&inDs);
	Tcl_DStringSetLength(&inDs, 256);
	if (gethostname(Tcl_DStringValue(&inDs),
		Tcl_DStringLength(&inDs)) == 0) {
	    Tcl_ExternalToUtfDStringEx(NULL, NULL, Tcl_DStringValue(&inDs),
                -1, TCL_ENCODING_PROFILE_LOSSLESS, &ds, NULL);
	}
	Tcl_DStringFree(&inDs);
    }

    *encodingPtr = Tcl_GetEncoding(NULL, "utf-8");
    *lengthPtr = Tcl_DStringLength(&ds);
    *valuePtr = (char *)Tcl_Alloc(*lengthPtr + 1);