Tcl Source Code

Check-in [56319f4d2a]
Login
Bounty program for improvements to Tcl and certain Tcl packages.
Tcl 2019 Conference, Houston/TX, US, Nov 4-8
Send your abstracts to [email protected]
or submit via the online form by Sep 9.

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:TIP #547 implementation: New encodings: UTF-16, UCS-2
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | tip-547
Files: files | file ages | folders
SHA3-256: 56319f4d2a8c0ffa2083fde7862b28fc7502231899c17cc57958b5233a141cd2
User & Date: jan.nijtmans 2019-05-31 23:19:07
Context
2019-06-09
21:16
Merge 8.7 check-in: 83c21b831e user: jan.nijtmans tags: tip-547
2019-06-03
19:48
TIP #548: Deprecate Tcl_WinUtfToTChar() and Tcl_WinTCharToUtf() and provide more flexible replacemen... check-in: 49785ba3b0 user: jan.nijtmans tags: tip-548
2019-05-31
23:35
Merge tip-547 check-in: 9265001ec1 user: jan.nijtmans tags: utf-max
23:19
TIP #547 implementation: New encodings: UTF-16, UCS-2 check-in: 56319f4d2a user: jan.nijtmans tags: tip-547
23:14
Merge 8.6 check-in: 856a391eaa user: jan.nijtmans tags: core-8-branch
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to generic/tclEncoding.c.

230
231
232
233
234
235
236
237





238
239
240
241
242
243
244
245
246
247
248
249
...
560
561
562
563
564
565
566




567
568
569
570
571

572
573
574
575
576
577
578
...
591
592
593
594
595
596
597










598



599
600
601
602

603
604











605
606
607
608
609
610
611
....
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
....
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
....
2464
2465
2466
2467
2468
2469
2470





2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
....
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
....
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582

2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597

2598
2599
2600
2601
2602
2603























































2604

2605



















































2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
			    int *srcReadPtr, int *dstWrotePtr,
			    int *dstCharsPtr);
static int		TableToUtfProc(ClientData clientData, const char *src,
			    int srcLen, int flags, Tcl_EncodingState *statePtr,
			    char *dst, int dstLen, int *srcReadPtr,
			    int *dstWrotePtr, int *dstCharsPtr);
static size_t		unilen(const char *src);
static int		UniCharToUtfProc(ClientData clientData,





			    const char *src, int srcLen, int flags,
			    Tcl_EncodingState *statePtr, char *dst, int dstLen,
			    int *srcReadPtr, int *dstWrotePtr,
			    int *dstCharsPtr);
static int		UtfToUniCharProc(ClientData clientData,
			    const char *src, int srcLen, int flags,
			    Tcl_EncodingState *statePtr, char *dst, int dstLen,
			    int *srcReadPtr, int *dstWrotePtr,
			    int *dstCharsPtr);
static int		UtfToUtfProc(ClientData clientData,
			    const char *src, int srcLen, int flags,
			    Tcl_EncodingState *statePtr, char *dst, int dstLen,
................................................................................
void
TclInitEncodingSubsystem(void)
{
    Tcl_EncodingType type;
    TableEncodingData *dataPtr;
    unsigned size;
    unsigned short i;





    if (encodingsInitialized) {
	return;
    }


    Tcl_MutexLock(&encodingMutex);
    Tcl_InitHashTable(&encodingTable, TCL_STRING_KEYS);
    Tcl_MutexUnlock(&encodingMutex);

    /*
     * Create a few initial encodings. Note that the UTF-8 to UTF-8
     * translation is not a no-op, because it will turn a stream of improperly
................................................................................
    type.toUtfProc	= UtfExtToUtfIntProc;
    type.fromUtfProc	= UtfIntToUtfExtProc;
    type.freeProc	= NULL;
    type.nullSize	= 1;
    type.clientData	= NULL;
    Tcl_CreateEncoding(&type);











    type.encodingName   = "unicode";



    type.toUtfProc	= UniCharToUtfProc;
    type.fromUtfProc    = UtfToUniCharProc;
    type.freeProc	= NULL;
    type.nullSize	= 2;

    type.clientData	= NULL;
    Tcl_CreateEncoding(&type);












    /*
     * Need the iso8859-1 encoding in order to process binary data, so force
     * it to always be embedded. Note that this encoding *must* be a proper
     * table encoding or some of the escape encodings crash! Hence the ugly
     * code to duplicate the structure of a table encoding here.
     */
................................................................................

	result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen,
		flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr,
		dstCharsPtr);
	if (*dstCharsPtr <= maxChars) {
	    break;
	}
	dstLen = Tcl_UtfAtIndex(dst, maxChars) - 1 - dst + TCL_UTF_MAX;
	flags = savedFlags;
	*statePtr = savedState;
    } while (1);
    if (!noTerminate) {
	/* ...and then append it */

	dst[*dstWrotePtr] = '\0';
................................................................................
    *dstCharsPtr = numChars;
    return result;
}
 
/*
 *-------------------------------------------------------------------------
 *
 * UniCharToUtfProc --
 *
 *	Convert from Unicode to UTF-8.
 *
 * Results:
 *	Returns TCL_OK if conversion was successful.
 *
 * Side effects:
 *	None.
 *
 *-------------------------------------------------------------------------
 */

static int
UniCharToUtfProc(
    ClientData clientData,	/* Not used. */
    const char *src,		/* Source string in Unicode. */
    int srcLen,			/* Source string length in bytes. */
    int flags,			/* Conversion control flags. */
    Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
				 * information used during a piecewise
				 * conversion. Contents of statePtr are
				 * initialized and/or reset by conversion
................................................................................

    for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
	if (dst > dstEnd) {
	    result = TCL_CONVERT_NOSPACE;
	    break;
	}






	/*
	 * Special case for 1-byte utf chars for speed. Make sure we work with
	 * unsigned short-size data.
	 */

	ch = *(unsigned short *)src;
	if (ch && ch < 0x80) {
	    *dst++ = (ch & 0xFF);
	} else {
	    dst += Tcl_UniCharToUtf(ch, dst);
	}
	src += sizeof(unsigned short);
    }
................................................................................
    *dstCharsPtr = numChars;
    return result;
}
 
/*
 *-------------------------------------------------------------------------
 *
 * UtfToUniCharProc --
 *
 *	Convert from UTF-8 to Unicode.
 *
 * Results:
 *	Returns TCL_OK if conversion was successful.
 *
 * Side effects:
 *	None.
 *
 *-------------------------------------------------------------------------
 */

static int
UtfToUniCharProc(
    ClientData clientData,	/* TableEncodingData that specifies
				 * encoding. */
    const char *src,		/* Source string in UTF-8. */
    int srcLen,			/* Source string length in bytes. */
    int flags,			/* Conversion control flags. */
    Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
				 * information used during a piecewise
				 * conversion. Contents of statePtr are
				 * initialized and/or reset by conversion
................................................................................
	src += TclUtfToUniChar(src, chPtr);

	/*
	 * Need to handle this in a way that won't cause misalignment by
	 * casting dst to a Tcl_UniChar. [Bug 1122671]
	 */

#ifdef WORDS_BIGENDIAN
#if TCL_UTF_MAX > 4
	if (*chPtr <= 0xFFFF) {
	    *dst++ = (*chPtr >> 8);
	    *dst++ = (*chPtr & 0xFF);
	} else {
	    *dst++ = ((*chPtr & 0x3) >> 8) | 0xDC;
	    *dst++ = (*chPtr & 0xFF);
	    *dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8;

	    *dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF);
	}
#else
	*dst++ = (*chPtr >> 8);
	*dst++ = (*chPtr & 0xFF);
#endif
#else
#if TCL_UTF_MAX > 4
	if (*chPtr <= 0xFFFF) {
	    *dst++ = (*chPtr & 0xFF);
	    *dst++ = (*chPtr >> 8);
	} else {
	    *dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF);
	    *dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8;
	    *dst++ = (*chPtr & 0xFF);

	    *dst++ = ((*chPtr & 0x3) >> 8) | 0xDC;
	}
#else
	*dst++ = (*chPtr & 0xFF);
	*dst++ = (*chPtr >> 8);
#endif























































#endif

    }



















































    *srcReadPtr = src - srcStart;
    *dstWrotePtr = dst - dstStart;
    *dstCharsPtr = numChars;
    return result;
}
 
/*
 *-------------------------------------------------------------------------
 *
 * TableToUtfProc --
 *
 *	Convert from the encoding specified by the TableEncodingData into
 *	UTF-8.






|
>
>
>
>
>




|







 







>
>
>
>





>







 







>
>
>
>
>
>
>
>
>
>
|
>
>
>
|
|


>
|

>
>
>
>
>
>
>
>
>
>
>







 







|







 







|

|











|
|







 







>
>
>
>
>




<
<







 







|

|











|
|
<







 







|

|
|
|
|
|
<
|
>
|
|

|
|

|

|
|
|
|
|
<
|
>
|
|

|
|

>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>





|







230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
...
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
...
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
....
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
....
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
....
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514


2515
2516
2517
2518
2519
2520
2521
....
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547

2548
2549
2550
2551
2552
2553
2554
....
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617

2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632

2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
			    int *srcReadPtr, int *dstWrotePtr,
			    int *dstCharsPtr);
static int		TableToUtfProc(ClientData clientData, const char *src,
			    int srcLen, int flags, Tcl_EncodingState *statePtr,
			    char *dst, int dstLen, int *srcReadPtr,
			    int *dstWrotePtr, int *dstCharsPtr);
static size_t		unilen(const char *src);
static int		Utf16ToUtfProc(ClientData clientData,
			    const char *src, int srcLen, int flags,
			    Tcl_EncodingState *statePtr, char *dst, int dstLen,
			    int *srcReadPtr, int *dstWrotePtr,
			    int *dstCharsPtr);
static int		UtfToUtf16Proc(ClientData clientData,
			    const char *src, int srcLen, int flags,
			    Tcl_EncodingState *statePtr, char *dst, int dstLen,
			    int *srcReadPtr, int *dstWrotePtr,
			    int *dstCharsPtr);
static int		UtfToUcs2Proc(ClientData clientData,
			    const char *src, int srcLen, int flags,
			    Tcl_EncodingState *statePtr, char *dst, int dstLen,
			    int *srcReadPtr, int *dstWrotePtr,
			    int *dstCharsPtr);
static int		UtfToUtfProc(ClientData clientData,
			    const char *src, int srcLen, int flags,
			    Tcl_EncodingState *statePtr, char *dst, int dstLen,
................................................................................
void
TclInitEncodingSubsystem(void)
{
    Tcl_EncodingType type;
    TableEncodingData *dataPtr;
    unsigned size;
    unsigned short i;
    union {
        char c;
        short s;
    } isLe;

    if (encodingsInitialized) {
	return;
    }

    isLe.s = 1;
    Tcl_MutexLock(&encodingMutex);
    Tcl_InitHashTable(&encodingTable, TCL_STRING_KEYS);
    Tcl_MutexUnlock(&encodingMutex);

    /*
     * Create a few initial encodings. Note that the UTF-8 to UTF-8
     * translation is not a no-op, because it will turn a stream of improperly
................................................................................
    type.toUtfProc	= UtfExtToUtfIntProc;
    type.fromUtfProc	= UtfIntToUtfExtProc;
    type.freeProc	= NULL;
    type.nullSize	= 1;
    type.clientData	= NULL;
    Tcl_CreateEncoding(&type);

    type.toUtfProc	= Utf16ToUtfProc;
    type.fromUtfProc    = UtfToUcs2Proc;
    type.freeProc	= NULL;
    type.nullSize	= 2;
    type.encodingName   = "ucs-2le";
    type.clientData	= INT2PTR(1);
    Tcl_CreateEncoding(&type);
    type.encodingName   = "ucs-2be";
    type.clientData	= INT2PTR(0);
    Tcl_CreateEncoding(&type);
    type.encodingName   = "ucs-2";
    type.clientData	= INT2PTR(isLe.c);
    Tcl_CreateEncoding(&type);

    type.toUtfProc	= Utf16ToUtfProc;
    type.fromUtfProc    = UtfToUtf16Proc;
    type.freeProc	= NULL;
    type.nullSize	= 2;
    type.encodingName   = "utf-16le";
    type.clientData	= INT2PTR(1);;
    Tcl_CreateEncoding(&type);
    type.encodingName   = "utf-16be";
    type.clientData	= INT2PTR(0);
    Tcl_CreateEncoding(&type);
    type.encodingName   = "utf-16";
    type.clientData	= INT2PTR(isLe.c);;
    Tcl_CreateEncoding(&type);

#ifndef TCL_NO_DEPRECATED
    type.encodingName   = "unicode";
    Tcl_CreateEncoding(&type);
#endif

    /*
     * Need the iso8859-1 encoding in order to process binary data, so force
     * it to always be embedded. Note that this encoding *must* be a proper
     * table encoding or some of the escape encodings crash! Hence the ugly
     * code to duplicate the structure of a table encoding here.
     */
................................................................................

	result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen,
		flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr,
		dstCharsPtr);
	if (*dstCharsPtr <= maxChars) {
	    break;
	}
	dstLen = Tcl_UtfAtIndex(dst, maxChars) - dst + (TCL_UTF_MAX - 1);
	flags = savedFlags;
	*statePtr = savedState;
    } while (1);
    if (!noTerminate) {
	/* ...and then append it */

	dst[*dstWrotePtr] = '\0';
................................................................................
    *dstCharsPtr = numChars;
    return result;
}
 
/*
 *-------------------------------------------------------------------------
 *
 * Utf16ToUtfProc --
 *
 *	Convert from UTF-16 to UTF-8.
 *
 * Results:
 *	Returns TCL_OK if conversion was successful.
 *
 * Side effects:
 *	None.
 *
 *-------------------------------------------------------------------------
 */

static int
Utf16ToUtfProc(
    ClientData clientData,	/* != NULL means LE, == NUL means BE */
    const char *src,		/* Source string in Unicode. */
    int srcLen,			/* Source string length in bytes. */
    int flags,			/* Conversion control flags. */
    Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
				 * information used during a piecewise
				 * conversion. Contents of statePtr are
				 * initialized and/or reset by conversion
................................................................................

    for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
	if (dst > dstEnd) {
	    result = TCL_CONVERT_NOSPACE;
	    break;
	}

	if (clientData) {
	    ch = (src[1] & 0xFF) << 8 | (src[0] & 0xFF);
	} else {
	    ch = (src[0] & 0xFF) << 8 | (src[1] & 0xFF);
	}
	/*
	 * Special case for 1-byte utf chars for speed. Make sure we work with
	 * unsigned short-size data.
	 */


	if (ch && ch < 0x80) {
	    *dst++ = (ch & 0xFF);
	} else {
	    dst += Tcl_UniCharToUtf(ch, dst);
	}
	src += sizeof(unsigned short);
    }
................................................................................
    *dstCharsPtr = numChars;
    return result;
}
 
/*
 *-------------------------------------------------------------------------
 *
 * UtfToUtf16Proc --
 *
 *	Convert from UTF-8 to UTF-16.
 *
 * Results:
 *	Returns TCL_OK if conversion was successful.
 *
 * Side effects:
 *	None.
 *
 *-------------------------------------------------------------------------
 */

static int
UtfToUtf16Proc(
    ClientData clientData,	/* != NULL means LE, == NUL means BE */

    const char *src,		/* Source string in UTF-8. */
    int srcLen,			/* Source string length in bytes. */
    int flags,			/* Conversion control flags. */
    Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
				 * information used during a piecewise
				 * conversion. Contents of statePtr are
				 * initialized and/or reset by conversion
................................................................................
	src += TclUtfToUniChar(src, chPtr);

	/*
	 * Need to handle this in a way that won't cause misalignment by
	 * casting dst to a Tcl_UniChar. [Bug 1122671]
	 */

	if (clientData) {
#if TCL_UTF_MAX > 4
	    if (*chPtr <= 0xFFFF) {
		*dst++ = (*chPtr & 0xFF);
		*dst++ = (*chPtr >> 8);
	    } else {
		*dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF);

		*dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8;
		*dst++ = (*chPtr & 0xFF);
		*dst++ = ((*chPtr & 0x3) >> 8) | 0xDC;
	    }
#else
	    *dst++ = (*chPtr & 0xFF);
	    *dst++ = (*chPtr >> 8);
#endif
	} else {
#if TCL_UTF_MAX > 4
	    if (*chPtr <= 0xFFFF) {
		*dst++ = (*chPtr >> 8);
		*dst++ = (*chPtr & 0xFF);
	    } else {
		*dst++ = ((*chPtr & 0x3) >> 8) | 0xDC;

		*dst++ = (*chPtr & 0xFF);
		*dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8;
		*dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF);
	    }
#else
	    *dst++ = (*chPtr >> 8);
	    *dst++ = (*chPtr & 0xFF);
#endif
	}
    }
    *srcReadPtr = src - srcStart;
    *dstWrotePtr = dst - dstStart;
    *dstCharsPtr = numChars;
    return result;
}
 
/*
 *-------------------------------------------------------------------------
 *
 * UtfToUcs2Proc --
 *
 *	Convert from UTF-8 to UCS-2.
 *
 * Results:
 *	Returns TCL_OK if conversion was successful.
 *
 * Side effects:
 *	None.
 *
 *-------------------------------------------------------------------------
 */

static int
UtfToUcs2Proc(
    ClientData clientData,	/* != NULL means LE, == NUL means BE */
    const char *src,		/* Source string in UTF-8. */
    int srcLen,			/* Source string length in bytes. */
    int flags,			/* Conversion control flags. */
    Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
				 * information used during a piecewise
				 * conversion. Contents of statePtr are
				 * initialized and/or reset by conversion
				 * routine under control of flags argument. */
    char *dst,			/* Output buffer in which converted string is
				 * stored. */
    int dstLen,			/* The maximum length of output buffer in
				 * bytes. */
    int *srcReadPtr,		/* Filled with the number of bytes from the
				 * source string that were converted. This may
				 * be less than the original source length if
				 * there was a problem converting some source
				 * characters. */
    int *dstWrotePtr,		/* Filled with the number of bytes that were
				 * stored in the output buffer as a result of
				 * the conversion. */
    int *dstCharsPtr)		/* Filled with the number of characters that
				 * correspond to the bytes stored in the
				 * output buffer. */
{
    const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd;
    int result, numChars;
#if TCL_UTF_MAX <= 4
    int len;
#endif
    Tcl_UniChar ch = 0;

    srcStart = src;
    srcEnd = src + srcLen;
    srcClose = srcEnd;
    if ((flags & TCL_ENCODING_END) == 0) {
	srcClose -= TCL_UTF_MAX;
    }

    dstStart = dst;
    dstEnd   = dst + dstLen - sizeof(Tcl_UniChar);

    result = TCL_OK;
    for (numChars = 0; src < srcEnd; numChars++) {
	if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
	    /*
	     * If there is more string to follow, this will ensure that the
	     * last UTF-8 character in the source buffer hasn't been cut off.
	     */

	    result = TCL_CONVERT_MULTIBYTE;
	    break;
	}
	if (dst > dstEnd) {
	    result = TCL_CONVERT_NOSPACE;
	    break;
	}
#if TCL_UTF_MAX <= 4
	src += (len = TclUtfToUniChar(src, &ch));
	if ((ch >= 0xD800) && (len < 3)) {
	    src += TclUtfToUniChar(src, &ch);
	    ch = 0xFFFD;
	}
#else
	src += TclUtfToUniChar(src, &ch);
	if (ch > 0xFFFF) {
	    ch = 0xFFFD;
	}
#endif

	/*
	 * Need to handle this in a way that won't cause misalignment by
	 * casting dst to a Tcl_UniChar. [Bug 1122671]
	 */

	if (clientData) {
	    *dst++ = (ch & 0xFF);
	    *dst++ = (ch >> 8);
	} else {
	    *dst++ = (ch >> 8);
	    *dst++ = (ch & 0xFF);
	}
    }
    *srcReadPtr = src - srcStart;
    *dstWrotePtr = dst - dstStart;
    *dstCharsPtr = numChars;
    return result;
}

/*
 *-------------------------------------------------------------------------
 *
 * TableToUtfProc --
 *
 *	Convert from the encoding specified by the TableEncodingData into
 *	UTF-8.

Changes to tests/binary.test.

2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
    apply {{a b} {
	set one [binary format H* $a]
	return $one[binary format H* $b]
    }} ab cd
} [binary format H* abcd]

test binary-78.1 {unicode (out of BMP) to byte-array conversion, bug-[bd94500678]} -body {
    # just test for BO-segfault (high surrogate w/o advance source pointer for out of BMP char if TCL_UTF_MAX <= 4):
    binary encode hex \U0001f415
    binary scan \U0001f415 a* v; set v
    set str {}
} -result {}
 
# ----------------------------------------------------------------------
# cleanup






|







2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
    apply {{a b} {
	set one [binary format H* $a]
	return $one[binary format H* $b]
    }} ab cd
} [binary format H* abcd]

test binary-78.1 {unicode (out of BMP) to byte-array conversion, bug-[bd94500678]} -body {
    # just test for BO-segfault (high surrogate w/o advance source pointer for out of BMP char if TCL_UTF_MAX == 3):
    binary encode hex \U0001f415
    binary scan \U0001f415 a* v; set v
    set str {}
} -result {}
 
# ----------------------------------------------------------------------
# cleanup

Changes to tests/chanio.test.

884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
....
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
} -result {bbbbbbbbbbbbbbb 15 123456789abcdef 1 4 abcd 0 3 efg}
test chan-io-6.45 {Tcl_GetsObj: input saw cr, skip right number of bytes} -setup {
    set x ""
} -constraints {stdio testchannel openpipe fileevent} -body {
    # Tcl_ExternalToUtf()
    set f [openpipe w+ $path(cat)]
    chan configure $f -translation {auto lf} -buffering none
    chan configure $f -encoding unicode
    chan puts -nonewline $f "bbbbbbbbbbbbbbb\n123456789abcdef\r"
    chan configure $f -buffersize 16
    chan gets $f
    chan configure $f -blocking 0
    lappend x [chan gets $f line] $line [testchannel queuedcr $f]
    chan configure $f -blocking 1
    chan puts -nonewline $f "\nabcd\refg"
................................................................................
    # not (bufPtr->nextPtr == NULL)
    set f [openpipe w+ $path(cat)]
    chan configure $f -translation lf -encoding ascii -buffering none
    chan puts -nonewline $f "123456789012345\r\nbcdefghijklmnopqrstuvwxyz"
    chan event $f read [namespace code {
	lappend x [chan gets $f line] $line [testchannel inputbuffered $f]
    }]
    chan configure $f -encoding unicode -buffersize 16 -blocking 0
    vwait [namespace which -variable x]
    chan configure $f -translation auto -encoding ascii -blocking 1
    # here
    vwait [namespace which -variable x]
    return $x
} -cleanup {
    chan close $f






|







 







|







884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
....
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
} -result {bbbbbbbbbbbbbbb 15 123456789abcdef 1 4 abcd 0 3 efg}
test chan-io-6.45 {Tcl_GetsObj: input saw cr, skip right number of bytes} -setup {
    set x ""
} -constraints {stdio testchannel openpipe fileevent} -body {
    # Tcl_ExternalToUtf()
    set f [openpipe w+ $path(cat)]
    chan configure $f -translation {auto lf} -buffering none
    chan configure $f -encoding utf-16
    chan puts -nonewline $f "bbbbbbbbbbbbbbb\n123456789abcdef\r"
    chan configure $f -buffersize 16
    chan gets $f
    chan configure $f -blocking 0
    lappend x [chan gets $f line] $line [testchannel queuedcr $f]
    chan configure $f -blocking 1
    chan puts -nonewline $f "\nabcd\refg"
................................................................................
    # not (bufPtr->nextPtr == NULL)
    set f [openpipe w+ $path(cat)]
    chan configure $f -translation lf -encoding ascii -buffering none
    chan puts -nonewline $f "123456789012345\r\nbcdefghijklmnopqrstuvwxyz"
    chan event $f read [namespace code {
	lappend x [chan gets $f line] $line [testchannel inputbuffered $f]
    }]
    chan configure $f -encoding utf-16 -buffersize 16 -blocking 0
    vwait [namespace which -variable x]
    chan configure $f -translation auto -encoding ascii -blocking 1
    # here
    vwait [namespace which -variable x]
    return $x
} -cleanup {
    chan close $f

Changes to tests/encoding.test.

318
319
320
321
322
323
324
325
326
327
328
329




330




331
332
333
334
335
336



337
338
339
340
341
342
343
} 00
test encoding-15.3 {UtfToUtfProc null character input} teststringbytes {
    set y [encoding convertfrom utf-8 [encoding convertto utf-8 \u0000]]
    binary scan [teststringbytes $y] H* z
    set z
} c080

test encoding-16.1 {UnicodeToUtfProc} -body {
    set val [encoding convertfrom unicode NN]
    list $val [format %x [scan $val %c]]
} -result "\u4e4e 4e4e"
test encoding-16.2 {UnicodeToUtfProc} -body {




    set val [encoding convertfrom unicode "\xd8\xd8\xdc\xdc"]




    list $val [format %x [scan $val %c]]
} -result "\U460dc 460dc"

test encoding-17.1 {UtfToUnicodeProc} -body {
    encoding convertto unicode "\U460dc"
} -result "\xd8\xd8\xdc\xdc"




test encoding-18.1 {TableToUtfProc} {
} {}

test encoding-19.1 {TableFromUtfProc} {
} {}







|
|


|
>
>
>
>
|
>
>
>
>



|
|

>
>
>







318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
} 00
test encoding-15.3 {UtfToUtfProc null character input} teststringbytes {
    set y [encoding convertfrom utf-8 [encoding convertto utf-8 \u0000]]
    binary scan [teststringbytes $y] H* z
    set z
} c080

test encoding-16.1 {Utf16ToUtfProc} -body {
    set val [encoding convertfrom utf-16 NN]
    list $val [format %x [scan $val %c]]
} -result "\u4e4e 4e4e"
test encoding-16.2 {Utf16ToUtfProc} -body {
    set val [encoding convertfrom utf-16 "\xd8\xd8\xdc\xdc"]
    list $val [format %x [scan $val %c]]
} -result "\U460dc 460dc"
test encoding-16.3 {Ucs2ToUtfProc} -body {
    set val [encoding convertfrom ucs-2 NN]
    list $val [format %x [scan $val %c]]
} -result "\u4e4e 4e4e"
test encoding-16.4 {Ucs2ToUtfProc} -body {
    set val [encoding convertfrom ucs-2 "\xd8\xd8\xdc\xdc"]
    list $val [format %x [scan $val %c]]
} -result "\U460dc 460dc"

test encoding-17.1 {UtfToUtf16Proc} -body {
    encoding convertto utf-16 "\U460dc"
} -result "\xd8\xd8\xdc\xdc"
test encoding-17.2 {UtfToUcs2Proc} -body {
    encoding convertfrom utf-16 [encoding convertto ucs-2 "\U460dc"]
} -result "\ufffd"

test encoding-18.1 {TableToUtfProc} {
} {}

test encoding-19.1 {TableFromUtfProc} {
} {}

Changes to tests/io.test.

914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
....
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
    set x
} [list "bbbbbbbbbbbbbbb" 15 "123456789abcdef" 1 4 "abcd" 0 3 "efg"]
test io-6.45 {Tcl_GetsObj: input saw cr, skip right number of bytes} {stdio testchannel openpipe fileevent} {
    # Tcl_ExternalToUtf()

    set f [open "|[list [interpreter] $path(cat)]" w+]
    fconfigure $f -translation {auto lf} -buffering none
    fconfigure $f -encoding unicode
    puts -nonewline $f "bbbbbbbbbbbbbbb\n123456789abcdef\r"
    fconfigure $f -buffersize 16
    gets $f
    fconfigure $f -blocking 0
    set x [list [gets $f line] $line [testchannel queuedcr $f]]
    fconfigure $f -blocking 1
    puts -nonewline $f "\nabcd\refg"
................................................................................
    puts -nonewline $f "123456789012345\r\nbcdefghijklmnopqrstuvwxyz"
    variable x {}
    fileevent $f read [namespace code "ready $f"]
    proc ready {f} {
	variable x
	lappend x [gets $f line] $line [testchannel inputbuffered $f]
    }
    fconfigure $f -encoding unicode -buffersize 16 -blocking 0
    vwait [namespace which -variable x]
    fconfigure $f -translation auto -encoding ascii -blocking 1
    # here
    vwait [namespace which -variable x]
    close $f
    set x
} [list -1 "" 42 15 "123456789012345" 25]






|







 







|







914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
....
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
    set x
} [list "bbbbbbbbbbbbbbb" 15 "123456789abcdef" 1 4 "abcd" 0 3 "efg"]
test io-6.45 {Tcl_GetsObj: input saw cr, skip right number of bytes} {stdio testchannel openpipe fileevent} {
    # Tcl_ExternalToUtf()

    set f [open "|[list [interpreter] $path(cat)]" w+]
    fconfigure $f -translation {auto lf} -buffering none
    fconfigure $f -encoding utf-16
    puts -nonewline $f "bbbbbbbbbbbbbbb\n123456789abcdef\r"
    fconfigure $f -buffersize 16
    gets $f
    fconfigure $f -blocking 0
    set x [list [gets $f line] $line [testchannel queuedcr $f]]
    fconfigure $f -blocking 1
    puts -nonewline $f "\nabcd\refg"
................................................................................
    puts -nonewline $f "123456789012345\r\nbcdefghijklmnopqrstuvwxyz"
    variable x {}
    fileevent $f read [namespace code "ready $f"]
    proc ready {f} {
	variable x
	lappend x [gets $f line] $line [testchannel inputbuffered $f]
    }
    fconfigure $f -encoding utf-16 -buffersize 16 -blocking 0
    vwait [namespace which -variable x]
    fconfigure $f -translation auto -encoding ascii -blocking 1
    # here
    vwait [namespace which -variable x]
    close $f
    set x
} [list -1 "" 42 15 "123456789012345" 25]

Changes to tests/ioCmd.test.

237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
test iocmd-8.6 {fconfigure command} -returnCodes error -body {
    fconfigure stdin -translation froboz
} -result {bad value for -translation: must be one of auto, binary, cr, lf, crlf, or platform}
test iocmd-8.7 {fconfigure command} -setup {
    file delete $path(test1)
} -body {
    set f1 [open $path(test1) w]
    fconfigure $f1 -translation lf -eofchar {} -encoding unicode
    fconfigure $f1
} -cleanup {
    catch {close $f1}
} -result {-blocking 1 -buffering full -buffersize 4096 -encoding unicode -eofchar {} -translation lf}
test iocmd-8.8 {fconfigure command} -setup {
    file delete $path(test1)
    set x {}
} -body {
    set f1 [open $path(test1) w]
    fconfigure $f1 -translation lf -buffering line -buffersize 3030 \
		-eofchar {} -encoding unicode
    lappend x [fconfigure $f1 -buffering]
    lappend x [fconfigure $f1]
} -cleanup {
    catch {close $f1}
} -result {line {-blocking 1 -buffering line -buffersize 3030 -encoding unicode -eofchar {} -translation lf}}
test iocmd-8.9 {fconfigure command} -setup {
    file delete $path(test1)
} -body {
    set f1 [open $path(test1) w]
    fconfigure $f1 -translation binary -buffering none -buffersize 4040 \
		-eofchar {} -encoding binary
    fconfigure $f1






|



|






|




|







237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
test iocmd-8.6 {fconfigure command} -returnCodes error -body {
    fconfigure stdin -translation froboz
} -result {bad value for -translation: must be one of auto, binary, cr, lf, crlf, or platform}
test iocmd-8.7 {fconfigure command} -setup {
    file delete $path(test1)
} -body {
    set f1 [open $path(test1) w]
    fconfigure $f1 -translation lf -eofchar {} -encoding utf-16
    fconfigure $f1
} -cleanup {
    catch {close $f1}
} -result {-blocking 1 -buffering full -buffersize 4096 -encoding utf-16 -eofchar {} -translation lf}
test iocmd-8.8 {fconfigure command} -setup {
    file delete $path(test1)
    set x {}
} -body {
    set f1 [open $path(test1) w]
    fconfigure $f1 -translation lf -buffering line -buffersize 3030 \
		-eofchar {} -encoding utf-16
    lappend x [fconfigure $f1 -buffering]
    lappend x [fconfigure $f1]
} -cleanup {
    catch {close $f1}
} -result {line {-blocking 1 -buffering line -buffersize 3030 -encoding utf-16 -eofchar {} -translation lf}}
test iocmd-8.9 {fconfigure command} -setup {
    file delete $path(test1)
} -body {
    set f1 [open $path(test1) w]
    fconfigure $f1 -translation binary -buffering none -buffersize 4040 \
		-eofchar {} -encoding binary
    fconfigure $f1

Changes to tests/source.test.

236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
    # and use of the Control-Z character (\u001A) as a cross-platform
    # EOF character by [source].  Here we write out and the [source] a
    # file that contains the byte \x1A, although not the character \u001A in
    # the indicated encoding.
    set sourcefile [makeFile {} source.file]
    file delete $sourcefile
    set f [open $sourcefile w]
    fconfigure $f -encoding unicode
    puts $f "set symbol(square-root) \u221A; set x correct"
    close $f
} -body {
    set x unset
    source -encoding unicode $sourcefile
    set x
} -cleanup {
    removeFile source.file
} -result correct
test source-7.3 {source -encoding: syntax} -body {
    # Have to spell out the -encoding option
    source -e utf-8 no_file






|




|







236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
    # and use of the Control-Z character (\u001A) as a cross-platform
    # EOF character by [source].  Here we write out and the [source] a
    # file that contains the byte \x1A, although not the character \u001A in
    # the indicated encoding.
    set sourcefile [makeFile {} source.file]
    file delete $sourcefile
    set f [open $sourcefile w]
    fconfigure $f -encoding utf-16
    puts $f "set symbol(square-root) \u221A; set x correct"
    close $f
} -body {
    set x unset
    source -encoding utf-16 $sourcefile
    set x
} -cleanup {
    removeFile source.file
} -result correct
test source-7.3 {source -encoding: syntax} -body {
    # Have to spell out the -encoding option
    source -e utf-8 no_file