Tcl Source Code

Check-in [c41cbc5340]
Login
Bounty program for improvements to Tcl and certain Tcl packages.
Tcl 2019 Conference, Houston/TX, US, Nov 4-8
Send your abstracts to [email protected]
or submit via the online form by Sep 9.

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Add some state to encodings, so we can do better surrogate handling for TCL_UTF_MAX >= 4. Backported from TIP #389.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | core-8-6-branch
Files: files | file ages | folders
SHA3-256: c41cbc53406c6f552f7e7cf30554aba7204ba1c9b35bc2d66c705b6c733bf289
User & Date: jan.nijtmans 2018-04-23 14:56:44
Context
2018-04-23
23:23
Bug-fix in Tcl_UtfAtIndex (for TCL_UTF_MAX=4 only). With test-case (in "string totitle") demonstrati... check-in: 3d8301e3c6 user: jan.nijtmans tags: core-8-6-branch
15:06
Some better surrogate handling in encodings. Not spotted before ... check-in: 0e622ccdb3 user: jan.nijtmans tags: core-8-branch
14:56
Add some state to encodings, so we can do better surrogate handling for TCL_UTF_MAX >= 4. Backported... check-in: c41cbc5340 user: jan.nijtmans tags: core-8-6-branch
2018-04-22
13:22
[46a2410650] compiled [unset] was bypassing cleanup of active array search. Overdue thanks to Andy G... check-in: 244c0b893d user: dgp tags: core-8-6-branch
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to generic/tclEncoding.c.

2292
2293
2294
2295
2296
2297
2298
2299
2300



2301
2302
2303
2304
2305
2306
2307
....
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355





2356
2357


2358
2359
2360
2361
2362
2363
2364
....
2406
2407
2408
2409
2410
2411
2412
2413
2414



2415
2416
2417
2418
2419
2420
2421
....
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
....
2496
2497
2498
2499
2500
2501
2502
2503
2504



2505
2506
2507
2508
2509
2510
2511
....
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
....
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764




2765
2766
2767
2768
2769
2770
2771
....
2956
2957
2958
2959
2960
2961
2962
2963




2964
2965
2966
2967



2968
2969
2970
2971
2972
2973
2974
    int pureNullMode)		/* Convert embedded nulls from internal
				 * representation to real null-bytes or vice
				 * versa. */
{
    const char *srcStart, *srcEnd, *srcClose;
    const char *dstStart, *dstEnd;
    int result, numChars, charLimit = INT_MAX;
    Tcl_UniChar ch = 0;




    result = TCL_OK;

    srcStart = src;
    srcEnd = src + srcLen;
    srcClose = srcEnd;
    if ((flags & TCL_ENCODING_END) == 0) {
	srcClose -= TCL_UTF_MAX;
................................................................................
	} else if (!Tcl_UtfCharComplete(src, srcEnd - src)) {
	    /*
	     * Always check before using TclUtfToUniChar. Not doing can so
	     * cause it run beyond the end of the buffer! If we happen such an
	     * incomplete char its bytes are made to represent themselves.
	     */

	    ch = (unsigned char) *src;
	    src += 1;
	    dst += Tcl_UniCharToUtf(ch, dst);
	} else {





	    src += TclUtfToUniChar(src, &ch);
	    dst += Tcl_UniCharToUtf(ch, dst);


	}
    }

    *srcReadPtr = src - srcStart;
    *dstWrotePtr = dst - dstStart;
    *dstCharsPtr = numChars;
    return result;
................................................................................
    int *dstCharsPtr)		/* Filled with the number of characters that
				 * correspond to the bytes stored in the
				 * output buffer. */
{
    const char *srcStart, *srcEnd;
    const char *dstEnd, *dstStart;
    int result, numChars, charLimit = INT_MAX;
    Tcl_UniChar ch = 0;




    if (flags & TCL_ENCODING_CHAR_LIMIT) {
	charLimit = *dstCharsPtr;
    }
    result = TCL_OK;
    if ((srcLen % sizeof(Tcl_UniChar)) != 0) {
	result = TCL_CONVERT_MULTIBYTE;
	srcLen /= sizeof(Tcl_UniChar);
................................................................................
	}

	/*
	 * Special case for 1-byte utf chars for speed. Make sure we work with
	 * Tcl_UniChar-size data.
	 */

	ch = *(Tcl_UniChar *)src;
	if (ch && ch < 0x80) {
	    *dst++ = (ch & 0xFF);
	} else {
	    dst += Tcl_UniCharToUtf(ch, dst);
	}
	src += sizeof(Tcl_UniChar);
    }

    *srcReadPtr = src - srcStart;
    *dstWrotePtr = dst - dstStart;
    *dstCharsPtr = numChars;
................................................................................
				 * the conversion. */
    int *dstCharsPtr)		/* Filled with the number of characters that
				 * correspond to the bytes stored in the
				 * output buffer. */
{
    const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd;
    int result, numChars;
    Tcl_UniChar ch = 0;




    srcStart = src;
    srcEnd = src + srcLen;
    srcClose = srcEnd;
    if ((flags & TCL_ENCODING_END) == 0) {
	srcClose -= TCL_UTF_MAX;
    }

................................................................................
	    result = TCL_CONVERT_MULTIBYTE;
	    break;
	}
	if (dst > dstEnd) {
	    result = TCL_CONVERT_NOSPACE;
	    break;
	}
	src += TclUtfToUniChar(src, &ch);

	/*
	 * Need to handle this in a way that won't cause misalignment by
	 * casting dst to a Tcl_UniChar. [Bug 1122671]
	 */

#ifdef WORDS_BIGENDIAN
#if TCL_UTF_MAX > 4
	*dst++ = (ch >> 24);
	*dst++ = ((ch >> 16) & 0xFF);
	*dst++ = ((ch >> 8) & 0xFF);
	*dst++ = (ch & 0xFF);
#else
	*dst++ = (ch >> 8);
	*dst++ = (ch & 0xFF);
#endif
#else
#if TCL_UTF_MAX > 4
	*dst++ = (ch & 0xFF);
	*dst++ = ((ch >> 8) & 0xFF);
	*dst++ = ((ch >> 16) & 0xFF);
	*dst++ = (ch >> 24);
#else
	*dst++ = (ch & 0xFF);
	*dst++ = (ch >> 8);
#endif
#endif
    }
    *srcReadPtr = src - srcStart;
    *dstWrotePtr = dst - dstStart;
    *dstCharsPtr = numChars;
    return result;
................................................................................
	     */

	    result = TCL_CONVERT_MULTIBYTE;
	    break;
	}
	len = TclUtfToUniChar(src, &ch);

#if TCL_UTF_MAX > 3
	/*
	 * This prevents a crash condition. More evaluation is required for
	 * full support of int Tcl_UniChar. [Bug 1004065]
	 */

	if (ch & 0xffff0000) {
	    word = 0;




	} else
#endif
	    word = fromUnicode[(ch >> 8)][ch & 0xff];

	if ((word == 0) && (ch != 0)) {
	    if (flags & TCL_ENCODING_STOPONERROR) {
		result = TCL_CONVERT_UNKNOWN;
................................................................................
	}
	len = TclUtfToUniChar(src, &ch);

	/*
	 * Check for illegal characters.
	 */

	if (ch > 0xff) {




	    if (flags & TCL_ENCODING_STOPONERROR) {
		result = TCL_CONVERT_UNKNOWN;
		break;
	    }




	    /*
	     * Plunge on, using '?' as a fallback character.
	     */

	    ch = (Tcl_UniChar) '?';
	}






|

>
>
>







 







|

|

>
>
>
>
>
|
|
>
>







 







|

>
>
>







 







|
|
|

|







 







|

>
>
>







 







|








|
|
|
|

|
|



|
|
|
|

|
|







 







|







>
>
>
>







 







|
>
>
>
>




>
>
>







2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
....
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
....
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
....
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
....
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
....
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
....
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
....
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
    int pureNullMode)		/* Convert embedded nulls from internal
				 * representation to real null-bytes or vice
				 * versa. */
{
    const char *srcStart, *srcEnd, *srcClose;
    const char *dstStart, *dstEnd;
    int result, numChars, charLimit = INT_MAX;
    Tcl_UniChar *chPtr = (Tcl_UniChar *) statePtr;

    if (flags & TCL_ENCODING_START) {
    	*statePtr = 0;
    }
    result = TCL_OK;

    srcStart = src;
    srcEnd = src + srcLen;
    srcClose = srcEnd;
    if ((flags & TCL_ENCODING_END) == 0) {
	srcClose -= TCL_UTF_MAX;
................................................................................
	} else if (!Tcl_UtfCharComplete(src, srcEnd - src)) {
	    /*
	     * Always check before using TclUtfToUniChar. Not doing can so
	     * cause it run beyond the end of the buffer! If we happen such an
	     * incomplete char its bytes are made to represent themselves.
	     */

	    *chPtr = (unsigned char) *src;
	    src += 1;
	    dst += Tcl_UniCharToUtf(*chPtr, dst);
	} else {
	    int len = TclUtfToUniChar(src, chPtr);
	    src += len;
	    dst += Tcl_UniCharToUtf(*chPtr, dst);
#if TCL_UTF_MAX == 4
	    if (!len) {
		src += TclUtfToUniChar(src, chPtr);
		dst += Tcl_UniCharToUtf(*chPtr, dst);
	    }
#endif
	}
    }

    *srcReadPtr = src - srcStart;
    *dstWrotePtr = dst - dstStart;
    *dstCharsPtr = numChars;
    return result;
................................................................................
    int *dstCharsPtr)		/* Filled with the number of characters that
				 * correspond to the bytes stored in the
				 * output buffer. */
{
    const char *srcStart, *srcEnd;
    const char *dstEnd, *dstStart;
    int result, numChars, charLimit = INT_MAX;
    Tcl_UniChar *chPtr = (Tcl_UniChar *) statePtr;

    if (flags & TCL_ENCODING_START) {
    	*statePtr = 0;
    }
    if (flags & TCL_ENCODING_CHAR_LIMIT) {
	charLimit = *dstCharsPtr;
    }
    result = TCL_OK;
    if ((srcLen % sizeof(Tcl_UniChar)) != 0) {
	result = TCL_CONVERT_MULTIBYTE;
	srcLen /= sizeof(Tcl_UniChar);
................................................................................
	}

	/*
	 * Special case for 1-byte utf chars for speed. Make sure we work with
	 * Tcl_UniChar-size data.
	 */

	*chPtr = *(Tcl_UniChar *)src;
	if (*chPtr && *chPtr < 0x80) {
	    *dst++ = (*chPtr & 0xFF);
	} else {
	    dst += Tcl_UniCharToUtf(*chPtr, dst);
	}
	src += sizeof(Tcl_UniChar);
    }

    *srcReadPtr = src - srcStart;
    *dstWrotePtr = dst - dstStart;
    *dstCharsPtr = numChars;
................................................................................
				 * the conversion. */
    int *dstCharsPtr)		/* Filled with the number of characters that
				 * correspond to the bytes stored in the
				 * output buffer. */
{
    const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd;
    int result, numChars;
    Tcl_UniChar *chPtr = (Tcl_UniChar *) statePtr;

    if (flags & TCL_ENCODING_START) {
    	*statePtr = 0;
    }
    srcStart = src;
    srcEnd = src + srcLen;
    srcClose = srcEnd;
    if ((flags & TCL_ENCODING_END) == 0) {
	srcClose -= TCL_UTF_MAX;
    }

................................................................................
	    result = TCL_CONVERT_MULTIBYTE;
	    break;
	}
	if (dst > dstEnd) {
	    result = TCL_CONVERT_NOSPACE;
	    break;
	}
	src += TclUtfToUniChar(src, chPtr);

	/*
	 * Need to handle this in a way that won't cause misalignment by
	 * casting dst to a Tcl_UniChar. [Bug 1122671]
	 */

#ifdef WORDS_BIGENDIAN
#if TCL_UTF_MAX > 4
	*dst++ = (*chPtr >> 24);
	*dst++ = ((*chPtr >> 16) & 0xFF);
	*dst++ = ((*chPtr >> 8) & 0xFF);
	*dst++ = (*chPtr & 0xFF);
#else
	*dst++ = (*chPtr >> 8);
	*dst++ = (*chPtr & 0xFF);
#endif
#else
#if TCL_UTF_MAX > 4
	*dst++ = (*chPtr & 0xFF);
	*dst++ = ((*chPtr >> 8) & 0xFF);
	*dst++ = ((*chPtr >> 16) & 0xFF);
	*dst++ = (*chPtr >> 24);
#else
	*dst++ = (*chPtr & 0xFF);
	*dst++ = (*chPtr >> 8);
#endif
#endif
    }
    *srcReadPtr = src - srcStart;
    *dstWrotePtr = dst - dstStart;
    *dstCharsPtr = numChars;
    return result;
................................................................................
	     */

	    result = TCL_CONVERT_MULTIBYTE;
	    break;
	}
	len = TclUtfToUniChar(src, &ch);

#if TCL_UTF_MAX > 4
	/*
	 * This prevents a crash condition. More evaluation is required for
	 * full support of int Tcl_UniChar. [Bug 1004065]
	 */

	if (ch & 0xffff0000) {
	    word = 0;
	} else
#elif TCL_UTF_MAX == 4
	if (!len) {
	    word = 0;
	} else
#endif
	    word = fromUnicode[(ch >> 8)][ch & 0xff];

	if ((word == 0) && (ch != 0)) {
	    if (flags & TCL_ENCODING_STOPONERROR) {
		result = TCL_CONVERT_UNKNOWN;
................................................................................
	}
	len = TclUtfToUniChar(src, &ch);

	/*
	 * Check for illegal characters.
	 */

	if (ch > 0xff
#if TCL_UTF_MAX == 4
		|| !len
#endif
		) {
	    if (flags & TCL_ENCODING_STOPONERROR) {
		result = TCL_CONVERT_UNKNOWN;
		break;
	    }
#if TCL_UTF_MAX == 4
	    if (!len) len = 4;
#endif

	    /*
	     * Plunge on, using '?' as a fallback character.
	     */

	    ch = (Tcl_UniChar) '?';
	}