Tcl Source Code

Check-in [8481a52495]
Login
Bounty program for improvements to Tcl and certain Tcl packages.
Tcl 2019 Conference, Houston/TX, US, Nov 4-8
Send your abstracts to [email protected]
or submit via the online form by Sep 9.

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Fix 00a27923ee: (Tcl part, remaining is in Tk) text/entry dysfunctional when pasting an emoji on MacOSX. This changes the handling of incoming valid 4-byte UTF-8 sequences: Those are no longer split in 4 separate characters (as was done for invalid byte sequences) but replaced by a single ' replacement character' .
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | core-8-branch
Files: files | file ages | folders
SHA3-256: 8481a52495dbecc1976edc52825e2cb84c6c5f3b86137747b22485169cfb70e8
User & Date: jan.nijtmans 2018-01-10 08:25:13
Context
2018-01-10
14:02
Re-implement Tcl_WinTCharToUtf/Tcl_WinUtfToTChar in pure win32 api, even for TCL_UTF_MAX=3. We can d... check-in: a2c5eee57d user: jan.nijtmans tags: core-8-branch
08:27
merge core-8-branch check-in: b3fc2fbe3d user: jan.nijtmans tags: tip-389
08:25
Fix 00a27923ee: (Tcl part, re... check-in: 8481a52495 user: jan.nijtmans tags: core-8-branch
2018-01-09
11:15
(partial) fix for 00a27923ee:... Closed-Leaf check-in: f0adfe7dac user: jan.nijtmans tags: bug-00a27923ee
00:10
Some refactoring and tidying up of comments. check-in: 85aedb56e8 user: dkf tags: core-8-branch
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to generic/tclUtf.c.

64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
...
324
325
326
327
328
329
330
331
332
333
334
335
336
337










338
339
340
341
342
343
344
...
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
...
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
...
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
#if TCL_UTF_MAX > 3
    4,4,4,4,4,4,4,4,
#else
    1,1,1,1,1,1,1,1,
#endif
    1,1,1,1,1,1,1,1
};
 
/*
 *---------------------------------------------------------------------------
 *
 * TclUtfCount --
................................................................................
	}

	/*
	 * A three-byte-character lead-byte not followed by two trail-bytes
	 * represents itself.
	 */
    }
#if TCL_UTF_MAX > 3
    else if (byte < 0xF8) {
	if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) && ((src[3] & 0xC0) == 0x80)) {
	    /*
	     * Four-byte-character lead byte followed by three trail bytes.
	     */
#if TCL_UTF_MAX == 4










	    Tcl_UniChar surrogate;

	    byte = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
		    | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)) - 0x10000;
	    surrogate = (Tcl_UniChar) (0xD800 + (byte >> 10));
	    if (byte & 0x100000) {
		/* out of range, < 0x10000 or > 0x10ffff */
................................................................................
	}

	/*
	 * A four-byte-character lead-byte not followed by two trail-bytes
	 * represents itself.
	 */
    }
#endif

    *chPtr = (Tcl_UniChar) byte;
    return 1;
}
 
/*
 *---------------------------------------------------------------------------
................................................................................
    if (length < 0) {
	while (*src != '\0') {
	    src += TclUtfToUniChar(src, &ch);
	    i++;
	}
	if (i < 0) i = INT_MAX; /* Bug [2738427] */
    } else {
	register const char *endPtr = src + length - TCL_UTF_MAX;

	while (src < endPtr) {
	    src += TclUtfToUniChar(src, &ch);
	    i++;
	}
	endPtr += TCL_UTF_MAX;
	while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) {
	    src += TclUtfToUniChar(src, &ch);
	    i++;
	}
	if (src < endPtr) {
	    i += endPtr - src;
	}
................................................................................
    const char *start)		/* Pointer to the beginning of the string, to
				 * avoid going backwards too far. */
{
    const char *look;
    int i, byte;

    look = --src;
    for (i = 0; i < TCL_UTF_MAX; i++) {
	if (look < start) {
	    if (src < start) {
		src = start;
	    }
	    break;
	}
	byte = *((unsigned char *) look);






<

<
<
<







 







<





|
>
>
>
>
>
>
>
>
>
>







 







<







 







|





|







 







|







64
65
66
67
68
69
70

71



72
73
74
75
76
77
78
...
320
321
322
323
324
325
326

327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
...
366
367
368
369
370
371
372

373
374
375
376
377
378
379
...
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
...
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,

    4,4,4,4,4,4,4,4,



    1,1,1,1,1,1,1,1
};
 
/*
 *---------------------------------------------------------------------------
 *
 * TclUtfCount --
................................................................................
	}

	/*
	 * A three-byte-character lead-byte not followed by two trail-bytes
	 * represents itself.
	 */
    }

    else if (byte < 0xF8) {
	if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) && ((src[3] & 0xC0) == 0x80)) {
	    /*
	     * Four-byte-character lead byte followed by three trail bytes.
	     */
#if TCL_UTF_MAX == 3
	    byte = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
		    | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)) - 0x10000;
	    if (byte & 0x100000) {
		/* out of range, < 0x10000 or > 0x10ffff */
	    } else {
		/* produce replacement character, and advance source pointer */
		*chPtr = (Tcl_UniChar) 0xFFFD;
		return 4;
	    }
#elif TCL_UTF_MAX == 4
	    Tcl_UniChar surrogate;

	    byte = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
		    | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)) - 0x10000;
	    surrogate = (Tcl_UniChar) (0xD800 + (byte >> 10));
	    if (byte & 0x100000) {
		/* out of range, < 0x10000 or > 0x10ffff */
................................................................................
	}

	/*
	 * A four-byte-character lead-byte not followed by two trail-bytes
	 * represents itself.
	 */
    }


    *chPtr = (Tcl_UniChar) byte;
    return 1;
}
 
/*
 *---------------------------------------------------------------------------
................................................................................
    if (length < 0) {
	while (*src != '\0') {
	    src += TclUtfToUniChar(src, &ch);
	    i++;
	}
	if (i < 0) i = INT_MAX; /* Bug [2738427] */
    } else {
	register const char *endPtr = src + length - 4;

	while (src < endPtr) {
	    src += TclUtfToUniChar(src, &ch);
	    i++;
	}
	endPtr += 4;
	while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) {
	    src += TclUtfToUniChar(src, &ch);
	    i++;
	}
	if (src < endPtr) {
	    i += endPtr - src;
	}
................................................................................
    const char *start)		/* Pointer to the beginning of the string, to
				 * avoid going backwards too far. */
{
    const char *look;
    int i, byte;

    look = --src;
    for (i = 0; i < 4; i++) {
	if (look < start) {
	    if (src < start) {
		src = start;
	    }
	    break;
	}
	byte = *((unsigned char *) look);