Tcl Source Code

Check-in [78f530026b]
Login
Bounty program for improvements to Tcl and certain Tcl packages.
Tcl 2019 Conference, Houston/TX, US, Nov 4-8
Send your abstracts to [email protected]
or submit via the online form by Sep 9.

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Merge 8.7
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA3-256: 78f530026be0e2ab929f4c69fba03edd73081c1e8db88625acf40ecbd1881316
User & Date: dgp 2018-04-24 13:51:12
Context
2018-04-24
23:00
Merge fork check-in: 0f76722982 user: andy tags: trunk
13:51
Merge 8.7 check-in: 78f530026b user: dgp tags: trunk
13:45
Merge 8.6 (bug-fix and test-case for Tcl_UtfAtIndex with TCL_UTF_MAX=4) ((Replacement checkin for ea... check-in: a4b3649641 user: dgp tags: core-8-branch
2018-04-23
15:07
merge 8.7 check-in: aaa3ab6a72 user: jan.nijtmans tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to generic/tclUtf.c.

210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
...
751
752
753
754
755
756
757
758


759
760
761
762
763
764
765
...
768
769
770
771
772
773
774

775
776






777
778

779
780
781
782
783
784
785
...
863
864
865
866
867
868
869

870
871
872
873
874
875

876
877
878
879
880
881
882
...
924
925
926
927
928
929
930

931
932
933
934
935
936

937
938
939
940
941
942
943
...
988
989
990
991
992
993
994

995
996
997
998
999
1000

1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013

1014
1015
1016
1017
1018
1019

1020
1021
1022
1023
1024
1025
1026
				 * to this previously initialized DString. */
{
    const Tcl_UniChar *w, *wEnd;
    char *p, *string;
    int oldLength;

    /*
     * UTF-8 string length in bytes will be <= Unicode string length *
     * TCL_UTF_MAX.
     */

    oldLength = Tcl_DStringLength(dsPtr);
    Tcl_DStringSetLength(dsPtr, (oldLength + uniLength + 1) * TCL_UTF_MAX);
    string = Tcl_DStringValue(dsPtr) + oldLength;

    p = string;
    wEnd = uniStr + uniLength;
    for (w = uniStr; w < wEnd; ) {
	p += Tcl_UniCharToUtf(*w, p);
	w++;
................................................................................
 
/*
 *---------------------------------------------------------------------------
 *
 * Tcl_UtfAtIndex --
 *
 *	Returns a pointer to the specified character (not byte) position in
 *	the UTF-8 string.


 *
 * Results:
 *	As above.
 *
 * Side effects:
 *	None.
 *
................................................................................

const char *
Tcl_UtfAtIndex(
    register const char *src,	/* The UTF-8 string. */
    register int index)		/* The position of the desired character. */
{
    Tcl_UniChar ch = 0;


    while (index-- > 0) {






	src += TclUtfToUniChar(src, &ch);
    }

    return src;
}
 
/*
 *---------------------------------------------------------------------------
 *
 * Tcl_UtfBackslash --
................................................................................
     * Iterate over the string until we hit the terminating null.
     */

    src = dst = str;
    while (*src) {
	bytes = TclUtfToUniChar(src, &ch);
	upChar = ch;

	if (!bytes) {
	    /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
	    bytes = TclUtfToUniChar(src, &ch);
	    /* Combine surrogates */
	    upChar = (((upChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
	}

	upChar = Tcl_UniCharToUpper(upChar);

	/*
	 * To keep badly formed Utf strings from getting inflated by the
	 * conversion (thereby causing a segfault), only copy the upper case
	 * char to dst if its size is <= the original char.
	 */
................................................................................
     * Iterate over the string until we hit the terminating null.
     */

    src = dst = str;
    while (*src) {
	bytes = TclUtfToUniChar(src, &ch);
	lowChar = ch;

	if (!bytes) {
	    /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
	    bytes = TclUtfToUniChar(src, &ch);
	    /* Combine surrogates */
	    lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
	}

	lowChar = Tcl_UniCharToLower(lowChar);

	/*
	 * To keep badly formed Utf strings from getting inflated by the
	 * conversion (thereby causing a segfault), only copy the lower case
	 * char to dst if its size is <= the original char.
	 */
................................................................................
     */

    src = dst = str;

    if (*src) {
	bytes = TclUtfToUniChar(src, &ch);
	titleChar = ch;

	if (!bytes) {
	    /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
	    bytes = TclUtfToUniChar(src, &ch);
	    /* Combine surrogates */
	    titleChar = (((titleChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
	}

	titleChar = Tcl_UniCharToTitle(titleChar);

	if (bytes < TclUtfCount(titleChar)) {
	    memcpy(dst, src, (size_t) bytes);
	    dst += bytes;
	} else {
	    dst += Tcl_UniCharToUtf(titleChar, dst);
	}
	src += bytes;
    }
    while (*src) {
	bytes = TclUtfToUniChar(src, &ch);
	lowChar = ch;

	if (!bytes) {
	    /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
	    bytes = TclUtfToUniChar(src, &ch);
	    /* Combine surrogates */
	    lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
	}

	lowChar = Tcl_UniCharToLower(lowChar);

	if (bytes < TclUtfCount(lowChar)) {
	    memcpy(dst, src, (size_t) bytes);
	    dst += bytes;
	} else {
	    dst += Tcl_UniCharToUtf(lowChar, dst);






|
<



|







 







|
>
>







 







>


>
>
>
>
>
>


>







 







>






>







 







>






>







 







>






>













>






>







210
211
212
213
214
215
216
217

218
219
220
221
222
223
224
225
226
227
228
...
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
...
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
...
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
...
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
....
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
				 * to this previously initialized DString. */
{
    const Tcl_UniChar *w, *wEnd;
    char *p, *string;
    int oldLength;

    /*
     * UTF-8 string length in bytes will be <= Unicode string length * 4.

     */

    oldLength = Tcl_DStringLength(dsPtr);
    Tcl_DStringSetLength(dsPtr, (oldLength + uniLength + 1) * 4);
    string = Tcl_DStringValue(dsPtr) + oldLength;

    p = string;
    wEnd = uniStr + uniLength;
    for (w = uniStr; w < wEnd; ) {
	p += Tcl_UniCharToUtf(*w, p);
	w++;
................................................................................
 
/*
 *---------------------------------------------------------------------------
 *
 * Tcl_UtfAtIndex --
 *
 *	Returns a pointer to the specified character (not byte) position in
 *	the UTF-8 string. If TCL_UTF_MAX <= 4, characters > U+FFFF count as
 *      2 positions, but then the pointer should never be placed between
 *      the two positions.
 *
 * Results:
 *	As above.
 *
 * Side effects:
 *	None.
 *
................................................................................

const char *
Tcl_UtfAtIndex(
    register const char *src,	/* The UTF-8 string. */
    register int index)		/* The position of the desired character. */
{
    Tcl_UniChar ch = 0;
    int len = 1;

    while (index-- > 0) {
	len = TclUtfToUniChar(src, &ch);
	src += len;
    }
#if TCL_UTF_MAX <= 4
     if (!len) {
	/* Index points at character following High Surrogate */
	src += TclUtfToUniChar(src, &ch);
    }
#endif
    return src;
}
 
/*
 *---------------------------------------------------------------------------
 *
 * Tcl_UtfBackslash --
................................................................................
     * Iterate over the string until we hit the terminating null.
     */

    src = dst = str;
    while (*src) {
	bytes = TclUtfToUniChar(src, &ch);
	upChar = ch;
#if TCL_UTF_MAX <= 4
	if (!bytes) {
	    /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
	    bytes = TclUtfToUniChar(src, &ch);
	    /* Combine surrogates */
	    upChar = (((upChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
	}
#endif
	upChar = Tcl_UniCharToUpper(upChar);

	/*
	 * To keep badly formed Utf strings from getting inflated by the
	 * conversion (thereby causing a segfault), only copy the upper case
	 * char to dst if its size is <= the original char.
	 */
................................................................................
     * Iterate over the string until we hit the terminating null.
     */

    src = dst = str;
    while (*src) {
	bytes = TclUtfToUniChar(src, &ch);
	lowChar = ch;
#if TCL_UTF_MAX <= 4
	if (!bytes) {
	    /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
	    bytes = TclUtfToUniChar(src, &ch);
	    /* Combine surrogates */
	    lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
	}
#endif
	lowChar = Tcl_UniCharToLower(lowChar);

	/*
	 * To keep badly formed Utf strings from getting inflated by the
	 * conversion (thereby causing a segfault), only copy the lower case
	 * char to dst if its size is <= the original char.
	 */
................................................................................
     */

    src = dst = str;

    if (*src) {
	bytes = TclUtfToUniChar(src, &ch);
	titleChar = ch;
#if TCL_UTF_MAX <= 4
	if (!bytes) {
	    /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
	    bytes = TclUtfToUniChar(src, &ch);
	    /* Combine surrogates */
	    titleChar = (((titleChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
	}
#endif
	titleChar = Tcl_UniCharToTitle(titleChar);

	if (bytes < TclUtfCount(titleChar)) {
	    memcpy(dst, src, (size_t) bytes);
	    dst += bytes;
	} else {
	    dst += Tcl_UniCharToUtf(titleChar, dst);
	}
	src += bytes;
    }
    while (*src) {
	bytes = TclUtfToUniChar(src, &ch);
	lowChar = ch;
#if TCL_UTF_MAX <= 4
	if (!bytes) {
	    /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
	    bytes = TclUtfToUniChar(src, &ch);
	    /* Combine surrogates */
	    lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
	}
#endif
	lowChar = Tcl_UniCharToLower(lowChar);

	if (bytes < TclUtfCount(lowChar)) {
	    memcpy(dst, src, (size_t) bytes);
	    dst += bytes;
	} else {
	    dst += Tcl_UniCharToUtf(lowChar, dst);

Changes to tests/string.test.

27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
...
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
....
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
....
1721
1722
1723
1724
1725
1726
1727




1728
1729
1730
1731
1732
1733
1734
proc makeShared {s} {uplevel 1 [list lappend copy $s]; return $s}

# Some tests require the testobj command

testConstraint testobj [expr {[info commands testobj] ne {}}]
testConstraint testindexobj [expr {[info commands testindexobj] ne {}}]
testConstraint testevalex [expr {[info commands testevalex] ne {}}]
testConstraint fullutf [expr {[format %c 0x010000] != "\ufffd"}]

# Used for constraining memory leak tests
testConstraint memory [llength [info commands memory]]
if {[testConstraint memory]} {
    proc getbytes {} {
        set lines [split [memory info] \n]
        return [lindex $lines 3 3]
................................................................................
} -match glob -result {1 {*}}
test string-5.19.$noComp {string index, bytearray object out of bounds} {
    run {string index [binary format I* {0x50515253 0x52}] -1}
} {}
test string-5.20.$noComp {string index, bytearray object out of bounds} {
    run {string index [binary format I* {0x50515253 0x52}] 20}
} {}
test string-5.21.$noComp {string index, surrogates, bug [11ae2be95dac9417]} fullutf {
    list [string index a\U100000b 1] [string index a\U100000b 2] [string index a\U100000b 3]
} [list \U100000 {} b]


proc largest_int {} {
    # This will give us what the largest valid int on this machine is,
    # so we can test for overflow properly below on >32 bit systems
    set int 1
................................................................................
    list $input_hex $rxBuffer_hex $rxCRC_hex
} {000341 000341 0341}
test string-12.22.$noComp {string range, shimmering binary/index} {
    set s 0000000001
    binary scan $s a* x
    run {string range $s $s end}
} 000000001
test string-12.23.$noComp {string range, surrogates, bug [11ae2be95dac9417]} fullutf {
    run {list [string range a\U100000b 1 1] [string range a\U100000b 2 2] [string range a\U100000b 3 3]}
} [list \U100000 {} b]

test string-13.1.$noComp {string repeat} {
    list [catch {run {string repeat}} msg] $msg
} {1 {wrong # args: should be "string repeat string count"}}
test string-13.2.$noComp {string repeat} {
................................................................................
} "Abcabc\xe7\xe7"
test string-17.7.$noComp {string totitle, unicode} {
    run {string totitle \u01f3BCabc\xc7\xe7}
} "\u01f2bcabc\xe7\xe7"
test string-17.8.$noComp {string totitle, compiled} {
    lindex [run {string totitle [list aa bb [list cc]]}] 0
} Aa





test string-18.1.$noComp {string trim} {
    list [catch {run {string trim}} msg] $msg
} {1 {wrong # args: should be "string trim string ?chars?"}}
test string-18.2.$noComp {string trim} {
    list [catch {run {string trim a b c}} msg] $msg
} {1 {wrong # args: should be "string trim string ?chars?"}}






|







 







|
|







 







|







 







>
>
>
>







27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
...
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
....
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
....
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
proc makeShared {s} {uplevel 1 [list lappend copy $s]; return $s}

# Some tests require the testobj command

testConstraint testobj [expr {[info commands testobj] ne {}}]
testConstraint testindexobj [expr {[info commands testindexobj] ne {}}]
testConstraint testevalex [expr {[info commands testevalex] ne {}}]
testConstraint tip389 [expr {[string length \U010000] == 2}]

# Used for constraining memory leak tests
testConstraint memory [llength [info commands memory]]
if {[testConstraint memory]} {
    proc getbytes {} {
        set lines [split [memory info] \n]
        return [lindex $lines 3 3]
................................................................................
} -match glob -result {1 {*}}
test string-5.19.$noComp {string index, bytearray object out of bounds} {
    run {string index [binary format I* {0x50515253 0x52}] -1}
} {}
test string-5.20.$noComp {string index, bytearray object out of bounds} {
    run {string index [binary format I* {0x50515253 0x52}] 20}
} {}
test string-5.21.$noComp {string index, surrogates, bug [11ae2be95dac9417]} tip389 {
    run {list [string index a\U100000b 1] [string index a\U100000b 2] [string index a\U100000b 3]}
} [list \U100000 {} b]


proc largest_int {} {
    # This will give us what the largest valid int on this machine is,
    # so we can test for overflow properly below on >32 bit systems
    set int 1
................................................................................
    list $input_hex $rxBuffer_hex $rxCRC_hex
} {000341 000341 0341}
test string-12.22.$noComp {string range, shimmering binary/index} {
    set s 0000000001
    binary scan $s a* x
    run {string range $s $s end}
} 000000001
test string-12.23.$noComp {string range, surrogates, bug [11ae2be95dac9417]} tip389 {
    run {list [string range a\U100000b 1 1] [string range a\U100000b 2 2] [string range a\U100000b 3 3]}
} [list \U100000 {} b]

test string-13.1.$noComp {string repeat} {
    list [catch {run {string repeat}} msg] $msg
} {1 {wrong # args: should be "string repeat string count"}}
test string-13.2.$noComp {string repeat} {
................................................................................
} "Abcabc\xe7\xe7"
test string-17.7.$noComp {string totitle, unicode} {
    run {string totitle \u01f3BCabc\xc7\xe7}
} "\u01f2bcabc\xe7\xe7"
test string-17.8.$noComp {string totitle, compiled} {
    lindex [run {string totitle [list aa bb [list cc]]}] 0
} Aa
test string-17.9.$noComp {string totitle, surrogates, bug [11ae2be95dac9417]} tip389 {
    run {list [string totitle a\U118c0c 1 1] [string totitle a\U118c0c 2 2] \
	[string totitle a\U118c0c 3 3]}
} [list a\U118a0c a\U118c0C a\U118c0C]

test string-18.1.$noComp {string trim} {
    list [catch {run {string trim}} msg] $msg
} {1 {wrong # args: should be "string trim string ?chars?"}}
test string-18.2.$noComp {string trim} {
    list [catch {run {string trim a b c}} msg] $msg
} {1 {wrong # args: should be "string trim string ?chars?"}}