Tcl Source Code

Check-in [2b82daafb8]
Login
Bounty program for improvements to Tcl and certain Tcl packages.
Tcl 2019 Conference, Houston/TX, US, Nov 4-8
Send your abstracts to [email protected]
or submit via the online form by Sep 9.

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Merge 8.7
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA3-256: 2b82daafb8c4f1df0028b7abf360436eaa0961e68e997922e7ab5ecbc4da3501
User & Date: jan.nijtmans 2019-02-19 20:17:01
Context
2019-02-22
20:24
merge trunk Closed-Leaf check-in: 01853d6504 user: dgp tags: dgp-scratch
18:09
Scratch workspace branch to ease difficult merges. Closed-Leaf check-in: 67392b9ff9 user: dgp tags: mistake
18:01
merge 8.7 check-in: adfe04dbbd user: dgp tags: trunk
2019-02-19
20:17
Merge 8.7 check-in: 2b82daafb8 user: jan.nijtmans tags: trunk
20:16
Merge 8.6 check-in: 1b17625b60 user: jan.nijtmans tags: core-8-branch
2019-02-18
20:42
Merge 8.7 check-in: 7aa80fc0ad user: jan.nijtmans tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to doc/Utf.3.

125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
sequence consists of a lead byte followed by some number of trail bytes.
.PP
\fBTCL_UTF_MAX\fR is the maximum number of bytes that it takes to
represent one Unicode character in the UTF-8 representation.
.PP
\fBTcl_UniCharToUtf\fR stores the character \fIch\fR as a UTF-8 string
in starting at \fIbuf\fR.  The return value is the number of bytes stored
in \fIbuf\fR. If ch is an upper surrogate (range U+D800 - U+DBFF), then
the return value will be 0 and nothing will be stored. If you still
want to produce UTF-8 output for it (even though knowing it's an illegal
code-point on its own), just call \fBTcl_UniCharToUtf\fR again using ch = -1.
.PP
\fBTcl_UtfToUniChar\fR reads one UTF-8 character starting at \fIsrc\fR
and stores it as a Tcl_UniChar in \fI*chPtr\fR.  The return value is the
number of bytes read from \fIsrc\fR.  The caller must ensure that the






|







125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
sequence consists of a lead byte followed by some number of trail bytes.
.PP
\fBTCL_UTF_MAX\fR is the maximum number of bytes that it takes to
represent one Unicode character in the UTF-8 representation.
.PP
\fBTcl_UniCharToUtf\fR stores the character \fIch\fR as a UTF-8 string
in starting at \fIbuf\fR.  The return value is the number of bytes stored
in \fIbuf\fR. If ch is a high surrogate (range U+D800 - U+DBFF), then
the return value will be 0 and nothing will be stored. If you still
want to produce UTF-8 output for it (even though knowing it's an illegal
code-point on its own), just call \fBTcl_UniCharToUtf\fR again using ch = -1.
.PP
\fBTcl_UtfToUniChar\fR reads one UTF-8 character starting at \fIsrc\fR
and stores it as a Tcl_UniChar in \fI*chPtr\fR.  The return value is the
number of bytes read from \fIsrc\fR.  The caller must ensure that the

Changes to generic/tclParse.c.

937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
  done:
    if (readPtr != NULL) {
	*readPtr = count;
    }
    count = Tcl_UniCharToUtf(result, dst);
    if (!count) {
	/* Special case for handling upper surrogates. */
	count = Tcl_UniCharToUtf(-1, dst);
    }
    return count;
}
 
/*
 *----------------------------------------------------------------------






|







937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
  done:
    if (readPtr != NULL) {
	*readPtr = count;
    }
    count = Tcl_UniCharToUtf(result, dst);
    if (!count) {
	/* Special case for handling high surrogates. */
	count = Tcl_UniCharToUtf(-1, dst);
    }
    return count;
}
 
/*
 *----------------------------------------------------------------------

Changes to generic/tclStringObj.c.

1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
	    int code, length;

	    if (TclGetIntFromObj(interp, segment, &code) != TCL_OK) {
		goto error;
	    }
	    length = Tcl_UniCharToUtf(code, buf);
	    if (!length) {
		/* Special case for handling upper surrogates. */
		length = Tcl_UniCharToUtf(-1, buf);
	    }
	    segment = Tcl_NewStringObj(buf, length);
	    Tcl_IncrRefCount(segment);
	    allocSegment = 1;
	    break;
	}






|







1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
	    int code, length;

	    if (TclGetIntFromObj(interp, segment, &code) != TCL_OK) {
		goto error;
	    }
	    length = Tcl_UniCharToUtf(code, buf);
	    if (!length) {
		/* Special case for handling high surrogates. */
		length = Tcl_UniCharToUtf(-1, buf);
	    }
	    segment = Tcl_NewStringObj(buf, length);
	    Tcl_IncrRefCount(segment);
	    allocSegment = 1;
	    break;
	}

Changes to generic/tclStubInit.c.

207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
    Tcl_DStringSetLength(dsPtr, oldLength + (len + 1) * 4);
    result = Tcl_DStringValue(dsPtr) + oldLength;

    p = result;
    wEnd = (wchar_t *)string + len;
    for (w = (wchar_t *)string; w < wEnd; ) {
	if (!blen && ((*w & 0xFC00) != 0xDC00)) {
	    /* Special case for handling upper surrogates. */
	    p += Tcl_UniCharToUtf(-1, p);
	}
	blen = Tcl_UniCharToUtf(*w, p);
	p += blen;
	w++;
    }
    if (!blen) {
	/* Special case for handling upper surrogates. */
	p += Tcl_UniCharToUtf(-1, p);
    }
    Tcl_DStringSetLength(dsPtr, oldLength + (p - result));

    return result;
#else
    return Tcl_UniCharToUtfDString((Tcl_UniChar *)string, len, dsPtr);






|







|







207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
    Tcl_DStringSetLength(dsPtr, oldLength + (len + 1) * 4);
    result = Tcl_DStringValue(dsPtr) + oldLength;

    p = result;
    wEnd = (wchar_t *)string + len;
    for (w = (wchar_t *)string; w < wEnd; ) {
	if (!blen && ((*w & 0xFC00) != 0xDC00)) {
	    /* Special case for handling high surrogates. */
	    p += Tcl_UniCharToUtf(-1, p);
	}
	blen = Tcl_UniCharToUtf(*w, p);
	p += blen;
	w++;
    }
    if (!blen) {
	/* Special case for handling high surrogates. */
	p += Tcl_UniCharToUtf(-1, p);
    }
    Tcl_DStringSetLength(dsPtr, oldLength + (p - result));

    return result;
#else
    return Tcl_UniCharToUtfDString((Tcl_UniChar *)string, len, dsPtr);

Changes to generic/tclUtf.c.

229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
...
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
    Tcl_DStringSetLength(dsPtr, oldLength + (uniLength + 1) * 4);
    string = Tcl_DStringValue(dsPtr) + oldLength;

    p = string;
    wEnd = uniStr + uniLength;
    for (w = uniStr; w < wEnd; ) {
	if (!len && ((*w & 0xFC00) != 0xDC00)) {
	    /* Special case for handling upper surrogates. */
	    p += Tcl_UniCharToUtf(-1, p);
	}
	len = Tcl_UniCharToUtf(*w, p);
	p += len;
	w++;
    }
    if (!len) {
	/* Special case for handling upper surrogates. */
	p += Tcl_UniCharToUtf(-1, p);
    }
    Tcl_DStringSetLength(dsPtr, oldLength + (p - string));

    return string;
}
 
................................................................................
#else
	src += TclUtfToUniChar(src, &ch);
#endif
    }
    fullchar = ch;
#if TCL_UTF_MAX <= 4
     if (!len) {
	/* If last Tcl_UniChar was an upper surrogate, combine with lower surrogate */
	(void)TclUtfToUniChar(src, &ch);
	fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
    }
#endif
    return fullchar;
}
 






|







|







 







|







229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
...
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
    Tcl_DStringSetLength(dsPtr, oldLength + (uniLength + 1) * 4);
    string = Tcl_DStringValue(dsPtr) + oldLength;

    p = string;
    wEnd = uniStr + uniLength;
    for (w = uniStr; w < wEnd; ) {
	if (!len && ((*w & 0xFC00) != 0xDC00)) {
	    /* Special case for handling high surrogates. */
	    p += Tcl_UniCharToUtf(-1, p);
	}
	len = Tcl_UniCharToUtf(*w, p);
	p += len;
	w++;
    }
    if (!len) {
	/* Special case for handling high surrogates. */
	p += Tcl_UniCharToUtf(-1, p);
    }
    Tcl_DStringSetLength(dsPtr, oldLength + (p - string));

    return string;
}
 
................................................................................
#else
	src += TclUtfToUniChar(src, &ch);
#endif
    }
    fullchar = ch;
#if TCL_UTF_MAX <= 4
     if (!len) {
	/* If last Tcl_UniChar was a high surrogate, combine with low surrogate */
	(void)TclUtfToUniChar(src, &ch);
	fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
    }
#endif
    return fullchar;
}
 

Changes to tests/utf.test.

40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
...
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
} 1
test utf-1.6 {Tcl_UniCharToUtf: negative Tcl_UniChar} testbytestring {
    expr {[format %c -1] eq [testbytestring "\xef\xbf\xbd"]}
} 1
test utf-1.7 {Tcl_UniCharToUtf: 4 byte sequences} -constraints testbytestring -body {
    expr {"\U014e4e" eq [testbytestring "\xf0\x94\xb9\x8e"]}
} -result 1
test utf-1.8 {Tcl_UniCharToUtf: 3 byte sequence, upper surrogate} testbytestring {
    expr {"\ud842" eq [testbytestring "\xed\xa1\x82"]}
} 1
test utf-1.9 {Tcl_UniCharToUtf: 3 byte sequence, lower surrogate} testbytestring {
    expr {"\udc42" eq [testbytestring "\xed\xb1\x82"]}
} 1
test utf-1.10 {Tcl_UniCharToUtf: 3 byte sequence, upper surrogate} testbytestring {
    expr {[format %c 0xd842] eq [testbytestring "\xed\xa1\x82"]}
} 1
test utf-1.11 {Tcl_UniCharToUtf: 3 byte sequence, lower surrogate} testbytestring {
    expr {[format %c 0xdc42] eq [testbytestring "\xed\xb1\x82"]}
} 1

test utf-2.1 {Tcl_UtfToUniChar: low ascii} {
    string length "abc"
} {3}
test utf-2.2 {Tcl_UtfToUniChar: naked trail bytes} testbytestring {
................................................................................
} "\u4e4e"
test utf-8.3 {Tcl_UniCharAtIndex: index > 0} {
    string index abcd 2
} {c}
test utf-8.4 {Tcl_UniCharAtIndex: index > 0} {
    string index \u4e4e\u25a\xff\u543 2
} "\uff"
test utf-8.5 {Tcl_UniCharAtIndex: upper surrogate} {
    string index \ud842 0
} "\ud842"
test utf-8.5 {Tcl_UniCharAtIndex: lower surrogate} {
    string index \udc42 0
} "\udc42"

test utf-9.1 {Tcl_UtfAtIndex: index = 0} {
    string range abcd 0 2
} {abc}
test utf-9.2 {Tcl_UtfAtIndex: index > 0} {






|


|


|


|







 







|


|







40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
...
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
} 1
test utf-1.6 {Tcl_UniCharToUtf: negative Tcl_UniChar} testbytestring {
    expr {[format %c -1] eq [testbytestring "\xef\xbf\xbd"]}
} 1
test utf-1.7 {Tcl_UniCharToUtf: 4 byte sequences} -constraints testbytestring -body {
    expr {"\U014e4e" eq [testbytestring "\xf0\x94\xb9\x8e"]}
} -result 1
test utf-1.8 {Tcl_UniCharToUtf: 3 byte sequence, high surrogate} testbytestring {
    expr {"\ud842" eq [testbytestring "\xed\xa1\x82"]}
} 1
test utf-1.9 {Tcl_UniCharToUtf: 3 byte sequence, low surrogate} testbytestring {
    expr {"\udc42" eq [testbytestring "\xed\xb1\x82"]}
} 1
test utf-1.10 {Tcl_UniCharToUtf: 3 byte sequence, high surrogate} testbytestring {
    expr {[format %c 0xd842] eq [testbytestring "\xed\xa1\x82"]}
} 1
test utf-1.11 {Tcl_UniCharToUtf: 3 byte sequence, low surrogate} testbytestring {
    expr {[format %c 0xdc42] eq [testbytestring "\xed\xb1\x82"]}
} 1

test utf-2.1 {Tcl_UtfToUniChar: low ascii} {
    string length "abc"
} {3}
test utf-2.2 {Tcl_UtfToUniChar: naked trail bytes} testbytestring {
................................................................................
} "\u4e4e"
test utf-8.3 {Tcl_UniCharAtIndex: index > 0} {
    string index abcd 2
} {c}
test utf-8.4 {Tcl_UniCharAtIndex: index > 0} {
    string index \u4e4e\u25a\xff\u543 2
} "\uff"
test utf-8.5 {Tcl_UniCharAtIndex: high surrogate} {
    string index \ud842 0
} "\ud842"
test utf-8.5 {Tcl_UniCharAtIndex: low surrogate} {
    string index \udc42 0
} "\udc42"

test utf-9.1 {Tcl_UtfAtIndex: index = 0} {
    string range abcd 0 2
} {abc}
test utf-9.2 {Tcl_UtfAtIndex: index > 0} {

Changes to win/tclWin32Dll.c.

563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
    Tcl_DStringSetLength(dsPtr, oldLength + (len + 1) * 4);
    result = Tcl_DStringValue(dsPtr) + oldLength;

    p = result;
    wEnd = (TCHAR *)string + len;
    for (w = (TCHAR *)string; w < wEnd; ) {
	if (!blen && ((*w & 0xFC00) != 0xDC00)) {
	    /* Special case for handling upper surrogates. */
	    p += Tcl_UniCharToUtf(-1, p);
	}
	blen = Tcl_UniCharToUtf(*w, p);
	p += blen;
	w++;
    }
    if (!blen) {
	/* Special case for handling upper surrogates. */
	p += Tcl_UniCharToUtf(-1, p);
    }
    Tcl_DStringSetLength(dsPtr, oldLength + (p - result));

    return result;
#else
    return Tcl_UniCharToUtfDString((Tcl_UniChar *)string, len, dsPtr);






|







|







563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
    Tcl_DStringSetLength(dsPtr, oldLength + (len + 1) * 4);
    result = Tcl_DStringValue(dsPtr) + oldLength;

    p = result;
    wEnd = (TCHAR *)string + len;
    for (w = (TCHAR *)string; w < wEnd; ) {
	if (!blen && ((*w & 0xFC00) != 0xDC00)) {
	    /* Special case for handling high surrogates. */
	    p += Tcl_UniCharToUtf(-1, p);
	}
	blen = Tcl_UniCharToUtf(*w, p);
	p += blen;
	w++;
    }
    if (!blen) {
	/* Special case for handling high surrogates. */
	p += Tcl_UniCharToUtf(-1, p);
    }
    Tcl_DStringSetLength(dsPtr, oldLength + (p - result));

    return result;
#else
    return Tcl_UniCharToUtfDString((Tcl_UniChar *)string, len, dsPtr);