Tcl Source Code

Check-in [b2521a4844]
Login
Bounty program for improvements to Tcl and certain Tcl packages.
Tcl 2019 Conference, Houston/TX, US, Nov 4-8
Send your abstracts to [email protected]
or submit via the online form by Sep 9.

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Treat invalid UTF-8 characters in the range 0x80-0x9F as cp1252: See https://en.wikipedia.org/wiki/UTF-8. To be added to TIP #389
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | tip-389
Files: files | file ages | folders
SHA3-256: b2521a484432b2861bbece7b72e0d5e30337992828653845fdf90cc096b28e64
User & Date: jan.nijtmans 2017-11-29 08:59:49
Context
2017-11-29
09:49
Fix [8e1e31eac0]: lsort treats NUL chars strangely check-in: e2a6110884 user: jan.nijtmans tags: tip-389
08:59
Treat invalid UTF-8 characters in the range 0x80-0x9F as cp1252: See [https://en.wikipedia.org/wiki/... check-in: b2521a4844 user: jan.nijtmans tags: tip-389
08:41
merge core-8-branch check-in: 32734bbfaf user: jan.nijtmans tags: tip-389
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to doc/Utf.3.

136
137
138
139
140
141
142



143
144
145
146
147
148
149
.PP
\fBTcl_UtfToUniChar\fR reads one UTF-8 character starting at \fIsrc\fR
and stores it as a Tcl_UniChar in \fI*chPtr\fR.  The return value is the
number of bytes read from \fIsrc\fR.  The caller must ensure that the
source buffer is long enough such that this routine does not run off the
end and dereference non-existent or random memory; if the source buffer
is known to be null-terminated, this will not happen.  If the input is



not in proper UTF-8 format, \fBTcl_UtfToUniChar\fR will store the first
byte of \fIsrc\fR in \fI*chPtr\fR as a Tcl_UniChar between 0x0000 and
0x00ff and return 1.
.PP
\fBTcl_UniCharToUtfDString\fR converts the given Unicode string
to UTF-8, storing the result in a previously initialized \fBTcl_DString\fR.
You must specify \fIuniLength\fR, the length of the given Unicode string.






>
>
>







136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
.PP
\fBTcl_UtfToUniChar\fR reads one UTF-8 character starting at \fIsrc\fR
and stores it as a Tcl_UniChar in \fI*chPtr\fR.  The return value is the
number of bytes read from \fIsrc\fR.  The caller must ensure that the
source buffer is long enough such that this routine does not run off the
end and dereference non-existent or random memory; if the source buffer
is known to be null-terminated, this will not happen.  If the input is
a byte in the range 0x80 - 0x9F, \fBTcl_UtfToUniChar\fR assumes the
cp1252 encoding, stores the corresponding Tcl_UniChar in \fI*chPtr\fR
and returns 1. If the input is otherwise
not in proper UTF-8 format, \fBTcl_UtfToUniChar\fR will store the first
byte of \fIsrc\fR in \fI*chPtr\fR as a Tcl_UniChar between 0x0000 and
0x00ff and return 1.
.PP
\fBTcl_UniCharToUtfDString\fR converts the given Unicode string
to UTF-8, storing the result in a previously initialized \fBTcl_DString\fR.
You must specify \fIuniLength\fR, the length of the given Unicode string.

Changes to generic/tclInt.h.

4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
 * the result of Tcl_UtfToUniChar. The ANSI C "prototype" for this macro is:
 *
 * MODULE_SCOPE int	TclUtfToUniChar(const char *string, Tcl_UniChar *ch);
 *----------------------------------------------------------------
 */

#define TclUtfToUniChar(str, chPtr) \
	((((unsigned char) *(str)) < 0xC0) ?		\
	    ((*(chPtr) = (unsigned char) *(str)), 1)	\
	    : Tcl_UtfToUniChar(str, chPtr))

/*
 *----------------------------------------------------------------
 * Macro counterpart of the Tcl_NumUtfChars() function. To be used in speed-
 * -sensitive points where it pays to avoid a function call in the common case






|







4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
 * the result of Tcl_UtfToUniChar. The ANSI C "prototype" for this macro is:
 *
 * MODULE_SCOPE int	TclUtfToUniChar(const char *string, Tcl_UniChar *ch);
 *----------------------------------------------------------------
 */

#define TclUtfToUniChar(str, chPtr) \
	((((unsigned char) *(str)) < 0x80) ?		\
	    ((*(chPtr) = (unsigned char) *(str)), 1)	\
	    : Tcl_UtfToUniChar(str, chPtr))

/*
 *----------------------------------------------------------------
 * Macro counterpart of the Tcl_NumUtfChars() function. To be used in speed-
 * -sensitive points where it pays to avoid a function call in the common case

Changes to generic/tclUtf.c.

267
268
269
270
271
272
273







274
275
276
277
278
279
280
...
284
285
286
287
288
289
290
291


292
293
294



295

296
297
298
299
300
301
302
 *	number of bytes from the UTF-8 string that were consumed.
 *
 * Side effects:
 *	None.
 *
 *---------------------------------------------------------------------------
 */








int
Tcl_UtfToUniChar(
    register const char *src,	/* The UTF-8 string. */
    register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by
				 * the UTF-8 string. */
{
................................................................................
     * Unroll 1 to 3 (or 4) byte UTF-8 sequences.
     */

    byte = *((unsigned char *) src);
    if (byte < 0xC0) {
	/*
	 * Handles properly formed UTF-8 characters between 0x01 and 0x7F.
	 * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid


	 * characters representing themselves.
	 */




	*chPtr = (Tcl_UniChar) byte;

	return 1;
    } else if (byte < 0xE0) {
	if ((src[1] & 0xC0) == 0x80) {
	    /*
	     * Two-byte-character lead-byte followed by a trail-byte.
	     */







>
>
>
>
>
>
>







 







|
>
>



>
>
>
|
>







267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
...
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
 *	number of bytes from the UTF-8 string that were consumed.
 *
 * Side effects:
 *	None.
 *
 *---------------------------------------------------------------------------
 */

static const unsigned short cp1252[32] = {
  0x20ac,   0x81, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
  0x02C6, 0x2030, 0x0160, 0x2039, 0x0152,   0x8D, 0x017D,   0x8F,
    0x90, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
   0x2DC, 0x2122, 0x0161, 0x203A, 0x0153,   0x9D, 0x017E, 0x0178
};

int
Tcl_UtfToUniChar(
    register const char *src,	/* The UTF-8 string. */
    register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by
				 * the UTF-8 string. */
{
................................................................................
     * Unroll 1 to 3 (or 4) byte UTF-8 sequences.
     */

    byte = *((unsigned char *) src);
    if (byte < 0xC0) {
	/*
	 * Handles properly formed UTF-8 characters between 0x01 and 0x7F.
	 * Treats naked trail bytes 0x80 to 0x9F as valid characters from
	 * the cp1252 table. See: <https://en.wikipedia.org/wiki/UTF-8>
	 * Also treats \0 and other naked trail bytes 0xA0 to 0xBF as valid
	 * characters representing themselves.
	 */

	if ((unsigned)(byte-0x80) < (unsigned) 0x20) {
	    *chPtr = (Tcl_UniChar) cp1252[byte-0x80];
	} else {
	    *chPtr = (Tcl_UniChar) byte;
	}
	return 1;
    } else if (byte < 0xE0) {
	if ((src[1] & 0xC0) == 0x80) {
	    /*
	     * Two-byte-character lead-byte followed by a trail-byte.
	     */