Tcl Source Code

Check-in [aa4790b28f]
Login
Bounty program for improvements to Tcl and certain Tcl packages.
Tcl 2019 Conference, Houston/TX, US, Nov 4-8
Send your abstracts to [email protected]
or submit via the online form by Sep 9.

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:merge 8.7
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA3-256: aa4790b28f943eb74f0df8cff881edd9562c06d06d3abe5b23368c1029e3d55a
User & Date: jan.nijtmans 2018-05-07 07:42:03
Context
2018-05-08
15:22
merge 8.7 check-in: 966bdc055f user: dgp tags: trunk
2018-05-07
07:42
merge 8.7 check-in: aa4790b28f user: jan.nijtmans tags: trunk
07:40
Remove some tip389 restrictions in test-cases, which are no longer necessary. Eliminate gcc compile... check-in: 385fda311b user: jan.nijtmans tags: core-8-branch
2018-05-04
19:07
Update TZ info to tzdata2018e. check-in: e95474c36a user: jima tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to generic/tcl.h.

1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
#define TCL_CONVERT_MULTIBYTE	(-1)
#define TCL_CONVERT_SYNTAX	(-2)
#define TCL_CONVERT_UNKNOWN	(-3)
#define TCL_CONVERT_NOSPACE	(-4)

/*
 * The maximum number of bytes that are necessary to represent a single
 * Unicode character in UTF-8. The valid values should be 3, 4 or 6
 * (or perhaps 1 if we want to support a non-unicode enabled core). If 3 or
 * 4, then Tcl_UniChar must be 2-bytes in size (UCS-2) (the default). If 6,
 * then Tcl_UniChar must be 4-bytes in size (UCS-4). At this time UCS-2 mode
 * is the default and recommended mode. UCS-4 is experimental and not
 * recommended. It works for the core, but most extensions expect UCS-2.
 */

#ifndef TCL_UTF_MAX
#define TCL_UTF_MAX		4






|
|
|







1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
#define TCL_CONVERT_MULTIBYTE	(-1)
#define TCL_CONVERT_SYNTAX	(-2)
#define TCL_CONVERT_UNKNOWN	(-3)
#define TCL_CONVERT_NOSPACE	(-4)

/*
 * The maximum number of bytes that are necessary to represent a single
 * Unicode character in UTF-8. The valid values are 4 and 6
 * (or perhaps 1 if we want to support a non-unicode enabled core). If 4,
 * then Tcl_UniChar must be 2-bytes in size (UCS-2) (the default). If 6,
 * then Tcl_UniChar must be 4-bytes in size (UCS-4). At this time UCS-2 mode
 * is the default and recommended mode. UCS-4 is experimental and not
 * recommended. It works for the core, but most extensions expect UCS-2.
 */

#ifndef TCL_UTF_MAX
#define TCL_UTF_MAX		4

Changes to generic/tclCmdMZ.c.

593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
....
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
....
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
	    && (strpbrk(TclGetString(objv[2]), "&\\") == NULL)
	    && (strpbrk(TclGetString(objv[0]), "*+?{}()[].\\|^$") == NULL)) {
	/*
	 * This is a simple one pair string map situation. We make use of a
	 * slightly modified version of the one pair STR_MAP code.
	 */

	int slen, nocase;
	int (*strCmpFn)(const Tcl_UniChar*,const Tcl_UniChar*,unsigned long);
	Tcl_UniChar *p, wsrclc;

	numMatches = 0;
	nocase = (cflags & TCL_REG_NOCASE);
	strCmpFn = nocase ? Tcl_UniCharNcasecmp : Tcl_UniCharNcmp;

	wsrc = Tcl_GetUnicodeFromObj(objv[0], &slen);
	wstring = Tcl_GetUnicodeFromObj(objv[1], &wlen);
................................................................................
	/*
	 * Special case for one map pair which avoids the extra for loop and
	 * extra calls to get Unicode data. The algorithm is otherwise
	 * identical to the multi-pair case. This will be >30% faster on
	 * larger strings.
	 */

	int mapLen;
	Tcl_UniChar *mapString, u2lc;

	ustring2 = Tcl_GetUnicodeFromObj(mapElemv[0], &length2);
	p = ustring1;
	if ((length2 > length1) || (length2 == 0)) {
	    /*
	     * Match string is either longer than input or empty.
	     */
................................................................................
		    ustring1 = p - 1;

		    Tcl_AppendUnicodeToObj(resultPtr, mapString, mapLen);
		}
	    }
	}
    } else {
	Tcl_UniChar **mapStrings, *u2lc = NULL;
	int *mapLens;

	/*
	 * Precompute pointers to the unicode string and length. This saves us
	 * repeated function calls later, significantly speeding up the
	 * algorithm. We only need the lowercase first char in the nocase
	 * case.
	 */

	mapStrings = TclStackAlloc(interp, mapElemc*2*sizeof(Tcl_UniChar *));
	mapLens = TclStackAlloc(interp, mapElemc * 2 * sizeof(int));
	if (nocase) {
	    u2lc = TclStackAlloc(interp, mapElemc * sizeof(Tcl_UniChar));
	}
	for (index = 0; index < mapElemc; index++) {
	    mapStrings[index] = Tcl_GetUnicodeFromObj(mapElemv[index],
		    mapLens+index);
	    if (nocase && ((index % 2) == 0)) {
		u2lc[index/2] = Tcl_UniCharToLower(*mapStrings[index]);
	    }






|

|







 







|
|







 







|
|











|







593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
....
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
....
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
	    && (strpbrk(TclGetString(objv[2]), "&\\") == NULL)
	    && (strpbrk(TclGetString(objv[0]), "*+?{}()[].\\|^$") == NULL)) {
	/*
	 * This is a simple one pair string map situation. We make use of a
	 * slightly modified version of the one pair STR_MAP code.
	 */

	int slen, nocase, wsrclc;
	int (*strCmpFn)(const Tcl_UniChar*,const Tcl_UniChar*,unsigned long);
	Tcl_UniChar *p;

	numMatches = 0;
	nocase = (cflags & TCL_REG_NOCASE);
	strCmpFn = nocase ? Tcl_UniCharNcasecmp : Tcl_UniCharNcmp;

	wsrc = Tcl_GetUnicodeFromObj(objv[0], &slen);
	wstring = Tcl_GetUnicodeFromObj(objv[1], &wlen);
................................................................................
	/*
	 * Special case for one map pair which avoids the extra for loop and
	 * extra calls to get Unicode data. The algorithm is otherwise
	 * identical to the multi-pair case. This will be >30% faster on
	 * larger strings.
	 */

	int mapLen, u2lc;
	Tcl_UniChar *mapString;

	ustring2 = Tcl_GetUnicodeFromObj(mapElemv[0], &length2);
	p = ustring1;
	if ((length2 > length1) || (length2 == 0)) {
	    /*
	     * Match string is either longer than input or empty.
	     */
................................................................................
		    ustring1 = p - 1;

		    Tcl_AppendUnicodeToObj(resultPtr, mapString, mapLen);
		}
	    }
	}
    } else {
	Tcl_UniChar **mapStrings;
	int *mapLens, *u2lc = NULL;

	/*
	 * Precompute pointers to the unicode string and length. This saves us
	 * repeated function calls later, significantly speeding up the
	 * algorithm. We only need the lowercase first char in the nocase
	 * case.
	 */

	mapStrings = TclStackAlloc(interp, mapElemc*2*sizeof(Tcl_UniChar *));
	mapLens = TclStackAlloc(interp, mapElemc * 2 * sizeof(int));
	if (nocase) {
	    u2lc = TclStackAlloc(interp, mapElemc * sizeof(int));
	}
	for (index = 0; index < mapElemc; index++) {
	    mapStrings[index] = Tcl_GetUnicodeFromObj(mapElemv[index],
		    mapLens+index);
	    if (nocase && ((index % 2) == 0)) {
		u2lc[index/2] = Tcl_UniCharToLower(*mapStrings[index]);
	    }

Changes to generic/tclUtf.c.

1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
	if (!bytes) {
	    /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
	    bytes = TclUtfToUniChar(src, &ch);
	    /* Combine surrogates */
	    lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
	}
#endif
	/* Special exception for Gregorian characters, which don't have titlecase */
	if ((lowChar < 0x1C90) || (lowChar >= 0x1CC0)) {
	    lowChar = Tcl_UniCharToLower(lowChar);
	}

	if (bytes < TclUtfCount(lowChar)) {
	    memcpy(dst, src, (size_t) bytes);
	    dst += bytes;
	} else {






|
|







1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
	if (!bytes) {
	    /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */
	    bytes = TclUtfToUniChar(src, &ch);
	    /* Combine surrogates */
	    lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000;
	}
#endif
	/* Special exception for Georgian Asomtavruli chars, no titlecase. */
	if ((unsigned)(lowChar - 0x1C90) >= 0x30) {
	    lowChar = Tcl_UniCharToLower(lowChar);
	}

	if (bytes < TclUtfCount(lowChar)) {
	    memcpy(dst, src, (size_t) bytes);
	    dst += bytes;
	} else {

Changes to tests/utf.test.

37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
...
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
} 1
test utf-1.5 {Tcl_UniCharToUtf: overflowed Tcl_UniChar} testbytestring {
    expr {[format %c 0x110000] eq [testbytestring "\xef\xbf\xbd"]}
} 1
test utf-1.6 {Tcl_UniCharToUtf: negative Tcl_UniChar} testbytestring {
    expr {[format %c -1] eq [testbytestring "\xef\xbf\xbd"]}
} 1
test utf-1.7 {Tcl_UniCharToUtf: 4 byte sequences} -constraints {tip389 testbytestring} -body {
    expr {"\U014e4e" eq [testbytestring "\xf0\x94\xb9\x8e"]}
} -result 1

test utf-2.1 {Tcl_UtfToUniChar: low ascii} {
    string length "abc"
} {3}
test utf-2.2 {Tcl_UtfToUniChar: naked trail bytes} testbytestring {
................................................................................
bsCheck \Ua	10
bsCheck \UA	10
bsCheck \Ua1	161
bsCheck \U4e21	20001
bsCheck \U004e21	20001
bsCheck \U00004e21	20001
bsCheck \U0000004e21	78
if {[testConstraint tip389]} {
    bsCheck \U00110000	69632
    bsCheck \U01100000	69632
    bsCheck \U11000000	69632
    bsCheck \U0010FFFF	1114111
    bsCheck \U010FFFF0	1114111
    bsCheck \U10FFFF00	1114111
    bsCheck \UFFFFFFFF	1048575
}

test utf-11.1 {Tcl_UtfToUpper} {
    string toupper {}
} {}
test utf-11.2 {Tcl_UtfToUpper} {
    string toupper abc
} ABC






|







 







<
|
|
|
|
|
|
|
<







37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
...
224
225
226
227
228
229
230

231
232
233
234
235
236
237

238
239
240
241
242
243
244
} 1
test utf-1.5 {Tcl_UniCharToUtf: overflowed Tcl_UniChar} testbytestring {
    expr {[format %c 0x110000] eq [testbytestring "\xef\xbf\xbd"]}
} 1
test utf-1.6 {Tcl_UniCharToUtf: negative Tcl_UniChar} testbytestring {
    expr {[format %c -1] eq [testbytestring "\xef\xbf\xbd"]}
} 1
test utf-1.7 {Tcl_UniCharToUtf: 4 byte sequences} -constraints testbytestring -body {
    expr {"\U014e4e" eq [testbytestring "\xf0\x94\xb9\x8e"]}
} -result 1

test utf-2.1 {Tcl_UtfToUniChar: low ascii} {
    string length "abc"
} {3}
test utf-2.2 {Tcl_UtfToUniChar: naked trail bytes} testbytestring {
................................................................................
bsCheck \Ua	10
bsCheck \UA	10
bsCheck \Ua1	161
bsCheck \U4e21	20001
bsCheck \U004e21	20001
bsCheck \U00004e21	20001
bsCheck \U0000004e21	78

bsCheck \U00110000	69632
bsCheck \U01100000	69632
bsCheck \U11000000	69632
bsCheck \U0010FFFF	1114111
bsCheck \U010FFFF0	1114111
bsCheck \U10FFFF00	1114111
bsCheck \UFFFFFFFF	1048575


test utf-11.1 {Tcl_UtfToUpper} {
    string toupper {}
} {}
test utf-11.2 {Tcl_UtfToUpper} {
    string toupper abc
} ABC