Tcl Source Code

Check-in [e2a6110884]
Login
Bounty program for improvements to Tcl and certain Tcl packages.
Tcl 2019 Conference, Houston/TX, US, Nov 4-8
Send your abstracts to [email protected]
or submit via the online form by Sep 9.

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Fix [8e1e31eac0]: lsort treats NUL chars strangely
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | tip-389
Files: files | file ages | folders
SHA3-256: e2a61108848b0f997ca8a6cf8305b50408fae5d5c42576337d8cd97658f53ca8
User & Date: jan.nijtmans 2017-11-29 09:49:31
Context
2017-11-29
11:49
Merge core-8-branch. Also, use a different value for TCL_STUB_MAGIC when TCL_UTF_MAX>4. check-in: 1916b6a72e user: jan.nijtmans tags: tip-389
09:49
Fix [8e1e31eac0]: lsort treats NUL chars strangely check-in: e2a6110884 user: jan.nijtmans tags: tip-389
08:59
Treat invalid UTF-8 characters in the range 0x80-0x9F as cp1252: See [https://en.wikipedia.org/wiki/... check-in: b2521a4844 user: jan.nijtmans tags: tip-389
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to generic/tclCmdIL.c.

2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
....
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
    int i, match, index, result, listc, length, elemLen, bisect;
    int dataType, isIncreasing, lower, upper, offset;
    Tcl_WideInt patWide, objWide;
    int allMatches, inlineReturn, negatedMatch, returnSubindices, noCase;
    double patDouble, objDouble;
    SortInfo sortInfo;
    Tcl_Obj *patObj, **listv, *listPtr, *startPtr, *itemPtr;
    SortStrCmpFn_t strCmpFn = strcmp;
    Tcl_RegExp regexp = NULL;
    static const char *const options[] = {
	"-all",	    "-ascii",   "-bisect", "-decreasing", "-dictionary",
	"-exact",   "-glob",    "-increasing", "-index",
	"-inline",  "-integer", "-nocase",     "-not",
	"-real",    "-regexp",  "-sorted",     "-start",
	"-subindices", NULL
................................................................................
				/* Values to be compared. */
    SortInfo *infoPtr)		/* Information passed from the top-level
				 * "lsort" command. */
{
    int order = 0;

    if (infoPtr->sortMode == SORTMODE_ASCII) {
	order = strcmp(elemPtr1->collationKey.strValuePtr,
		elemPtr2->collationKey.strValuePtr);
    } else if (infoPtr->sortMode == SORTMODE_ASCII_NC) {
	order = TclUtfCasecmp(elemPtr1->collationKey.strValuePtr,
		elemPtr2->collationKey.strValuePtr);
    } else if (infoPtr->sortMode == SORTMODE_DICTIONARY) {
	order = DictionaryCompare(elemPtr1->collationKey.strValuePtr,
		elemPtr2->collationKey.strValuePtr);






|







 







|







2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
....
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
    int i, match, index, result, listc, length, elemLen, bisect;
    int dataType, isIncreasing, lower, upper, offset;
    Tcl_WideInt patWide, objWide;
    int allMatches, inlineReturn, negatedMatch, returnSubindices, noCase;
    double patDouble, objDouble;
    SortInfo sortInfo;
    Tcl_Obj *patObj, **listv, *listPtr, *startPtr, *itemPtr;
    SortStrCmpFn_t strCmpFn = TclUtfCmp;
    Tcl_RegExp regexp = NULL;
    static const char *const options[] = {
	"-all",	    "-ascii",   "-bisect", "-decreasing", "-dictionary",
	"-exact",   "-glob",    "-increasing", "-index",
	"-inline",  "-integer", "-nocase",     "-not",
	"-real",    "-regexp",  "-sorted",     "-start",
	"-subindices", NULL
................................................................................
				/* Values to be compared. */
    SortInfo *infoPtr)		/* Information passed from the top-level
				 * "lsort" command. */
{
    int order = 0;

    if (infoPtr->sortMode == SORTMODE_ASCII) {
	order = TclUtfCmp(elemPtr1->collationKey.strValuePtr,
		elemPtr2->collationKey.strValuePtr);
    } else if (infoPtr->sortMode == SORTMODE_ASCII_NC) {
	order = TclUtfCasecmp(elemPtr1->collationKey.strValuePtr,
		elemPtr2->collationKey.strValuePtr);
    } else if (infoPtr->sortMode == SORTMODE_DICTIONARY) {
	order = DictionaryCompare(elemPtr1->collationKey.strValuePtr,
		elemPtr2->collationKey.strValuePtr);

Changes to generic/tclCmdMZ.c.

3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
	"--", NULL
    };
    enum options {
	OPT_EXACT, OPT_GLOB, OPT_INDEXV, OPT_MATCHV, OPT_NOCASE, OPT_REGEXP,
	OPT_LAST
    };
    typedef int (*strCmpFn_t)(const char *, const char *);
    strCmpFn_t strCmpFn = strcmp;

    mode = OPT_EXACT;
    foundmode = 0;
    indexVarObj = NULL;
    matchVarObj = NULL;
    numMatchesSaved = 0;
    noCase = 0;






|







3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
	"--", NULL
    };
    enum options {
	OPT_EXACT, OPT_GLOB, OPT_INDEXV, OPT_MATCHV, OPT_NOCASE, OPT_REGEXP,
	OPT_LAST
    };
    typedef int (*strCmpFn_t)(const char *, const char *);
    strCmpFn_t strCmpFn = TclUtfCmp;

    mode = OPT_EXACT;
    foundmode = 0;
    indexVarObj = NULL;
    matchVarObj = NULL;
    numMatchesSaved = 0;
    noCase = 0;

Changes to generic/tclInt.h.

3215
3216
3217
3218
3219
3220
3221

3222
3223
3224
3225
3226
3227
3228
MODULE_SCOPE int	TclSubstTokens(Tcl_Interp *interp, Tcl_Token *tokenPtr,
			    int count, int *tokensLeftPtr, int line,
			    int *clNextOuter, const char *outerScript);
MODULE_SCOPE int	TclTrimLeft(const char *bytes, int numBytes,
			    const char *trim, int numTrim);
MODULE_SCOPE int	TclTrimRight(const char *bytes, int numBytes,
			    const char *trim, int numTrim);

MODULE_SCOPE int	TclUtfCasecmp(const char *cs, const char *ct);
MODULE_SCOPE int	TclUtfCount(int ch);
MODULE_SCOPE Tcl_Obj *	TclpNativeToNormalized(ClientData clientData);
MODULE_SCOPE Tcl_Obj *	TclpFilesystemPathType(Tcl_Obj *pathPtr);
MODULE_SCOPE int	TclpDlopen(Tcl_Interp *interp, Tcl_Obj *pathPtr,
			    Tcl_LoadHandle *loadHandle,
			    Tcl_FSUnloadFileProc **unloadProcPtr, int flags);






>







3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
MODULE_SCOPE int	TclSubstTokens(Tcl_Interp *interp, Tcl_Token *tokenPtr,
			    int count, int *tokensLeftPtr, int line,
			    int *clNextOuter, const char *outerScript);
MODULE_SCOPE int	TclTrimLeft(const char *bytes, int numBytes,
			    const char *trim, int numTrim);
MODULE_SCOPE int	TclTrimRight(const char *bytes, int numBytes,
			    const char *trim, int numTrim);
MODULE_SCOPE int	TclUtfCmp(const char *cs, const char *ct);
MODULE_SCOPE int	TclUtfCasecmp(const char *cs, const char *ct);
MODULE_SCOPE int	TclUtfCount(int ch);
MODULE_SCOPE Tcl_Obj *	TclpNativeToNormalized(ClientData clientData);
MODULE_SCOPE Tcl_Obj *	TclpFilesystemPathType(Tcl_Obj *pathPtr);
MODULE_SCOPE int	TclpDlopen(Tcl_Interp *interp, Tcl_Obj *pathPtr,
			    Tcl_LoadHandle *loadHandle,
			    Tcl_FSUnloadFileProc **unloadProcPtr, int flags);

Changes to generic/tclUtf.c.

1104
1105
1106
1107
1108
1109
1110









1111
1112
1113
1114
1115
1116
1117
....
1136
1137
1138
1139
1140
1141
1142

1143
1144
1145
1146
1147
1148
1149
1150









1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165













































1166
1167
1168
1169
1170
1171
1172
....
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187

1188
1189









1190
1191
1192
1193
1194
1195
1196
	 * n must be interpreted as chars, not bytes. This should be called
	 * only when both strings are of at least n chars long (no need for \0
	 * check)
	 */

	cs += TclUtfToUniChar(cs, &ch1);
	ct += TclUtfToUniChar(ct, &ch2);









	if (ch1 != ch2) {
	    return (ch1 - ch2);
	}
    }
    return 0;
}
 
................................................................................
int
Tcl_UtfNcasecmp(
    const char *cs,		/* UTF string to compare to ct. */
    const char *ct,		/* UTF string cs is compared to. */
    unsigned long numChars)	/* Number of UTF chars to compare. */
{
    Tcl_UniChar ch1 = 0, ch2 = 0;

    while (numChars-- > 0) {
	/*
	 * n must be interpreted as chars, not bytes.
	 * This should be called only when both strings are of
	 * at least n chars long (no need for \0 check)
	 */
	cs += TclUtfToUniChar(cs, &ch1);
	ct += TclUtfToUniChar(ct, &ch2);









	if (ch1 != ch2) {
	    ch1 = Tcl_UniCharToLower(ch1);
	    ch2 = Tcl_UniCharToLower(ch2);
	    if (ch1 != ch2) {
		return (ch1 - ch2);
	    }
	}
    }
    return 0;
}
 
/*
 *----------------------------------------------------------------------
 *
 * Tcl_UtfNcasecmp --













































 *
 *	Compare UTF chars of string cs to string ct case insensitively.
 *	Replacement for strcasecmp in Tcl core, in places where UTF-8 should
 *	be handled.
 *
 * Results:
 *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
................................................................................
 */

int
TclUtfCasecmp(
    const char *cs,		/* UTF string to compare to ct. */
    const char *ct)		/* UTF string cs is compared to. */
{
    while (*cs && *ct) {
	Tcl_UniChar ch1 = 0, ch2 = 0;


	cs += TclUtfToUniChar(cs, &ch1);
	ct += TclUtfToUniChar(ct, &ch2);









	if (ch1 != ch2) {
	    ch1 = Tcl_UniCharToLower(ch1);
	    ch2 = Tcl_UniCharToLower(ch2);
	    if (ch1 != ch2) {
		return ch1 - ch2;
	    }
	}






>
>
>
>
>
>
>
>
>







 







>








>
>
>
>
>
>
>
>
>










|



|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







<
|

>


>
>
>
>
>
>
>
>
>







1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
....
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
....
1242
1243
1244
1245
1246
1247
1248

1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
	 * n must be interpreted as chars, not bytes. This should be called
	 * only when both strings are of at least n chars long (no need for \0
	 * check)
	 */

	cs += TclUtfToUniChar(cs, &ch1);
	ct += TclUtfToUniChar(ct, &ch2);
#if TCL_UTF_MAX == 4
    /* map high surrogate characters to values > 0xffff */
    if ((ch1 & 0xFC00) == 0xD800) {
	ch1 += 0x4000;
    }
    if ((ch2 & 0xFC00) == 0xD800) {
	ch2 += 0x4000;
    }
#endif
	if (ch1 != ch2) {
	    return (ch1 - ch2);
	}
    }
    return 0;
}
 
................................................................................
int
Tcl_UtfNcasecmp(
    const char *cs,		/* UTF string to compare to ct. */
    const char *ct,		/* UTF string cs is compared to. */
    unsigned long numChars)	/* Number of UTF chars to compare. */
{
    Tcl_UniChar ch1 = 0, ch2 = 0;

    while (numChars-- > 0) {
	/*
	 * n must be interpreted as chars, not bytes.
	 * This should be called only when both strings are of
	 * at least n chars long (no need for \0 check)
	 */
	cs += TclUtfToUniChar(cs, &ch1);
	ct += TclUtfToUniChar(ct, &ch2);
#if TCL_UTF_MAX == 4
    /* map high surrogate characters to values > 0xffff */
    if ((ch1 & 0xFC00) == 0xD800) {
	ch1 += 0x4000;
    }
    if ((ch2 & 0xFC00) == 0xD800) {
	ch2 += 0x4000;
    }
#endif
	if (ch1 != ch2) {
	    ch1 = Tcl_UniCharToLower(ch1);
	    ch2 = Tcl_UniCharToLower(ch2);
	    if (ch1 != ch2) {
		return (ch1 - ch2);
	    }
	}
    }
    return 0;
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UtfCmp --
 *
 *	Compare UTF chars of string cs to string ct case sensitively.
 *	Replacement for strcmp in Tcl core, in places where UTF-8 should
 *	be handled.
 *
 * Results:
 *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

int
TclUtfCmp(
    const char *cs,		/* UTF string to compare to ct. */
    const char *ct)		/* UTF string cs is compared to. */
{
    Tcl_UniChar ch1 = 0, ch2 = 0;

    while (*cs && *ct) {
	cs += TclUtfToUniChar(cs, &ch1);
	ct += TclUtfToUniChar(ct, &ch2);
#if TCL_UTF_MAX == 4
    /* map high surrogate characters to values > 0xffff */
    if ((ch1 & 0xFC00) == 0xD800) {
	ch1 += 0x4000;
    }
    if ((ch2 & 0xFC00) == 0xD800) {
	ch2 += 0x4000;
    }
#endif
	if (ch1 != ch2) {
	    return ch1 - ch2;
	}
    }
    return UCHAR(*cs) - UCHAR(*ct);
}

 
/*
 *----------------------------------------------------------------------
 *
 * TclUtfCasecmp --
 *
 *	Compare UTF chars of string cs to string ct case insensitively.
 *	Replacement for strcasecmp in Tcl core, in places where UTF-8 should
 *	be handled.
 *
 * Results:
 *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
................................................................................
 */

int
TclUtfCasecmp(
    const char *cs,		/* UTF string to compare to ct. */
    const char *ct)		/* UTF string cs is compared to. */
{

    Tcl_UniChar ch1 = 0, ch2 = 0;

    while (*cs && *ct) {
	cs += TclUtfToUniChar(cs, &ch1);
	ct += TclUtfToUniChar(ct, &ch2);
#if TCL_UTF_MAX == 4
    /* map high surrogate characters to values > 0xffff */
    if ((ch1 & 0xFC00) == 0xD800) {
	ch1 += 0x4000;
    }
    if ((ch2 & 0xFC00) == 0xD800) {
	ch2 += 0x4000;
    }
#endif
	if (ch1 != ch2) {
	    ch1 = Tcl_UniCharToLower(ch1);
	    ch2 = Tcl_UniCharToLower(ch2);
	    if (ch1 != ch2) {
		return ch1 - ch2;
	    }
	}