Tcl Source Code

Check-in [47ace058d4]
Login
Bounty program for improvements to Tcl and certain Tcl packages.
Tcl 2019 Conference, Houston/TX, US, Nov 4-8
Send your abstracts to [email protected]
or submit via the online form by Sep 9.

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Start implementing TIP #497. regexp's now are >BMP-aware. WIP
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | tip-497
Files: files | file ages | folders
SHA3-256: 47ace058d45a4dfaa58b28f1fc58445d64879b39583e51af39b25fbd82dd6226
User & Date: jan.nijtmans 2018-05-01 19:02:32
Context
2018-10-09
18:52
Merge trunk check-in: 2ec0fe1752 user: jan.nijtmans tags: tip-497
2018-05-01
19:02
Start implementing TIP #497. regexp's now are >BMP-aware. WIP check-in: 47ace058d4 user: jan.nijtmans tags: tip-497
18:43
merge 8.7 check-in: 6fe50c20de user: jan.nijtmans tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to generic/regc_locale.c.

824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
...
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
    NOTE(REG_ULOCALE);

    /*
     * Search table.
     */

    Tcl_DStringInit(&ds);
    np = Tcl_UniCharToUtfDString(startp, (int)len, &ds);
    for (cn=cnames; cn->name!=NULL; cn++) {
	if (strlen(cn->name)==len && strncmp(cn->name, np, len)==0) {
	    break;			/* NOTE BREAK OUT */
	}
    }
    Tcl_DStringFree(&ds);
    if (cn->name != NULL) {
................................................................................

    /*
     * Extract the class name
     */

    len = endp - startp;
    Tcl_DStringInit(&ds);
    np = Tcl_UniCharToUtfDString(startp, (int)len, &ds);

    /*
     * Map the name to the corresponding enumerated value.
     */

    index = -1;
    for (namePtr=classNames,i=0 ; *namePtr!=NULL ; namePtr++,i++) {






|







 







|







824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
...
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
    NOTE(REG_ULOCALE);

    /*
     * Search table.
     */

    Tcl_DStringInit(&ds);
    np = TclUnicodeToUtfDString(startp, (int)len, &ds);
    for (cn=cnames; cn->name!=NULL; cn++) {
	if (strlen(cn->name)==len && strncmp(cn->name, np, len)==0) {
	    break;			/* NOTE BREAK OUT */
	}
    }
    Tcl_DStringFree(&ds);
    if (cn->name != NULL) {
................................................................................

    /*
     * Extract the class name
     */

    len = endp - startp;
    Tcl_DStringInit(&ds);
    np = TclUnicodeToUtfDString(startp, (int)len, &ds);

    /*
     * Map the name to the corresponding enumerated value.
     */

    index = -1;
    for (namePtr=classNames,i=0 ; *namePtr!=NULL ; namePtr++,i++) {

Changes to generic/regcustom.h.

62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
..
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#ifdef __REG_NOFRONT
#undef __REG_NOFRONT
#endif
#ifdef __REG_NOCHAR
#undef __REG_NOCHAR
#endif
/* Interface types */
#define	__REG_WIDE_T	Tcl_UniChar
#define	__REG_REGOFF_T	long	/* Not really right, but good enough... */
/* Names and declarations */
#define	__REG_WIDE_COMPILE	TclReComp
#define	__REG_WIDE_EXEC		TclReExec
#define	__REG_NOFRONT		/* Don't want regcomp() and regexec() */
#define	__REG_NOCHAR		/* Or the char versions */
#define	regfree		TclReFree
................................................................................
#define	regerror	TclReError
/* --- end --- */

/*
 * Internal character type and related.
 */

typedef Tcl_UniChar chr;	/* The type itself. */
typedef int pchr;		/* What it promotes to. */
typedef unsigned uchr;		/* Unsigned type that will hold a chr. */
typedef int celt;		/* Type to hold chr, or NOCELT */
#define	NOCELT (-1)		/* Celt value which is not valid chr */
#define	CHR(c) (UCHAR(c))	/* Turn char literal into chr literal */
#define	DIGITVAL(c) ((c)-'0')	/* Turn chr digit into its value */
#if TCL_UTF_MAX > 4
#define	CHRBITS	32		/* Bits in a chr; must not use sizeof */
#define	CHR_MIN	0x00000000	/* Smallest and largest chr; the value */
#define	CHR_MAX	0x10ffff	/* CHR_MAX-CHR_MIN+1 should fit in uchr */
#else
#define	CHRBITS	16		/* Bits in a chr; must not use sizeof */
#define	CHR_MIN	0x0000		/* Smallest and largest chr; the value */
#define	CHR_MAX	0xffff		/* CHR_MAX-CHR_MIN+1 should fit in uchr */
#endif

/*
 * Functions operating on chr.
 */

#define	iscalnum(x)	Tcl_UniCharIsAlnum(x)
#define	iscalpha(x)	Tcl_UniCharIsAlpha(x)






|







 







|






<


|
<
<
<
<
<







62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
..
77
78
79
80
81
82
83
84
85
86
87
88
89
90

91
92
93





94
95
96
97
98
99
100
#ifdef __REG_NOFRONT
#undef __REG_NOFRONT
#endif
#ifdef __REG_NOCHAR
#undef __REG_NOCHAR
#endif
/* Interface types */
#define	__REG_WIDE_T	unsigned
#define	__REG_REGOFF_T	long	/* Not really right, but good enough... */
/* Names and declarations */
#define	__REG_WIDE_COMPILE	TclReComp
#define	__REG_WIDE_EXEC		TclReExec
#define	__REG_NOFRONT		/* Don't want regcomp() and regexec() */
#define	__REG_NOCHAR		/* Or the char versions */
#define	regfree		TclReFree
................................................................................
#define	regerror	TclReError
/* --- end --- */

/*
 * Internal character type and related.
 */

typedef unsigned chr;	/* The type itself. */
typedef int pchr;		/* What it promotes to. */
typedef unsigned uchr;		/* Unsigned type that will hold a chr. */
typedef int celt;		/* Type to hold chr, or NOCELT */
#define	NOCELT (-1)		/* Celt value which is not valid chr */
#define	CHR(c) (UCHAR(c))	/* Turn char literal into chr literal */
#define	DIGITVAL(c) ((c)-'0')	/* Turn chr digit into its value */

#define	CHRBITS	32		/* Bits in a chr; must not use sizeof */
#define	CHR_MIN	0x00000000	/* Smallest and largest chr; the value */
#define	CHR_MAX	0x0010ffff	/* CHR_MAX-CHR_MIN+1 should fit in uchr */






/*
 * Functions operating on chr.
 */

#define	iscalnum(x)	Tcl_UniCharIsAlnum(x)
#define	iscalpha(x)	Tcl_UniCharIsAlpha(x)

Changes to generic/regex.h.

95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#ifdef __REG_NOFRONT
#undef __REG_NOFRONT
#endif
#ifdef __REG_NOCHAR
#undef __REG_NOCHAR
#endif
/* interface types */
#define	__REG_WIDE_T	Tcl_UniChar
#define	__REG_REGOFF_T	long	/* not really right, but good enough... */
/* names and declarations */
#define	__REG_WIDE_COMPILE	TclReComp
#define	__REG_WIDE_EXEC		TclReExec
#define	__REG_NOFRONT		/* don't want regcomp() and regexec() */
#define	__REG_NOCHAR		/* or the char versions */
#define	regfree		TclReFree






|







95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#ifdef __REG_NOFRONT
#undef __REG_NOFRONT
#endif
#ifdef __REG_NOCHAR
#undef __REG_NOCHAR
#endif
/* interface types */
#define	__REG_WIDE_T	unsigned
#define	__REG_REGOFF_T	long	/* not really right, but good enough... */
/* names and declarations */
#define	__REG_WIDE_COMPILE	TclReComp
#define	__REG_WIDE_EXEC		TclReExec
#define	__REG_NOFRONT		/* don't want regcomp() and regexec() */
#define	__REG_NOCHAR		/* or the char versions */
#define	regfree		TclReFree

Changes to generic/tclInt.h.

3981
3982
3983
3984
3985
3986
3987









3988
3989
3990
3991
3992
3993
3994
MODULE_SCOPE int	TclPtrUnsetVarIdx(Tcl_Interp *interp, Var *varPtr,
			    Var *arrayPtr, Tcl_Obj *part1Ptr,
			    Tcl_Obj *part2Ptr, const int flags,
			    int index);
MODULE_SCOPE void	TclInvalidateNsPath(Namespace *nsPtr);
MODULE_SCOPE void	TclFindArrayPtrElements(Var *arrayPtr,
			    Tcl_HashTable *tablePtr);










/*
 * The new extended interface to the variable traces.
 */

MODULE_SCOPE int	TclObjCallVarTraces(Interp *iPtr, Var *arrayPtr,
			    Var *varPtr, Tcl_Obj *part1Ptr, Tcl_Obj *part2Ptr,






>
>
>
>
>
>
>
>
>







3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
MODULE_SCOPE int	TclPtrUnsetVarIdx(Tcl_Interp *interp, Var *varPtr,
			    Var *arrayPtr, Tcl_Obj *part1Ptr,
			    Tcl_Obj *part2Ptr, const int flags,
			    int index);
MODULE_SCOPE void	TclInvalidateNsPath(Namespace *nsPtr);
MODULE_SCOPE void	TclFindArrayPtrElements(Var *arrayPtr,
			    Tcl_HashTable *tablePtr);
#if TCL_UTF_MAX <= 4
MODULE_SCOPE char *	TclUnicodeToUtfDString(const unsigned *uniStr,
				int uniLength, Tcl_DString *dsPtr);
MODULE_SCOPE unsigned *	TclUtfToUnicodeDString(const char *src, int length,
				Tcl_DString *dsPtr);
#else
#   define TclUnicodeToUtfDString Tcl_UniCharToUtfDString
#   define TclUtfToUnicodeDString Tcl_UtfToUniCharDString
#endif

/*
 * The new extended interface to the variable traces.
 */

MODULE_SCOPE int	TclObjCallVarTraces(Interp *iPtr, Var *arrayPtr,
			    Var *varPtr, Tcl_Obj *part1Ptr, Tcl_Obj *part2Ptr,

Changes to generic/tclRegexp.c.

86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
...
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
...
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
...
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
...
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
...
428
429
430
431
432
433
434

435
436
437
438
439
440
441
442
443
...
460
461
462
463
464
465
466
467


468
469
470
471
472
473
474
475


476
477
478
479
480
481
482
...
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
...
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
static TclRegexp *	CompileRegexp(Tcl_Interp *interp, const char *pattern,
			    int length, int flags);
static void		DupRegexpInternalRep(Tcl_Obj *srcPtr,
			    Tcl_Obj *copyPtr);
static void		FinalizeRegexp(ClientData clientData);
static void		FreeRegexp(TclRegexp *regexpPtr);
static void		FreeRegexpInternalRep(Tcl_Obj *objPtr);
static int		RegExpExecUniChar(Tcl_Interp *interp, Tcl_RegExp re,
			    const Tcl_UniChar *uniString, int numChars,
			    int nmatches, int flags);
static int		SetRegexpFromAny(Tcl_Interp *interp, Tcl_Obj *objPtr);

/*
 * The regular expression Tcl object type. This serves as a cache of the
 * compiled form of the regular expression.
 */
................................................................................
    const char *start)		/* If text is part of a larger string, this
				 * identifies beginning of larger string, so
				 * that "^" won't match. */
{
    int flags, result, numChars;
    TclRegexp *regexp = (TclRegexp *) re;
    Tcl_DString ds;
    const Tcl_UniChar *ustr;

    /*
     * If the starting point is offset from the beginning of the buffer, then
     * we need to tell the regexp engine not to match "^".
     */

    if (text > start) {
................................................................................
    regexp->objPtr = NULL;

    /*
     * Convert the string to Unicode and perform the match.
     */

    Tcl_DStringInit(&ds);
    ustr = Tcl_UtfToUniCharDString(text, -1, &ds);
    numChars = Tcl_DStringLength(&ds) / sizeof(Tcl_UniChar);
    result = RegExpExecUniChar(interp, re, ustr, numChars, -1 /* nmatches */,
	    flags);
    Tcl_DStringFree(&ds);

    return result;
}
 
/*
................................................................................
	*endPtr = Tcl_UtfAtIndex(string, regexpPtr->matches[index].rm_eo);
    }
}
 
/*
 *---------------------------------------------------------------------------
 *
 * RegExpExecUniChar --
 *
 *	Execute the regular expression matcher using a compiled form of a
 *	regular expression and save information about any match that is found.
 *
 * Results:
 *	If an error occurs during the matching operation then -1 is returned
 *	and an error message is left in interp's result. Otherwise the return
................................................................................
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

static int
RegExpExecUniChar(
    Tcl_Interp *interp,		/* Interpreter to use for error reporting. */
    Tcl_RegExp re,		/* Compiled regular expression; returned by a
				 * previous call to Tcl_GetRegExpFromObj */
    const Tcl_UniChar *wString,	/* String against which to match re. */
    int numChars,		/* Length of Tcl_UniChar string (must be
				 * >=0). */
    int nmatches,		/* How many subexpression matches (counting
				 * the whole match as subexpression 0) are of
				 * interest. -1 means "don't know". */
    int flags)			/* Regular expression flags. */
{
    int status;
................................................................................
				 * should begin. */
    int nmatches,		/* How many subexpression matches (counting
				 * the whole match as subexpression 0) are of
				 * interest. -1 means all of them. */
    int flags)			/* Regular expression execution flags. */
{
    TclRegexp *regexpPtr = (TclRegexp *) re;

    Tcl_UniChar *udata;
    int length;
    int reflags = regexpPtr->flags;
#define TCL_REG_GLOBOK_FLAGS \
	(TCL_REG_ADVANCED | TCL_REG_NOSUB | TCL_REG_NOCASE)

    /*
     * Take advantage of the equivalent glob pattern, if one exists.
     * This is possible based only on the right mix of incoming flags (0)
................................................................................
    /*
     * Save the target object so we can extract strings from it later.
     */

    regexpPtr->string = NULL;
    regexpPtr->objPtr = textObj;

    udata = Tcl_GetUnicodeFromObj(textObj, &length);



    if (offset > length) {
	offset = length;
    }
    udata += offset;
    length -= offset;

    return RegExpExecUniChar(interp, re, udata, length, nmatches, flags);


}
 
/*
 *----------------------------------------------------------------------
 *
 * Tcl_RegExpMatchObj --
 *
................................................................................
CompileRegexp(
    Tcl_Interp *interp,		/* Used for error reporting if not NULL. */
    const char *string,		/* The regexp to compile (UTF-8). */
    int length,			/* The length of the string in bytes. */
    int flags)			/* Compilation flags. */
{
    TclRegexp *regexpPtr;
    const Tcl_UniChar *uniString;
    int numChars, status, i, exact;
    Tcl_DString stringBuf;
    ThreadSpecificData *tsdPtr = TCL_TSD_INIT(&dataKey);

    if (!tsdPtr->initialized) {
	tsdPtr->initialized = 1;
	Tcl_CreateThreadExitHandler(FinalizeRegexp, NULL);
................................................................................
    regexpPtr->details.rm_extend.rm_eo = -1;

    /*
     * Get the up-to-date string representation and map to unicode.
     */

    Tcl_DStringInit(&stringBuf);
    uniString = Tcl_UtfToUniCharDString(string, length, &stringBuf);
    numChars = Tcl_DStringLength(&stringBuf) / sizeof(Tcl_UniChar);

    /*
     * Compile the string and check for errors.
     */

    regexpPtr->flags = flags;
    status = TclReComp(&regexpPtr->re, uniString, (size_t) numChars, flags);






|
|







 







|







 







|
|
|







 







|







 







|



|
|







 







>
|
|







 







|
>
>







|
>
>







 







|







 







|
|







86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
...
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
...
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
...
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
...
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
...
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
...
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
...
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
...
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
static TclRegexp *	CompileRegexp(Tcl_Interp *interp, const char *pattern,
			    int length, int flags);
static void		DupRegexpInternalRep(Tcl_Obj *srcPtr,
			    Tcl_Obj *copyPtr);
static void		FinalizeRegexp(ClientData clientData);
static void		FreeRegexp(TclRegexp *regexpPtr);
static void		FreeRegexpInternalRep(Tcl_Obj *objPtr);
static int		RegExpExecUnicode(Tcl_Interp *interp, Tcl_RegExp re,
			    const __REG_WIDE_T *uniString, int numChars,
			    int nmatches, int flags);
static int		SetRegexpFromAny(Tcl_Interp *interp, Tcl_Obj *objPtr);

/*
 * The regular expression Tcl object type. This serves as a cache of the
 * compiled form of the regular expression.
 */
................................................................................
    const char *start)		/* If text is part of a larger string, this
				 * identifies beginning of larger string, so
				 * that "^" won't match. */
{
    int flags, result, numChars;
    TclRegexp *regexp = (TclRegexp *) re;
    Tcl_DString ds;
    const __REG_WIDE_T *ustr;

    /*
     * If the starting point is offset from the beginning of the buffer, then
     * we need to tell the regexp engine not to match "^".
     */

    if (text > start) {
................................................................................
    regexp->objPtr = NULL;

    /*
     * Convert the string to Unicode and perform the match.
     */

    Tcl_DStringInit(&ds);
    ustr = TclUtfToUnicodeDString(text, -1, &ds);
    numChars = Tcl_DStringLength(&ds) / sizeof(__REG_WIDE_T);
    result = RegExpExecUnicode(interp, re, ustr, numChars, -1 /* nmatches */,
	    flags);
    Tcl_DStringFree(&ds);

    return result;
}
 
/*
................................................................................
	*endPtr = Tcl_UtfAtIndex(string, regexpPtr->matches[index].rm_eo);
    }
}
 
/*
 *---------------------------------------------------------------------------
 *
 * RegExpExecUnicode --
 *
 *	Execute the regular expression matcher using a compiled form of a
 *	regular expression and save information about any match that is found.
 *
 * Results:
 *	If an error occurs during the matching operation then -1 is returned
 *	and an error message is left in interp's result. Otherwise the return
................................................................................
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

static int
RegExpExecUnicode(
    Tcl_Interp *interp,		/* Interpreter to use for error reporting. */
    Tcl_RegExp re,		/* Compiled regular expression; returned by a
				 * previous call to Tcl_GetRegExpFromObj */
    const __REG_WIDE_T *wString,	/* String against which to match re. */
    int numChars,		/* Length of Unicode string (must be
				 * >=0). */
    int nmatches,		/* How many subexpression matches (counting
				 * the whole match as subexpression 0) are of
				 * interest. -1 means "don't know". */
    int flags)			/* Regular expression flags. */
{
    int status;
................................................................................
				 * should begin. */
    int nmatches,		/* How many subexpression matches (counting
				 * the whole match as subexpression 0) are of
				 * interest. -1 means all of them. */
    int flags)			/* Regular expression execution flags. */
{
    TclRegexp *regexpPtr = (TclRegexp *) re;
    Tcl_DString ds;
    __REG_WIDE_T *udata;
    int length, result;
    int reflags = regexpPtr->flags;
#define TCL_REG_GLOBOK_FLAGS \
	(TCL_REG_ADVANCED | TCL_REG_NOSUB | TCL_REG_NOCASE)

    /*
     * Take advantage of the equivalent glob pattern, if one exists.
     * This is possible based only on the right mix of incoming flags (0)
................................................................................
    /*
     * Save the target object so we can extract strings from it later.
     */

    regexpPtr->string = NULL;
    regexpPtr->objPtr = textObj;

    Tcl_DStringInit(&ds);
    udata = TclUtfToUnicodeDString(Tcl_GetString(textObj), -1, &ds);
    length = Tcl_DStringLength(&ds)/sizeof(__REG_WIDE_T);

    if (offset > length) {
	offset = length;
    }
    udata += offset;
    length -= offset;

    result = RegExpExecUnicode(interp, re, udata, length, nmatches, flags);
    Tcl_DStringFree(&ds);
    return result;
}
 
/*
 *----------------------------------------------------------------------
 *
 * Tcl_RegExpMatchObj --
 *
................................................................................
CompileRegexp(
    Tcl_Interp *interp,		/* Used for error reporting if not NULL. */
    const char *string,		/* The regexp to compile (UTF-8). */
    int length,			/* The length of the string in bytes. */
    int flags)			/* Compilation flags. */
{
    TclRegexp *regexpPtr;
    const __REG_WIDE_T *uniString;
    int numChars, status, i, exact;
    Tcl_DString stringBuf;
    ThreadSpecificData *tsdPtr = TCL_TSD_INIT(&dataKey);

    if (!tsdPtr->initialized) {
	tsdPtr->initialized = 1;
	Tcl_CreateThreadExitHandler(FinalizeRegexp, NULL);
................................................................................
    regexpPtr->details.rm_extend.rm_eo = -1;

    /*
     * Get the up-to-date string representation and map to unicode.
     */

    Tcl_DStringInit(&stringBuf);
    uniString = TclUtfToUnicodeDString(string, length, &stringBuf);
    numChars = Tcl_DStringLength(&stringBuf) / sizeof(__REG_WIDE_T);

    /*
     * Compile the string and check for errors.
     */

    regexpPtr->flags = flags;
    status = TclReComp(&regexpPtr->re, uniString, (size_t) numChars, flags);

Changes to generic/tclUtf.c.

231
232
233
234
235
236
237

























































238
239
240
241
242
243
244
...
432
433
434
435
436
437
438




































































439
440
441
442
443
444
445
    return string;
}
 
/*
 *---------------------------------------------------------------------------
 *

























































 * Tcl_UtfToUniChar --
 *
 *	Extract the Tcl_UniChar represented by the UTF-8 string. Bad UTF-8
 *	sequences are converted to valid Tcl_UniChars and processing
 *	continues. Equivalent to Plan 9 chartorune().
 *
 *	The caller must ensure that the source buffer is long enough that this
................................................................................
    *w = '\0';
    Tcl_DStringSetLength(dsPtr,
	    (oldLength + ((char *) w - (char *) wString)));

    return wString;
}
 




































































/*
 *---------------------------------------------------------------------------
 *
 * Tcl_UtfCharComplete --
 *
 *	Determine if the UTF-8 string of the given length is long enough to be
 *	decoded by Tcl_UtfToUniChar(). This does not ensure that the UTF-8






>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
...
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
    return string;
}
 
/*
 *---------------------------------------------------------------------------
 *
 * TclUnicodeToUtfDString --
 *
 *	Convert the given Unicode string to UTF-8.
 *
 * Results:
 *	The return value is a pointer to the UTF-8 representation of the
 *	Unicode string. Storage for the return value is appended to the end of
 *	dsPtr.
 *
 * Side effects:
 *	None.
 *
 *---------------------------------------------------------------------------
 */

#if TCL_UTF_MAX <= 4
char *
TclUnicodeToUtfDString(
    const unsigned *uniStr,	/* Unicode string to convert to UTF-8. */
    int uniLength,		/* Length of Unicode string in Tcl_UniChars
				 * (must be >= 0). */
    Tcl_DString *dsPtr)		/* UTF-8 representation of string is appended
				 * to this previously initialized DString. */
{
    const unsigned *w, *wEnd;
    char *p, *string;
    int oldLength;

    /*
     * UTF-8 string length in bytes will be <= Unicode string length * 4.
     */

    oldLength = Tcl_DStringLength(dsPtr);
    Tcl_DStringSetLength(dsPtr, (oldLength + uniLength + 1) * 4);
    string = Tcl_DStringValue(dsPtr) + oldLength;

    p = string;
    wEnd = uniStr + uniLength;
    for (w = uniStr; w < wEnd; ) {
	if ((*w & 0xD800) == 0xD800) {
	    *p++ = (*w >> 12) | 0xE0;
	    *p++ = ((*w >> 6) & 0x3F) | 0x80;
	    *p++ = (*w & 0x3F) | 0xE0;
	} else {
	    p += Tcl_UniCharToUtf(*w, p);
	}
	w++;
    }
    Tcl_DStringSetLength(dsPtr, oldLength + (p - string));

    return string;
}
#endif
 
/*
 *---------------------------------------------------------------------------
 *
 * Tcl_UtfToUniChar --
 *
 *	Extract the Tcl_UniChar represented by the UTF-8 string. Bad UTF-8
 *	sequences are converted to valid Tcl_UniChars and processing
 *	continues. Equivalent to Plan 9 chartorune().
 *
 *	The caller must ensure that the source buffer is long enough that this
................................................................................
    *w = '\0';
    Tcl_DStringSetLength(dsPtr,
	    (oldLength + ((char *) w - (char *) wString)));

    return wString;
}
 
/*
 *---------------------------------------------------------------------------
 *
 * TclUtfToUnicodeDString --
 *
 *	Convert the UTF-8 string to Unicode.
 *
 * Results:
 *	The return value is a pointer to the Unicode representation of the
 *	UTF-8 string. Storage for the return value is appended to the end of
 *	dsPtr. The Unicode string is terminated with a Unicode NULL character.
 *
 * Side effects:
 *	None.
 *
 *---------------------------------------------------------------------------
 */
#if TCL_UTF_MAX <= 4
unsigned *
TclUtfToUnicodeDString(
    const char *src,		/* UTF-8 string to convert to Unicode. */
    int length,			/* Length of UTF-8 string in bytes, or -1 for
				 * strlen(). */
    Tcl_DString *dsPtr)		/* Unicode representation of string is
				 * appended to this previously initialized
				 * DString. */
{
    Tcl_UniChar ch = 0;
    unsigned *w, *wString;
    const char *p, *end;
    int oldLength, len;

    if (length < 0) {
	length = strlen(src);
    }

    /*
     * Unicode string length in Tcl_UniChars will be <= UTF-8 string length in
     * bytes.
     */

    oldLength = Tcl_DStringLength(dsPtr);
/* TODO: fix overreach! */
    Tcl_DStringSetLength(dsPtr,
	    (int) ((oldLength + length + 1) * sizeof(unsigned)));
    wString = (unsigned *) (Tcl_DStringValue(dsPtr) + oldLength);

    w = wString;
    end = src + length;
    for (p = src; p < end; ) {
	len = TclUtfToUniChar(p, &ch);
	if (!len) {
	    int high = ch;
	    len = TclUtfToUniChar(p, &ch);
	    *w++ = ((high & 0x7ff) << 10) + (ch & 0x7ff) + 0x10000;
	} else {
	    *w++ = ch;
	}
	p += len;
    }
    *w = '\0';
    Tcl_DStringSetLength(dsPtr,
	    (oldLength + ((char *) w - (char *) wString)));

    return wString;
}
#endif
 
/*
 *---------------------------------------------------------------------------
 *
 * Tcl_UtfCharComplete --
 *
 *	Determine if the UTF-8 string of the given length is long enough to be
 *	decoded by Tcl_UtfToUniChar(). This does not ensure that the UTF-8