Tcl Source Code

Check-in [341e1e54c4]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Merge 8.7
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | trunk | main
Files: files | file ages | folders
SHA3-256: 341e1e54c4dfdf0c21bce7b2a99d146c63d29126bc9e9fdd75134fa246730970
User & Date: jan.nijtmans 2021-03-17 15:05:02.324
Context
2021-03-18
09:13
Merge 8.7 check-in: 702b3410f7 user: jan.nijtmans tags: trunk, main
2021-03-17
15:05
Merge 8.7 check-in: 341e1e54c4 user: jan.nijtmans tags: trunk, main
15:01
Simplify UtfToUtfProc() and UtfToUtf16Proc(): Using TclUtfToUCS4() internally means that we don't ha... check-in: 357301b2a8 user: jan.nijtmans tags: core-8-branch
09:07
Merge 8.7 check-in: f4fc2b9a0d user: jan.nijtmans tags: trunk, main
Changes
Unified Diff Ignore Whitespace Patch
Changes to generic/tclEncoding.c.
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227

static int
UtfToUtfProc(
    TCL_UNUSED(ClientData),
    const char *src,		/* Source string in UTF-8. */
    int srcLen,			/* Source string length in bytes. */
    int flags,			/* Conversion control flags. */
    Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
				 * information used during a piecewise
				 * conversion. Contents of statePtr are
				 * initialized and/or reset by conversion
				 * routine under control of flags argument. */
    char *dst,			/* Output buffer in which converted string is
				 * stored. */
    int dstLen,			/* The maximum length of output buffer in
				 * bytes. */
    int *srcReadPtr,		/* Filled with the number of bytes from the
				 * source string that were converted. This may
				 * be less than the original source length if







|
<
<
<
<







2209
2210
2211
2212
2213
2214
2215
2216




2217
2218
2219
2220
2221
2222
2223

static int
UtfToUtfProc(
    TCL_UNUSED(ClientData),
    const char *src,		/* Source string in UTF-8. */
    int srcLen,			/* Source string length in bytes. */
    int flags,			/* Conversion control flags. */
    TCL_UNUSED(Tcl_EncodingState *),




    char *dst,			/* Output buffer in which converted string is
				 * stored. */
    int dstLen,			/* The maximum length of output buffer in
				 * bytes. */
    int *srcReadPtr,		/* Filled with the number of bytes from the
				 * source string that were converted. This may
				 * be less than the original source length if
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
    int pureNullMode)		/* Convert embedded nulls from internal
				 * representation to real null-bytes or vice
				 * versa. Also combine or separate surrogate pairs */
{
    const char *srcStart, *srcEnd, *srcClose;
    const char *dstStart, *dstEnd;
    int result, numChars, charLimit = INT_MAX;
    int *chPtr = (int *) statePtr;

    if (flags & TCL_ENCODING_START) {
    	*statePtr = 0;
    }
    result = TCL_OK;

    srcStart = src;
    srcEnd = src + srcLen;
    srcClose = srcEnd;
    if ((flags & TCL_ENCODING_END) == 0) {
	srcClose -= 6;







<
|
<
<
|







2232
2233
2234
2235
2236
2237
2238

2239


2240
2241
2242
2243
2244
2245
2246
2247
    int pureNullMode)		/* Convert embedded nulls from internal
				 * representation to real null-bytes or vice
				 * versa. Also combine or separate surrogate pairs */
{
    const char *srcStart, *srcEnd, *srcClose;
    const char *dstStart, *dstEnd;
    int result, numChars, charLimit = INT_MAX;

    int ch;



    result = TCL_OK;

    srcStart = src;
    srcEnd = src + srcLen;
    srcClose = srcEnd;
    if ((flags & TCL_ENCODING_END) == 0) {
	srcClose -= 6;
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
	} else if (!Tcl_UtfCharComplete(src, srcEnd - src)) {
	    /*
	     * Always check before using TclUtfToUCS4. Not doing can so
	     * cause it run beyond the end of the buffer! If we happen such an
	     * incomplete char its bytes are made to represent themselves.
	     */

	    *chPtr = UCHAR(*src);
	    src += 1;
	    dst += Tcl_UniCharToUtf(*chPtr, dst);
	} else {
	    src += TclUtfToUCS4(src, chPtr);
	    if ((*chPtr | 0x7FF) == 0xDFFF) {
		/* A surrogate character is detected, handle especially */
		int low = *chPtr;
		size_t len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0;
		if (((low & ~0x3FF) != 0xDC00) || (*chPtr & 0x400)) {
			*dst++ = (char) (((*chPtr >> 12) | 0xE0) & 0xEF);
			*dst++ = (char) (((*chPtr >> 6) | 0x80) & 0xBF);
			*dst++ = (char) ((*chPtr | 0x80) & 0xBF);
			continue;
		}
		src += len;
		dst += Tcl_UniCharToUtf(*chPtr, dst);
		*chPtr = low;
	    }
	    dst += Tcl_UniCharToUtf(*chPtr, dst);
	}
    }

    *srcReadPtr = src - srcStart;
    *dstWrotePtr = dst - dstStart;
    *dstCharsPtr = numChars;
    return result;







|

|

|
|

|

|
|
|
|



|
|

|







2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
	} else if (!Tcl_UtfCharComplete(src, srcEnd - src)) {
	    /*
	     * Always check before using TclUtfToUCS4. Not doing can so
	     * cause it run beyond the end of the buffer! If we happen such an
	     * incomplete char its bytes are made to represent themselves.
	     */

	    ch = UCHAR(*src);
	    src += 1;
	    dst += Tcl_UniCharToUtf(ch, dst);
	} else {
	    src += TclUtfToUCS4(src, &ch);
	    if ((ch | 0x7FF) == 0xDFFF) {
		/* A surrogate character is detected, handle especially */
		int low = ch;
		size_t len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0;
		if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) {
			*dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF);
			*dst++ = (char) (((ch >> 6) | 0x80) & 0xBF);
			*dst++ = (char) ((ch | 0x80) & 0xBF);
			continue;
		}
		src += len;
		dst += Tcl_UniCharToUtf(ch, dst);
		ch = low;
	    }
	    dst += Tcl_UniCharToUtf(ch, dst);
	}
    }

    *srcReadPtr = src - srcStart;
    *dstWrotePtr = dst - dstStart;
    *dstCharsPtr = numChars;
    return result;
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479

static int
UtfToUtf16Proc(
    ClientData clientData,	/* != NULL means LE, == NUL means BE */
    const char *src,		/* Source string in UTF-8. */
    int srcLen,			/* Source string length in bytes. */
    int flags,			/* Conversion control flags. */
    Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
				 * information used during a piecewise
				 * conversion. Contents of statePtr are
				 * initialized and/or reset by conversion
				 * routine under control of flags argument. */
    char *dst,			/* Output buffer in which converted string is
				 * stored. */
    int dstLen,			/* The maximum length of output buffer in
				 * bytes. */
    int *srcReadPtr,		/* Filled with the number of bytes from the
				 * source string that were converted. This may
				 * be less than the original source length if
				 * there was a problem converting some source
				 * characters. */
    int *dstWrotePtr,		/* Filled with the number of bytes that were
				 * stored in the output buffer as a result of
				 * the conversion. */
    int *dstCharsPtr)		/* Filled with the number of characters that
				 * correspond to the bytes stored in the
				 * output buffer. */
{
    const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd;
    int result, numChars;
    Tcl_UniChar *chPtr = (Tcl_UniChar *) statePtr;

    if (flags & TCL_ENCODING_START) {
    	*statePtr = 0;
    }
    srcStart = src;
    srcEnd = src + srcLen;
    srcClose = srcEnd;
    if ((flags & TCL_ENCODING_END) == 0) {
	srcClose -= TCL_UTF_MAX;
    }








|
<
<
<
<


















<
|
<
<
|







2431
2432
2433
2434
2435
2436
2437
2438




2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456

2457


2458
2459
2460
2461
2462
2463
2464
2465

static int
UtfToUtf16Proc(
    ClientData clientData,	/* != NULL means LE, == NUL means BE */
    const char *src,		/* Source string in UTF-8. */
    int srcLen,			/* Source string length in bytes. */
    int flags,			/* Conversion control flags. */
    TCL_UNUSED(Tcl_EncodingState *),




    char *dst,			/* Output buffer in which converted string is
				 * stored. */
    int dstLen,			/* The maximum length of output buffer in
				 * bytes. */
    int *srcReadPtr,		/* Filled with the number of bytes from the
				 * source string that were converted. This may
				 * be less than the original source length if
				 * there was a problem converting some source
				 * characters. */
    int *dstWrotePtr,		/* Filled with the number of bytes that were
				 * stored in the output buffer as a result of
				 * the conversion. */
    int *dstCharsPtr)		/* Filled with the number of characters that
				 * correspond to the bytes stored in the
				 * output buffer. */
{
    const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd;
    int result, numChars;

    int ch;



    srcStart = src;
    srcEnd = src + srcLen;
    srcClose = srcEnd;
    if ((flags & TCL_ENCODING_END) == 0) {
	srcClose -= TCL_UTF_MAX;
    }

2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
	    result = TCL_CONVERT_MULTIBYTE;
	    break;
	}
	if (dst > dstEnd) {
	    result = TCL_CONVERT_NOSPACE;
	    break;
	}
	src += TclUtfToUniChar(src, chPtr);

	if (clientData) {
#if TCL_UTF_MAX > 3
	    if (*chPtr <= 0xFFFF) {
		*dst++ = (*chPtr & 0xFF);
		*dst++ = (*chPtr >> 8);
	    } else {
		*dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF);
		*dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8;
		*dst++ = (*chPtr & 0xFF);
		*dst++ = ((*chPtr >> 8) & 0x3) | 0xDC;
	    }
#else
	    *dst++ = (*chPtr & 0xFF);
	    *dst++ = (*chPtr >> 8);
#endif
	} else {
#if TCL_UTF_MAX > 3
	    if (*chPtr <= 0xFFFF) {
		*dst++ = (*chPtr >> 8);
		*dst++ = (*chPtr & 0xFF);
	    } else {
		*dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8;
		*dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF);
		*dst++ = ((*chPtr >> 8) & 0x3) | 0xDC;
		*dst++ = (*chPtr & 0xFF);
	    }
#else
	    *dst++ = (*chPtr >> 8);
	    *dst++ = (*chPtr & 0xFF);
#endif
	}
    }
    *srcReadPtr = src - srcStart;
    *dstWrotePtr = dst - dstStart;
    *dstCharsPtr = numChars;
    return result;
}







|
<

<
|
|
|

|
|
|
|

<
<
<
<

<
|
|
|

|
|
|
|

<
<
<
<







2477
2478
2479
2480
2481
2482
2483
2484

2485

2486
2487
2488
2489
2490
2491
2492
2493
2494




2495

2496
2497
2498
2499
2500
2501
2502
2503
2504




2505
2506
2507
2508
2509
2510
2511
	    result = TCL_CONVERT_MULTIBYTE;
	    break;
	}
	if (dst > dstEnd) {
	    result = TCL_CONVERT_NOSPACE;
	    break;
	}
	src += TclUtfToUCS4(src, &ch);

	if (clientData) {

	    if (ch <= 0xFFFF) {
		*dst++ = (ch & 0xFF);
		*dst++ = (ch >> 8);
	    } else {
		*dst++ = (((ch - 0x10000) >> 10) & 0xFF);
		*dst++ = (((ch - 0x10000) >> 18) & 0x3) | 0xD8;
		*dst++ = (ch & 0xFF);
		*dst++ = ((ch >> 8) & 0x3) | 0xDC;
	    }




	} else {

	    if (ch <= 0xFFFF) {
		*dst++ = (ch >> 8);
		*dst++ = (ch & 0xFF);
	    } else {
		*dst++ = (((ch - 0x10000) >> 18) & 0x3) | 0xD8;
		*dst++ = (((ch - 0x10000) >> 10) & 0xFF);
		*dst++ = ((ch >> 8) & 0x3) | 0xDC;
		*dst++ = (ch & 0xFF);
	    }




	}
    }
    *srcReadPtr = src - srcStart;
    *dstWrotePtr = dst - dstStart;
    *dstCharsPtr = numChars;
    return result;
}