Tcl Source Code

Check-in [cbaa5e7016]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment: Remove all checks for noncharacters
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | bug-17a1cb8d6e
Files: files | file ages | folders
SHA3-256: cbaa5e70167db75b07d12efb23a3fa76164f53ca55a2391236a0894a74a5682e
User & Date: jan.nijtmans 2022-12-19 22:19:59
References
2022-12-19
22:21 Pending ticket [17a1cb8d6e]: Tcl 9: "illegal byte sequence" ?! plus 5 other changes artifact: 6bfb791677 user: jan.nijtmans
Context
2022-12-20
09:32
Fix [17a1cb8d6e2a51bd]. From now on, noncharacters are no longer rejected in -strict mode check-in: aa03339d48 user: jan.nijtmans tags: core-8-branch
2022-12-19
22:19
Remove all checks for noncharacters Closed-Leaf check-in: cbaa5e7016 user: jan.nijtmans tags: bug-17a1cb8d6e
15:43
Add two testcases, showing how we can distinguish Invalid Byte sequences, Surrogates, and Noncharacters ... check-in: dbe0586aaa user: jan.nijtmans tags: core-8-branch
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to generic/tclEncoding.c.

2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
		    *dst++ = (char) (((ch >> 6) | 0x80) & 0xBF);
		    *dst++ = (char) ((ch | 0x80) & 0xBF);
		    continue;
		}
		src += len;
		dst += Tcl_UniCharToUtf(ch, dst);
		ch = low;
	    } else if (STOPONERROR && !(flags & TCL_ENCODING_MODIFIED) && !Tcl_UniCharIsUnicode(ch)
		    && (((ch  & ~0x7FF) == 0xD800) || ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT))) {
		result = TCL_CONVERT_UNKNOWN;
		src = saveSrc;
		break;
	    } else if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)
		    && (flags & TCL_ENCODING_MODIFIED) && !Tcl_UniCharIsUnicode(ch)) {
		result = TCL_CONVERT_SYNTAX;
		src = saveSrc;
		break;
	    }
	    dst += Tcl_UniCharToUtf(ch, dst);
	}
    }







|
<




|







2464
2465
2466
2467
2468
2469
2470
2471

2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
		    *dst++ = (char) (((ch >> 6) | 0x80) & 0xBF);
		    *dst++ = (char) ((ch | 0x80) & 0xBF);
		    continue;
		}
		src += len;
		dst += Tcl_UniCharToUtf(ch, dst);
		ch = low;
	    } else if (STOPONERROR && !(flags & TCL_ENCODING_MODIFIED) && (((ch  & ~0x7FF) == 0xD800))) {

		result = TCL_CONVERT_UNKNOWN;
		src = saveSrc;
		break;
	    } else if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)
		    && (flags & TCL_ENCODING_MODIFIED) && ((ch  & ~0x7FF) == 0xD800)) {
		result = TCL_CONVERT_SYNTAX;
		src = saveSrc;
		break;
	    }
	    dst += Tcl_UniCharToUtf(ch, dst);
	}
    }
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576

	if (flags & TCL_ENCODING_LE) {
	    ch = (src[3] & 0xFF) << 24 | (src[2] & 0xFF) << 16 | (src[1] & 0xFF) << 8 | (src[0] & 0xFF);
	} else {
	    ch = (src[0] & 0xFF) << 24 | (src[1] & 0xFF) << 16 | (src[2] & 0xFF) << 8 | (src[3] & 0xFF);
	}
	if  ((unsigned)ch > 0x10FFFF || (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)
		&& !Tcl_UniCharIsUnicode(ch))) {
	    if (STOPONERROR) {
		result = TCL_CONVERT_SYNTAX;
		break;
	    }
	}

	/*







|







2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575

	if (flags & TCL_ENCODING_LE) {
	    ch = (src[3] & 0xFF) << 24 | (src[2] & 0xFF) << 16 | (src[1] & 0xFF) << 8 | (src[0] & 0xFF);
	} else {
	    ch = (src[0] & 0xFF) << 24 | (src[1] & 0xFF) << 16 | (src[2] & 0xFF) << 8 | (src[3] & 0xFF);
	}
	if  ((unsigned)ch > 0x10FFFF || (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)
		&& ((ch  & ~0x7FF) == 0xD800))) {
	    if (STOPONERROR) {
		result = TCL_CONVERT_SYNTAX;
		break;
	    }
	}

	/*
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
	    break;
	}
	if (dst > dstEnd) {
	    result = TCL_CONVERT_NOSPACE;
	    break;
	}
	len = TclUtfToUCS4(src, &ch);
	if (!Tcl_UniCharIsUnicode(ch) && (((ch  & ~0x7FF) == 0xD800) || ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT))) {
	    if (STOPONERROR) {
		result = TCL_CONVERT_UNKNOWN;
		break;
	    }
	}
	src += len;
	if (flags & TCL_ENCODING_LE) {







|







2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
	    break;
	}
	if (dst > dstEnd) {
	    result = TCL_CONVERT_NOSPACE;
	    break;
	}
	len = TclUtfToUCS4(src, &ch);
	if ((ch  & ~0x7FF) == 0xD800) {
	    if (STOPONERROR) {
		result = TCL_CONVERT_UNKNOWN;
		break;
	    }
	}
	src += len;
	if (flags & TCL_ENCODING_LE) {
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
	    break;
	}
	if (dst > dstEnd) {
	    result = TCL_CONVERT_NOSPACE;
	    break;
	}
	len = TclUtfToUCS4(src, &ch);
	if (!Tcl_UniCharIsUnicode(ch) && (((ch  & ~0x7FF) == 0xD800) || ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT))) {
	    if (STOPONERROR) {
		result = TCL_CONVERT_UNKNOWN;
		break;
	    }
	}
	src += len;
	if (flags & TCL_ENCODING_LE) {







|







2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
	    break;
	}
	if (dst > dstEnd) {
	    result = TCL_CONVERT_NOSPACE;
	    break;
	}
	len = TclUtfToUCS4(src, &ch);
	if ((ch  & ~0x7FF) == 0xD800) {
	    if (STOPONERROR) {
		result = TCL_CONVERT_UNKNOWN;
		break;
	    }
	}
	src += len;
	if (flags & TCL_ENCODING_LE) {

Changes to tests/encoding.test.

699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
} -returnCodes 1 -result {unexpected byte sequence starting at index 0: '\xF0'}
test encoding-24.28 {Parse invalid utf-8 with -strict} -body {
    encoding convertfrom -strict utf-8 "\xFF\x00\x00"
} -returnCodes 1 -result {unexpected byte sequence starting at index 0: '\xFF'}
test encoding-24.29 {Parse invalid utf-8} -body {
    encoding convertfrom utf-8 \xEF\xBF\xBF
} -result \uFFFF
test encoding-24.30 {Parse invalid utf-8 with -strict} -body {
    encoding convertfrom -strict utf-8 \xEF\xBF\xBF
} -returnCodes 1 -result {unexpected byte sequence starting at index 0: '\xEF'}
test encoding-24.31 {Parse invalid utf-8 with -nocomplain} -body {
    encoding convertfrom -nocomplain utf-8 \xEF\xBF\xBF
} -result \uFFFF
test encoding-24.32 {Try to generate invalid utf-8} -body {
    encoding convertto utf-8 \uFFFF
} -result \xEF\xBF\xBF
test encoding-24.33 {Try to generate invalid utf-8 with -strict} -body {
    encoding convertto -strict utf-8 \uFFFF
} -returnCodes 1 -result {unexpected character at index 0: 'U+00FFFF'}
test encoding-24.34 {Try to generate invalid utf-8 with -nocomplain} -body {
    encoding convertto -nocomplain utf-8 \uFFFF
} -result \xEF\xBF\xBF
test encoding-24.35 {Parse invalid utf-8} -constraints deprecated -body {
    encoding convertfrom utf-8 \xED\xA0\x80
} -result \uD800
test encoding-24.36 {Parse invalid utf-8 with -strict} -body {







|

|






|

|







699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
} -returnCodes 1 -result {unexpected byte sequence starting at index 0: '\xF0'}
test encoding-24.28 {Parse invalid utf-8 with -strict} -body {
    encoding convertfrom -strict utf-8 "\xFF\x00\x00"
} -returnCodes 1 -result {unexpected byte sequence starting at index 0: '\xFF'}
test encoding-24.29 {Parse invalid utf-8} -body {
    encoding convertfrom utf-8 \xEF\xBF\xBF
} -result \uFFFF
test encoding-24.30 {Parse noncharacter with -strict} -body {
    encoding convertfrom -strict utf-8 \xEF\xBF\xBF
} -result \uFFFF
test encoding-24.31 {Parse invalid utf-8 with -nocomplain} -body {
    encoding convertfrom -nocomplain utf-8 \xEF\xBF\xBF
} -result \uFFFF
test encoding-24.32 {Try to generate invalid utf-8} -body {
    encoding convertto utf-8 \uFFFF
} -result \xEF\xBF\xBF
test encoding-24.33 {Try to generate noncharacter with -strict} -body {
    encoding convertto -strict utf-8 \uFFFF
} -result \xEF\xBF\xBF
test encoding-24.34 {Try to generate invalid utf-8 with -nocomplain} -body {
    encoding convertto -nocomplain utf-8 \uFFFF
} -result \xEF\xBF\xBF
test encoding-24.35 {Parse invalid utf-8} -constraints deprecated -body {
    encoding convertfrom utf-8 \xED\xA0\x80
} -result \uD800
test encoding-24.36 {Parse invalid utf-8 with -strict} -body {