Tcl Source Code

Check-in [7beae32e00]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Implement unicode is CLASS to match string is CLASS
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | tip-726-plus
Files: files | file ages | folders
SHA3-256: 7beae32e00845e1dc3a451532463bf845b08a121a8e31fe1da78be69cca8c8ce
User & Date: apnadkarni 2025-07-25 15:49:23.654
Context
2025-07-25
16:05
Disable tests for bug 1ecea011 check-in: 13c2631e68 user: apnadkarni tags: tip-726-plus
15:49
Implement unicode is CLASS to match string is CLASS check-in: 7beae32e00 user: apnadkarni tags: tip-726-plus
2025-07-23
16:28
Merged tip-726 check-in: d5fb960aa0 user: apnadkarni tags: tip-726-plus
Changes
Unified Diff Ignore Whitespace Patch
Changes to generic/tclCmdMZ.c.
5505
5506
5507
5508
5509
5510
5511

5512
5513
5514

5515
5516
5517
5518
5519
5520
5521
5522

    /*
     * When entering here, result == 1 and failat == 0.
     */

    switch (index) {
    case STR_IS_ALNUM:

	chcomp = Tcl_UniCharIsAlnum;
	break;
    case STR_IS_ALPHA:

	chcomp = Tcl_UniCharIsAlpha;
	break;
    case STR_IS_ASCII:
	chcomp = UniCharIsAscii;
	break;
    case STR_IS_BOOL:
    case STR_IS_TRUE:
    case STR_IS_FALSE:







>
|


>
|







5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524

    /*
     * When entering here, result == 1 and failat == 0.
     */

    switch (index) {
    case STR_IS_ALNUM:
	int Tcl_Utf8procIsAlnum(int ch);
	chcomp = Tcl_Utf8procIsAlnum;
	break;
    case STR_IS_ALPHA:
	int Tcl_Utf8procIsAlpha(int ch);
	chcomp = Tcl_Utf8procIsAlpha;
	break;
    case STR_IS_ASCII:
	chcomp = UniCharIsAscii;
	break;
    case STR_IS_BOOL:
    case STR_IS_TRUE:
    case STR_IS_FALSE:
5530
5531
5532
5533
5534
5535
5536

5537
5538
5539
5540
5541
5542
5543
5544
	    }
	} else if ((objPtr->internalRep.wideValue != 0)
		? (index == STR_IS_FALSE) : (index == STR_IS_TRUE)) {
	    result = 0;
	}
	break;
    case STR_IS_CONTROL:

	chcomp = Tcl_UniCharIsControl;
	break;
    case STR_IS_DICT: {
	int dresult;
	Tcl_Size dsize;

	dresult = Tcl_DictObjSize(interp, objPtr, &dsize);
	Tcl_ResetResult(interp);







>
|







5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
	    }
	} else if ((objPtr->internalRep.wideValue != 0)
		? (index == STR_IS_FALSE) : (index == STR_IS_TRUE)) {
	    result = 0;
	}
	break;
    case STR_IS_CONTROL:
	int Tcl_Utf8procIsControl(int ch);
	chcomp = Tcl_Utf8procIsControl;
	break;
    case STR_IS_DICT: {
	int dresult;
	Tcl_Size dsize;

	dresult = Tcl_DictObjSize(interp, objPtr, &dsize);
	Tcl_ResetResult(interp);
5582
5583
5584
5585
5586
5587
5588

5589
5590

5591
5592
5593
5594
5595
5596
5597
		    break;
		}
	    }
	}
	break;
    }
    case STR_IS_DIGIT:

	chcomp = Tcl_UniCharIsDigit;
	break;

    case STR_IS_DOUBLE: {
	if (TclHasInternalRep(objPtr, &tclDoubleType) ||
		TclHasInternalRep(objPtr, &tclIntType) ||
		TclHasInternalRep(objPtr, &tclBignumType)) {
	    break;
	}
	string1 = TclGetStringFromObj(objPtr, &length1);







>
|

>







5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
		    break;
		}
	    }
	}
	break;
    }
    case STR_IS_DIGIT:
	int Tcl_Utf8procIsDigit(int ch);
	chcomp = Tcl_Utf8procIsDigit;
	break;

    case STR_IS_DOUBLE: {
	if (TclHasInternalRep(objPtr, &tclDoubleType) ||
		TclHasInternalRep(objPtr, &tclIntType) ||
		TclHasInternalRep(objPtr, &tclBignumType)) {
	    break;
	}
	string1 = TclGetStringFromObj(objPtr, &length1);
5612
5613
5614
5615
5616
5617
5618

5619
5620
5621
5622
5623
5624
5625
5626
		result = 0;
		TclFreeInternalRep(objPtr);
	    }
	}
	break;
    }
    case STR_IS_GRAPH:

	chcomp = Tcl_UniCharIsGraph;
	break;
    case STR_IS_INT:
    case STR_IS_ENTIER:
	if (TclHasInternalRep(objPtr, &tclIntType) ||
		TclHasInternalRep(objPtr, &tclBignumType)) {
	    break;
	}







>
|







5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
		result = 0;
		TclFreeInternalRep(objPtr);
	    }
	}
	break;
    }
    case STR_IS_GRAPH:
	int Tcl_Utf8procIsGraph(int ch);
	chcomp = Tcl_Utf8procIsGraph;
	break;
    case STR_IS_INT:
    case STR_IS_ENTIER:
	if (TclHasInternalRep(objPtr, &tclIntType) ||
		TclHasInternalRep(objPtr, &tclBignumType)) {
	    break;
	}
5763
5764
5765
5766
5767
5768
5769

5770
5771
5772

5773
5774
5775

5776
5777
5778

5779
5780
5781

5782
5783
5784

5785
5786
5787
5788
5789
5790
5791
5792
		    break;
		}
	    }
	}
	result = 0;
	break;
    case STR_IS_LOWER:

	chcomp = utf8proc_islower;
	break;
    case STR_IS_PRINT:

	chcomp = Tcl_UniCharIsPrint;
	break;
    case STR_IS_PUNCT:

	chcomp = Tcl_UniCharIsPunct;
	break;
    case STR_IS_SPACE:

	chcomp = Tcl_UniCharIsSpace;
	break;
    case STR_IS_UPPER:

	chcomp = utf8proc_isupper;
	break;
    case STR_IS_WORD:

	chcomp = Tcl_UniCharIsWordChar;
	break;
    case STR_IS_XDIGIT:
	chcomp = UniCharIsHexDigit;
	break;
    default:
	TCL_UNREACHABLE();
    }







>
|


>
|


>
|


>
|


>
|


>
|







5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
		    break;
		}
	    }
	}
	result = 0;
	break;
    case STR_IS_LOWER:
	int Tcl_Utf8procIsLower(int ch);
	chcomp = Tcl_Utf8procIsLower;
	break;
    case STR_IS_PRINT:
	int Tcl_Utf8procIsPrint(int ch);
	chcomp = Tcl_Utf8procIsPrint;
	break;
    case STR_IS_PUNCT:
	int Tcl_Utf8procIsPunct(int ch);
	chcomp = Tcl_Utf8procIsPunct;
	break;
    case STR_IS_SPACE:
	int Tcl_Utf8procIsSpace(int ch);
	chcomp = Tcl_Utf8procIsSpace;
	break;
    case STR_IS_UPPER:
	int Tcl_Utf8procIsUpper(int ch);
	chcomp = Tcl_Utf8procIsUpper;
	break;
    case STR_IS_WORD:
	int Tcl_Utf8procIsWordChar(int ch);
	chcomp = Tcl_Utf8procIsWordChar;
	break;
    case STR_IS_XDIGIT:
	chcomp = UniCharIsHexDigit;
	break;
    default:
	TCL_UNREACHABLE();
    }
5822
5823
5824
5825
5826
5827
5828







































5829
5830
5831
5832
5833
5834
5835
	if (Tcl_ObjSetVar2(interp, failVarObj, NULL, objPtr, TCL_LEAVE_ERR_MSG) == NULL) {
	    return TCL_ERROR;
	}
    }
    Tcl_SetObjResult(interp, Tcl_NewBooleanObj(result));
    return TCL_OK;
}








































/*
 * TclUnicodeNormalizeCmd --
 *
 *	This procedure implements the "unicode tonfc|tonfd|tonfkc|tonfkd"
 *	commands. See the user documentation for details on what it does.
 *







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
	if (Tcl_ObjSetVar2(interp, failVarObj, NULL, objPtr, TCL_LEAVE_ERR_MSG) == NULL) {
	    return TCL_ERROR;
	}
    }
    Tcl_SetObjResult(interp, Tcl_NewBooleanObj(result));
    return TCL_OK;
}

/*
 * TclUnicodeNormalizeCmd --
 *
 *	This procedure implements the "unicode tonfc|tonfd|tonfkc|tonfkd"
 *	commands. See the user documentation for details on what it does.
 *
 * Results:
 *	A standard Tcl result.
 *
 * Side effects:
 *	Stores the normalized string in the interpreter result.
 */
static int
TclUnicodeCategoryCmd(
    void *clientData,		/* TCL_{NFC,NFD,NFKC,NFKD} */
    Tcl_Interp *interp,		/* Current interpreter. */
    int objc,			/* Number of arguments. */
    Tcl_Obj *const objv[])	/* Argument objects. */
{
    if (objc != 2) {
	Tcl_WrongNumArgs(interp, 1, objv,
		"str");
	return TCL_ERROR;
    }
    Tcl_Obj *categoriesObj = Tcl_NewListObj(0, NULL);
    Tcl_Size length1, length2;
    const char *string = TclGetStringFromObj(objv[1], &length1);
    const char *end = string + length1;
    for (; string < end; string += length2) {
	int ucs4;

	length2 = TclUtfToUniChar(string, &ucs4);
	Tcl_ListObjAppendElement(interp, categoriesObj,
	    Tcl_NewStringObj(utf8proc_category_string(ucs4), -1));
    }
    Tcl_SetObjResult(interp, categoriesObj);
    return TCL_OK;
}

/*
 * TclUnicodeNormalizeCmd --
 *
 *	This procedure implements the "unicode tonfc|tonfd|tonfkc|tonfkd"
 *	commands. See the user documentation for details on what it does.
 *
5902
5903
5904
5905
5906
5907
5908

5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
{
    static const EnsembleImplMap unicodeImplMap[] = {
	{"tonfc", TclUnicodeNormalizeCmd, NULL, NULL, (void *)TCL_NFC, 0},
	{"tonfd", TclUnicodeNormalizeCmd, NULL, NULL, (void *)TCL_NFD, 0},
	{"tonfkc", TclUnicodeNormalizeCmd, NULL, NULL, (void *)TCL_NFKC, 0},
	{"tonfkd", TclUnicodeNormalizeCmd, NULL, NULL, (void *)TCL_NFKD, 0},
	{"is", TclUnicodeIsCmd, NULL, NULL, NULL, 0},

	{NULL, NULL, NULL, NULL, NULL, 0}
    };
    return TclMakeEnsemble(interp, "unicode", unicodeImplMap);
}


/*
 * Local Variables:
 * mode: c
 * c-basic-offset: 4
 * fill-column: 78
 * End:
 */







>













5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
{
    static const EnsembleImplMap unicodeImplMap[] = {
	{"tonfc", TclUnicodeNormalizeCmd, NULL, NULL, (void *)TCL_NFC, 0},
	{"tonfd", TclUnicodeNormalizeCmd, NULL, NULL, (void *)TCL_NFD, 0},
	{"tonfkc", TclUnicodeNormalizeCmd, NULL, NULL, (void *)TCL_NFKC, 0},
	{"tonfkd", TclUnicodeNormalizeCmd, NULL, NULL, (void *)TCL_NFKD, 0},
	{"is", TclUnicodeIsCmd, NULL, NULL, NULL, 0},
	{"category", TclUnicodeCategoryCmd, NULL, NULL, NULL, 0},
	{NULL, NULL, NULL, NULL, NULL, 0}
    };
    return TclMakeEnsemble(interp, "unicode", unicodeImplMap);
}


/*
 * Local Variables:
 * mode: c
 * c-basic-offset: 4
 * fill-column: 78
 * End:
 */
Changes to generic/tclUtf.c.
1
2
3
4
5
6
7
8
9
10
11
12

13
14
15
16
17
18
19
/*
 * tclUtf.c --
 *
 *	Routines for manipulating UTF-8 strings.
 *
 * Copyright © 1997-1998 Sun Microsystems, Inc.
 *
 * See the file "license.terms" for information on usage and redistribution of
 * this file, and for a DISCLAIMER OF ALL WARRANTIES.
 */

#include "tclInt.h"


/*
 * Include the static character classification tables and macros.
 */

#include "tclUniData.c"













>







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
/*
 * tclUtf.c --
 *
 *	Routines for manipulating UTF-8 strings.
 *
 * Copyright © 1997-1998 Sun Microsystems, Inc.
 *
 * See the file "license.terms" for information on usage and redistribution of
 * this file, and for a DISCLAIMER OF ALL WARRANTIES.
 */

#include "tclInt.h"
#include "../utf8proc/utf8proc.h"

/*
 * Include the static character classification tables and macros.
 */

#include "tclUniData.c"

44
45
46
47
48
49
50




































51
52
53
54
55
56
57
    GRAPH_BITS = WORD_BITS | PUNCT_BITS |
	(1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) |
	(1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) |
	(1 << OTHER_NUMBER) |
	(1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) |
	(1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL)
};





































/*
 * Unicode characters less than this value are represented by themselves in
 * UTF-8 strings.
 */

#define UNICODE_SELF	0x80







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
    GRAPH_BITS = WORD_BITS | PUNCT_BITS |
	(1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) |
	(1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) |
	(1 << OTHER_NUMBER) |
	(1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) |
	(1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL)
};

enum Utf8ProcCharacterCategoryMasks {
    UTF8PROC_ALPHA_BITS =
	(1 << UTF8PROC_CATEGORY_LU) | (1 << UTF8PROC_CATEGORY_LL) |
	(1 << UTF8PROC_CATEGORY_LT) | (1 << UTF8PROC_CATEGORY_LM) |
	(1 << UTF8PROC_CATEGORY_LO),

    UTF8PROC_CONTROL_BITS =
	(1 << UTF8PROC_CATEGORY_CC) | (1 << UTF8PROC_CATEGORY_CF),

    UTF8PROC_DIGIT_BITS = (1 << UTF8PROC_CATEGORY_ND),

    UTF8PROC_SPACE_BITS = (1 << UTF8PROC_CATEGORY_ZS) |
			  (1 << UTF8PROC_CATEGORY_ZL) |
			  (1 << UTF8PROC_CATEGORY_ZP),

    UTF8PROC_WORD_BITS =
	UTF8PROC_ALPHA_BITS | UTF8PROC_DIGIT_BITS | (1 << UTF8PROC_CATEGORY_PC),

    UTF8PROC_PUNCT_BITS =
	(1 << UTF8PROC_CATEGORY_PC) | (1 << UTF8PROC_CATEGORY_PD) |
	(1 << UTF8PROC_CATEGORY_PS) | (1 << UTF8PROC_CATEGORY_PE) |
	(1 << UTF8PROC_CATEGORY_PI) | (1 << UTF8PROC_CATEGORY_PF) |
	(1 << UTF8PROC_CATEGORY_PO),

    UTF8PROC_GRAPH_BITS = UTF8PROC_WORD_BITS | UTF8PROC_PUNCT_BITS |
			  (1 << UTF8PROC_CATEGORY_MN) |
			  (1 << UTF8PROC_CATEGORY_MC) |
			  (1 << UTF8PROC_CATEGORY_ME) |
			  (1 << UTF8PROC_CATEGORY_NL) |
			  (1 << UTF8PROC_CATEGORY_NO) |
			  (1 << UTF8PROC_CATEGORY_SM) |
			  (1 << UTF8PROC_CATEGORY_SC) |
			  (1 << UTF8PROC_CATEGORY_SK) |
			  (1 << UTF8PROC_CATEGORY_SO)
};

/*
 * Unicode characters less than this value are represented by themselves in
 * UTF-8 strings.
 */

#define UNICODE_SELF	0x80
2022
2023
2024
2025
2026
2027
2028






2029
2030
2031
2032
2033
2034
2035
    int ch)			/* Unicode character to test. */
{
    if (UNICODE_OUT_OF_RANGE(ch)) {
	return 0;
    }
    return (((ALPHA_BITS | DIGIT_BITS) >> GetCategory(ch)) & 1);
}







/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsAlpha --
 *
 *	Test if a character is an alphabetic Unicode character.







>
>
>
>
>
>







2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
    int ch)			/* Unicode character to test. */
{
    if (UNICODE_OUT_OF_RANGE(ch)) {
	return 0;
    }
    return (((ALPHA_BITS | DIGIT_BITS) >> GetCategory(ch)) & 1);
}
int
Tcl_Utf8procIsAlnum(
    int ch)			/* Unicode character to test. */
{
    return ((1 << utf8proc_category(ch)) & (UTF8PROC_ALPHA_BITS|UTF8PROC_DIGIT_BITS)) != 0;
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsAlpha --
 *
 *	Test if a character is an alphabetic Unicode character.
2048
2049
2050
2051
2052
2053
2054






2055
2056
2057
2058
2059
2060
2061
    int ch)			/* Unicode character to test. */
{
    if (UNICODE_OUT_OF_RANGE(ch)) {
	return 0;
    }
    return ((ALPHA_BITS >> GetCategory(ch)) & 1);
}







/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsControl --
 *
 *	Test if a character is a Unicode control character.







>
>
>
>
>
>







2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
    int ch)			/* Unicode character to test. */
{
    if (UNICODE_OUT_OF_RANGE(ch)) {
	return 0;
    }
    return ((ALPHA_BITS >> GetCategory(ch)) & 1);
}
int
Tcl_Utf8procIsAlpha(
    int ch)			/* Unicode character to test. */
{
    return ((1 << utf8proc_category(ch)) & UTF8PROC_ALPHA_BITS) != 0;
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsControl --
 *
 *	Test if a character is a Unicode control character.
2076
2077
2078
2079
2080
2081
2082






2083
2084
2085
2086
2087
2088
2089
    if (UNICODE_OUT_OF_RANGE(ch)) {
	/* Clear away extension bits, if any */
	ch &= 0x1FFFFF;
	return ((ch == 0xE0001) || ((unsigned)(ch - 0xE0020) <= 0x5F));
    }
    return ((CONTROL_BITS >> GetCategory(ch)) & 1);
}







/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsDigit --
 *
 *	Test if a character is a numeric Unicode character.







>
>
>
>
>
>







2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
    if (UNICODE_OUT_OF_RANGE(ch)) {
	/* Clear away extension bits, if any */
	ch &= 0x1FFFFF;
	return ((ch == 0xE0001) || ((unsigned)(ch - 0xE0020) <= 0x5F));
    }
    return ((CONTROL_BITS >> GetCategory(ch)) & 1);
}
int
Tcl_Utf8procIsControl(
    int ch)			/* Unicode character to test. */
{
    return ((1 << utf8proc_category(ch)) & UTF8PROC_CONTROL_BITS) != 0;
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsDigit --
 *
 *	Test if a character is a numeric Unicode character.
2102
2103
2104
2105
2106
2107
2108






2109
2110
2111
2112
2113
2114
2115
    int ch)			/* Unicode character to test. */
{
    if (UNICODE_OUT_OF_RANGE(ch)) {
	return 0;
    }
    return (GetCategory(ch) == DECIMAL_DIGIT_NUMBER);
}







/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsGraph --
 *
 *	Test if a character is any Unicode print character except space.







>
>
>
>
>
>







2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
    int ch)			/* Unicode character to test. */
{
    if (UNICODE_OUT_OF_RANGE(ch)) {
	return 0;
    }
    return (GetCategory(ch) == DECIMAL_DIGIT_NUMBER);
}
int
Tcl_Utf8procIsDigit(
    int ch)			/* Unicode character to test. */
{
    return (utf8proc_category(ch) == UTF8PROC_CATEGORY_ND);
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsGraph --
 *
 *	Test if a character is any Unicode print character except space.
2128
2129
2130
2131
2132
2133
2134






2135
2136
2137
2138
2139
2140
2141
    int ch)			/* Unicode character to test. */
{
    if (UNICODE_OUT_OF_RANGE(ch)) {
	return ((unsigned)((ch & 0x1FFFFF) - 0xE0100) <= 0xEF);
    }
    return ((GRAPH_BITS >> GetCategory(ch)) & 1);
}







/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsLower --
 *
 *	Test if a character is a lowercase Unicode character.







>
>
>
>
>
>







2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
    int ch)			/* Unicode character to test. */
{
    if (UNICODE_OUT_OF_RANGE(ch)) {
	return ((unsigned)((ch & 0x1FFFFF) - 0xE0100) <= 0xEF);
    }
    return ((GRAPH_BITS >> GetCategory(ch)) & 1);
}
int
Tcl_Utf8procIsGraph(
    int ch)			/* Unicode character to test. */
{
    return ((1 << utf8proc_category(ch)) & UTF8PROC_GRAPH_BITS) != 0;
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsLower --
 *
 *	Test if a character is a lowercase Unicode character.
2154
2155
2156
2157
2158
2159
2160






2161
2162
2163
2164
2165
2166
2167
    int ch)			/* Unicode character to test. */
{
    if (UNICODE_OUT_OF_RANGE(ch)) {
	return 0;
    }
    return (GetCategory(ch) == LOWERCASE_LETTER);
}







/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsPrint --
 *
 *	Test if a character is a Unicode print character.







>
>
>
>
>
>







2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
    int ch)			/* Unicode character to test. */
{
    if (UNICODE_OUT_OF_RANGE(ch)) {
	return 0;
    }
    return (GetCategory(ch) == LOWERCASE_LETTER);
}
int
Tcl_Utf8procIsLower(
    int ch)			/* Unicode character to test. */
{
    return (utf8proc_category(ch) == UTF8PROC_CATEGORY_LL);
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsPrint --
 *
 *	Test if a character is a Unicode print character.
2180
2181
2182
2183
2184
2185
2186






2187
2188
2189
2190
2191
2192
2193
    int ch)			/* Unicode character to test. */
{
    if (UNICODE_OUT_OF_RANGE(ch)) {
	return ((unsigned)((ch & 0x1FFFFF) - 0xE0100) <= 0xEF);
    }
    return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1);
}







/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsPunct --
 *
 *	Test if a character is a Unicode punctuation character.







>
>
>
>
>
>







2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
    int ch)			/* Unicode character to test. */
{
    if (UNICODE_OUT_OF_RANGE(ch)) {
	return ((unsigned)((ch & 0x1FFFFF) - 0xE0100) <= 0xEF);
    }
    return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1);
}
int
Tcl_Utf8procIsPrint(
    int ch)			/* Unicode character to test. */
{
    return ((1 << utf8proc_category(ch)) & (UTF8PROC_SPACE_BITS|UTF8PROC_GRAPH_BITS)) != 0;
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsPunct --
 *
 *	Test if a character is a Unicode punctuation character.
2206
2207
2208
2209
2210
2211
2212






2213
2214
2215
2216
2217
2218
2219
    int ch)			/* Unicode character to test. */
{
    if (UNICODE_OUT_OF_RANGE(ch)) {
	return 0;
    }
    return ((PUNCT_BITS >> GetCategory(ch)) & 1);
}







/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsSpace --
 *
 *	Test if a character is a whitespace Unicode character.







>
>
>
>
>
>







2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
    int ch)			/* Unicode character to test. */
{
    if (UNICODE_OUT_OF_RANGE(ch)) {
	return 0;
    }
    return ((PUNCT_BITS >> GetCategory(ch)) & 1);
}
int
Tcl_Utf8procIsPunct(
    int ch)			/* Unicode character to test. */
{
    return ((1 << utf8proc_category(ch)) & UTF8PROC_PUNCT_BITS) != 0;
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsSpace --
 *
 *	Test if a character is a whitespace Unicode character.
2246
2247
2248
2249
2250
2251
2252























2253
2254
2255
2256
2257
2258
2259
    } else if (ch == 0x0085 || ch == 0x180E || ch == 0x200B
	    || ch == 0x202F || ch == 0x2060 || ch == 0xFEFF) {
	return 1;
    } else {
	return ((SPACE_BITS >> GetCategory(ch)) & 1);
    }
}
























/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsUpper --
 *
 *	Test if a character is a uppercase Unicode character.







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
    } else if (ch == 0x0085 || ch == 0x180E || ch == 0x200B
	    || ch == 0x202F || ch == 0x2060 || ch == 0xFEFF) {
	return 1;
    } else {
	return ((SPACE_BITS >> GetCategory(ch)) & 1);
    }
}
int
Tcl_Utf8procIsSpace(
    int ch)			/* Unicode character to test. */
{
    /* Ignore upper 11 bits. */
    ch &= 0x1FFFFF;

    /*
     * If the character is within the first 127 characters, just use the
     * standard C function, otherwise consult the Unicode table.
     */

    if (ch < 0x80) {
	return TclIsSpaceProcM((char) ch);
    } else if (UNICODE_OUT_OF_RANGE(ch)) {
	return 0;
    } else if (ch == 0x0085 || ch == 0x180E || ch == 0x200B
	    || ch == 0x202F || ch == 0x2060 || ch == 0xFEFF) {
	return 1;
    } else {
	return ((1 << utf8proc_category(ch)) & UTF8PROC_SPACE_BITS) != 0;
    }
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsUpper --
 *
 *	Test if a character is a uppercase Unicode character.
2272
2273
2274
2275
2276
2277
2278






2279
2280
2281
2282
2283
2284
2285
    int ch)			/* Unicode character to test. */
{
    if (UNICODE_OUT_OF_RANGE(ch)) {
	return 0;
    }
    return (GetCategory(ch) == UPPERCASE_LETTER);
}







/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsWordChar --
 *
 *	Test if a character is alphanumeric or a connector punctuation mark.







>
>
>
>
>
>







2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
    int ch)			/* Unicode character to test. */
{
    if (UNICODE_OUT_OF_RANGE(ch)) {
	return 0;
    }
    return (GetCategory(ch) == UPPERCASE_LETTER);
}
int
Tcl_Utf8procIsUpper(
    int ch)			/* Unicode character to test. */
{
    return (utf8proc_category(ch) == UTF8PROC_CATEGORY_LU);
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsWordChar --
 *
 *	Test if a character is alphanumeric or a connector punctuation mark.
2298
2299
2300
2301
2302
2303
2304






2305
2306
2307
2308
2309
2310
2311
    int ch)			/* Unicode character to test. */
{
    if (UNICODE_OUT_OF_RANGE(ch)) {
	return 0;
    }
    return ((WORD_BITS >> GetCategory(ch)) & 1);
}







/*
 *----------------------------------------------------------------------
 *
 * TclUniCharCaseMatch --
 *
 *	See if a particular Unicode string matches a particular pattern.







>
>
>
>
>
>







2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
    int ch)			/* Unicode character to test. */
{
    if (UNICODE_OUT_OF_RANGE(ch)) {
	return 0;
    }
    return ((WORD_BITS >> GetCategory(ch)) & 1);
}
int
Tcl_Utf8procIsWordChar(
    int ch)			/* Unicode character to test. */
{
    return ((1 << utf8proc_category(ch)) & UTF8PROC_WORD_BITS) != 0;
}

/*
 *----------------------------------------------------------------------
 *
 * TclUniCharCaseMatch --
 *
 *	See if a particular Unicode string matches a particular pattern.
Changes to tests/unicodeProperties.test.
95
96
97
98
99
100
101




















102
103
104
105
106
    } -constraints ucdproperties -result {Upper case mismatches:}

    test unicode-is-upper-outofrange "unicode is upper out of range" -cleanup {
        testobj freeallvars
    } -body {
    	unicode is upper [teststringobj newunicode 1 0x110000]
    } -constraints teststringobj -result 0




















}

::tcltest::cleanupTests
namespace delete unicode::test
return







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>





95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
    } -constraints ucdproperties -result {Upper case mismatches:}

    test unicode-is-upper-outofrange "unicode is upper out of range" -cleanup {
        testobj freeallvars
    } -body {
    	unicode is upper [teststringobj newunicode 1 0x110000]
    } -constraints teststringobj -result 0

	###
    # Compatibility tests between the string and unicode commands.
	proc testStringUnicodeCompatibility {class} {
        set mismatches "is $class mismatches:"
        foreach codePoint [lseq 0 $::tcltests::ucd::maxCodepoint] {
            set ch [format %c $codePoint]
            if {[string is $class $ch] != [unicode is $class $ch]} {
                append mismatches " " U+[format %x $codePoint]
                break
            }
        }
        return $mismatches
    }

    foreach class {alpha alnum control digit graph lower print space upper wordchar} {
    	test string-vs-unicode-is-$class "string is $class vs unicode" -body {
            testStringUnicodeCompatibility $class
		} -result "is $class mismatches:"
    }
}

::tcltest::cleanupTests
namespace delete unicode::test
return