Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | Implement unicode is CLASS to match string is CLASS |
---|---|
Downloads: | Tarball | ZIP archive |
Timelines: | family | ancestors | descendants | both | tip-726-plus |
Files: | files | file ages | folders |
SHA3-256: |
7beae32e00845e1dc3a451532463bf84 |
User & Date: | apnadkarni 2025-07-25 15:49:23.654 |
Context
2025-07-25
| ||
16:05 | Disable tests for bug 1ecea011 check-in: 13c2631e68 user: apnadkarni tags: tip-726-plus | |
15:49 | Implement unicode is CLASS to match string is CLASS check-in: 7beae32e00 user: apnadkarni tags: tip-726-plus | |
2025-07-23
| ||
16:28 | Merged tip-726 check-in: d5fb960aa0 user: apnadkarni tags: tip-726-plus | |
Changes
Changes to generic/tclCmdMZ.c.
︙ | ︙ | |||
5505 5506 5507 5508 5509 5510 5511 | /* * When entering here, result == 1 and failat == 0. */ switch (index) { case STR_IS_ALNUM: | > | > | | 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 | /* * When entering here, result == 1 and failat == 0. */ switch (index) { case STR_IS_ALNUM: int Tcl_Utf8procIsAlnum(int ch); chcomp = Tcl_Utf8procIsAlnum; break; case STR_IS_ALPHA: int Tcl_Utf8procIsAlpha(int ch); chcomp = Tcl_Utf8procIsAlpha; break; case STR_IS_ASCII: chcomp = UniCharIsAscii; break; case STR_IS_BOOL: case STR_IS_TRUE: case STR_IS_FALSE: |
︙ | ︙ | |||
5530 5531 5532 5533 5534 5535 5536 | } } else if ((objPtr->internalRep.wideValue != 0) ? (index == STR_IS_FALSE) : (index == STR_IS_TRUE)) { result = 0; } break; case STR_IS_CONTROL: | > | | 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 | } } else if ((objPtr->internalRep.wideValue != 0) ? (index == STR_IS_FALSE) : (index == STR_IS_TRUE)) { result = 0; } break; case STR_IS_CONTROL: int Tcl_Utf8procIsControl(int ch); chcomp = Tcl_Utf8procIsControl; break; case STR_IS_DICT: { int dresult; Tcl_Size dsize; dresult = Tcl_DictObjSize(interp, objPtr, &dsize); Tcl_ResetResult(interp); |
︙ | ︙ | |||
5582 5583 5584 5585 5586 5587 5588 | break; } } } break; } case STR_IS_DIGIT: | > | > | 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 | break; } } } break; } case STR_IS_DIGIT: int Tcl_Utf8procIsDigit(int ch); chcomp = Tcl_Utf8procIsDigit; break; case STR_IS_DOUBLE: { if (TclHasInternalRep(objPtr, &tclDoubleType) || TclHasInternalRep(objPtr, &tclIntType) || TclHasInternalRep(objPtr, &tclBignumType)) { break; } string1 = TclGetStringFromObj(objPtr, &length1); |
︙ | ︙ | |||
5612 5613 5614 5615 5616 5617 5618 | result = 0; TclFreeInternalRep(objPtr); } } break; } case STR_IS_GRAPH: | > | | 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 | result = 0; TclFreeInternalRep(objPtr); } } break; } case STR_IS_GRAPH: int Tcl_Utf8procIsGraph(int ch); chcomp = Tcl_Utf8procIsGraph; break; case STR_IS_INT: case STR_IS_ENTIER: if (TclHasInternalRep(objPtr, &tclIntType) || TclHasInternalRep(objPtr, &tclBignumType)) { break; } |
︙ | ︙ | |||
5763 5764 5765 5766 5767 5768 5769 | break; } } } result = 0; break; case STR_IS_LOWER: | > | > | > | > | > | > | | 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 | break; } } } result = 0; break; case STR_IS_LOWER: int Tcl_Utf8procIsLower(int ch); chcomp = Tcl_Utf8procIsLower; break; case STR_IS_PRINT: int Tcl_Utf8procIsPrint(int ch); chcomp = Tcl_Utf8procIsPrint; break; case STR_IS_PUNCT: int Tcl_Utf8procIsPunct(int ch); chcomp = Tcl_Utf8procIsPunct; break; case STR_IS_SPACE: int Tcl_Utf8procIsSpace(int ch); chcomp = Tcl_Utf8procIsSpace; break; case STR_IS_UPPER: int Tcl_Utf8procIsUpper(int ch); chcomp = Tcl_Utf8procIsUpper; break; case STR_IS_WORD: int Tcl_Utf8procIsWordChar(int ch); chcomp = Tcl_Utf8procIsWordChar; break; case STR_IS_XDIGIT: chcomp = UniCharIsHexDigit; break; default: TCL_UNREACHABLE(); } |
︙ | ︙ | |||
5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 | if (Tcl_ObjSetVar2(interp, failVarObj, NULL, objPtr, TCL_LEAVE_ERR_MSG) == NULL) { return TCL_ERROR; } } Tcl_SetObjResult(interp, Tcl_NewBooleanObj(result)); return TCL_OK; } /* * TclUnicodeNormalizeCmd -- * * This procedure implements the "unicode tonfc|tonfd|tonfkc|tonfkd" * commands. See the user documentation for details on what it does. * | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 | if (Tcl_ObjSetVar2(interp, failVarObj, NULL, objPtr, TCL_LEAVE_ERR_MSG) == NULL) { return TCL_ERROR; } } Tcl_SetObjResult(interp, Tcl_NewBooleanObj(result)); return TCL_OK; } /* * TclUnicodeNormalizeCmd -- * * This procedure implements the "unicode tonfc|tonfd|tonfkc|tonfkd" * commands. See the user documentation for details on what it does. * * Results: * A standard Tcl result. * * Side effects: * Stores the normalized string in the interpreter result. */ static int TclUnicodeCategoryCmd( void *clientData, /* TCL_{NFC,NFD,NFKC,NFKD} */ Tcl_Interp *interp, /* Current interpreter. */ int objc, /* Number of arguments. */ Tcl_Obj *const objv[]) /* Argument objects. */ { if (objc != 2) { Tcl_WrongNumArgs(interp, 1, objv, "str"); return TCL_ERROR; } Tcl_Obj *categoriesObj = Tcl_NewListObj(0, NULL); Tcl_Size length1, length2; const char *string = TclGetStringFromObj(objv[1], &length1); const char *end = string + length1; for (; string < end; string += length2) { int ucs4; length2 = TclUtfToUniChar(string, &ucs4); Tcl_ListObjAppendElement(interp, categoriesObj, Tcl_NewStringObj(utf8proc_category_string(ucs4), -1)); } Tcl_SetObjResult(interp, categoriesObj); return TCL_OK; } /* * TclUnicodeNormalizeCmd -- * * This procedure implements the "unicode tonfc|tonfd|tonfkc|tonfkd" * commands. See the user documentation for details on what it does. * |
︙ | ︙ | |||
5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 | { static const EnsembleImplMap unicodeImplMap[] = { {"tonfc", TclUnicodeNormalizeCmd, NULL, NULL, (void *)TCL_NFC, 0}, {"tonfd", TclUnicodeNormalizeCmd, NULL, NULL, (void *)TCL_NFD, 0}, {"tonfkc", TclUnicodeNormalizeCmd, NULL, NULL, (void *)TCL_NFKC, 0}, {"tonfkd", TclUnicodeNormalizeCmd, NULL, NULL, (void *)TCL_NFKD, 0}, {"is", TclUnicodeIsCmd, NULL, NULL, NULL, 0}, {NULL, NULL, NULL, NULL, NULL, 0} }; return TclMakeEnsemble(interp, "unicode", unicodeImplMap); } /* * Local Variables: * mode: c * c-basic-offset: 4 * fill-column: 78 * End: */ | > | 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 | { static const EnsembleImplMap unicodeImplMap[] = { {"tonfc", TclUnicodeNormalizeCmd, NULL, NULL, (void *)TCL_NFC, 0}, {"tonfd", TclUnicodeNormalizeCmd, NULL, NULL, (void *)TCL_NFD, 0}, {"tonfkc", TclUnicodeNormalizeCmd, NULL, NULL, (void *)TCL_NFKC, 0}, {"tonfkd", TclUnicodeNormalizeCmd, NULL, NULL, (void *)TCL_NFKD, 0}, {"is", TclUnicodeIsCmd, NULL, NULL, NULL, 0}, {"category", TclUnicodeCategoryCmd, NULL, NULL, NULL, 0}, {NULL, NULL, NULL, NULL, NULL, 0} }; return TclMakeEnsemble(interp, "unicode", unicodeImplMap); } /* * Local Variables: * mode: c * c-basic-offset: 4 * fill-column: 78 * End: */ |
Changes to generic/tclUtf.c.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 | /* * tclUtf.c -- * * Routines for manipulating UTF-8 strings. * * Copyright © 1997-1998 Sun Microsystems, Inc. * * See the file "license.terms" for information on usage and redistribution of * this file, and for a DISCLAIMER OF ALL WARRANTIES. */ #include "tclInt.h" /* * Include the static character classification tables and macros. */ #include "tclUniData.c" | > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 | /* * tclUtf.c -- * * Routines for manipulating UTF-8 strings. * * Copyright © 1997-1998 Sun Microsystems, Inc. * * See the file "license.terms" for information on usage and redistribution of * this file, and for a DISCLAIMER OF ALL WARRANTIES. */ #include "tclInt.h" #include "../utf8proc/utf8proc.h" /* * Include the static character classification tables and macros. */ #include "tclUniData.c" |
︙ | ︙ | |||
44 45 46 47 48 49 50 51 52 53 54 55 56 57 | GRAPH_BITS = WORD_BITS | PUNCT_BITS | (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | (1 << OTHER_NUMBER) | (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL) }; /* * Unicode characters less than this value are represented by themselves in * UTF-8 strings. */ #define UNICODE_SELF 0x80 | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 | GRAPH_BITS = WORD_BITS | PUNCT_BITS | (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | (1 << OTHER_NUMBER) | (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL) }; enum Utf8ProcCharacterCategoryMasks { UTF8PROC_ALPHA_BITS = (1 << UTF8PROC_CATEGORY_LU) | (1 << UTF8PROC_CATEGORY_LL) | (1 << UTF8PROC_CATEGORY_LT) | (1 << UTF8PROC_CATEGORY_LM) | (1 << UTF8PROC_CATEGORY_LO), UTF8PROC_CONTROL_BITS = (1 << UTF8PROC_CATEGORY_CC) | (1 << UTF8PROC_CATEGORY_CF), UTF8PROC_DIGIT_BITS = (1 << UTF8PROC_CATEGORY_ND), UTF8PROC_SPACE_BITS = (1 << UTF8PROC_CATEGORY_ZS) | (1 << UTF8PROC_CATEGORY_ZL) | (1 << UTF8PROC_CATEGORY_ZP), UTF8PROC_WORD_BITS = UTF8PROC_ALPHA_BITS | UTF8PROC_DIGIT_BITS | (1 << UTF8PROC_CATEGORY_PC), UTF8PROC_PUNCT_BITS = (1 << UTF8PROC_CATEGORY_PC) | (1 << UTF8PROC_CATEGORY_PD) | (1 << UTF8PROC_CATEGORY_PS) | (1 << UTF8PROC_CATEGORY_PE) | (1 << UTF8PROC_CATEGORY_PI) | (1 << UTF8PROC_CATEGORY_PF) | (1 << UTF8PROC_CATEGORY_PO), UTF8PROC_GRAPH_BITS = UTF8PROC_WORD_BITS | UTF8PROC_PUNCT_BITS | (1 << UTF8PROC_CATEGORY_MN) | (1 << UTF8PROC_CATEGORY_MC) | (1 << UTF8PROC_CATEGORY_ME) | (1 << UTF8PROC_CATEGORY_NL) | (1 << UTF8PROC_CATEGORY_NO) | (1 << UTF8PROC_CATEGORY_SM) | (1 << UTF8PROC_CATEGORY_SC) | (1 << UTF8PROC_CATEGORY_SK) | (1 << UTF8PROC_CATEGORY_SO) }; /* * Unicode characters less than this value are represented by themselves in * UTF-8 strings. */ #define UNICODE_SELF 0x80 |
︙ | ︙ | |||
2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 | int ch) /* Unicode character to test. */ { if (UNICODE_OUT_OF_RANGE(ch)) { return 0; } return (((ALPHA_BITS | DIGIT_BITS) >> GetCategory(ch)) & 1); } /* *---------------------------------------------------------------------- * * Tcl_UniCharIsAlpha -- * * Test if a character is an alphabetic Unicode character. | > > > > > > | 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 | int ch) /* Unicode character to test. */ { if (UNICODE_OUT_OF_RANGE(ch)) { return 0; } return (((ALPHA_BITS | DIGIT_BITS) >> GetCategory(ch)) & 1); } int Tcl_Utf8procIsAlnum( int ch) /* Unicode character to test. */ { return ((1 << utf8proc_category(ch)) & (UTF8PROC_ALPHA_BITS|UTF8PROC_DIGIT_BITS)) != 0; } /* *---------------------------------------------------------------------- * * Tcl_UniCharIsAlpha -- * * Test if a character is an alphabetic Unicode character. |
︙ | ︙ | |||
2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 | int ch) /* Unicode character to test. */ { if (UNICODE_OUT_OF_RANGE(ch)) { return 0; } return ((ALPHA_BITS >> GetCategory(ch)) & 1); } /* *---------------------------------------------------------------------- * * Tcl_UniCharIsControl -- * * Test if a character is a Unicode control character. | > > > > > > | 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 | int ch) /* Unicode character to test. */ { if (UNICODE_OUT_OF_RANGE(ch)) { return 0; } return ((ALPHA_BITS >> GetCategory(ch)) & 1); } int Tcl_Utf8procIsAlpha( int ch) /* Unicode character to test. */ { return ((1 << utf8proc_category(ch)) & UTF8PROC_ALPHA_BITS) != 0; } /* *---------------------------------------------------------------------- * * Tcl_UniCharIsControl -- * * Test if a character is a Unicode control character. |
︙ | ︙ | |||
2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 | if (UNICODE_OUT_OF_RANGE(ch)) { /* Clear away extension bits, if any */ ch &= 0x1FFFFF; return ((ch == 0xE0001) || ((unsigned)(ch - 0xE0020) <= 0x5F)); } return ((CONTROL_BITS >> GetCategory(ch)) & 1); } /* *---------------------------------------------------------------------- * * Tcl_UniCharIsDigit -- * * Test if a character is a numeric Unicode character. | > > > > > > | 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 | if (UNICODE_OUT_OF_RANGE(ch)) { /* Clear away extension bits, if any */ ch &= 0x1FFFFF; return ((ch == 0xE0001) || ((unsigned)(ch - 0xE0020) <= 0x5F)); } return ((CONTROL_BITS >> GetCategory(ch)) & 1); } int Tcl_Utf8procIsControl( int ch) /* Unicode character to test. */ { return ((1 << utf8proc_category(ch)) & UTF8PROC_CONTROL_BITS) != 0; } /* *---------------------------------------------------------------------- * * Tcl_UniCharIsDigit -- * * Test if a character is a numeric Unicode character. |
︙ | ︙ | |||
2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 | int ch) /* Unicode character to test. */ { if (UNICODE_OUT_OF_RANGE(ch)) { return 0; } return (GetCategory(ch) == DECIMAL_DIGIT_NUMBER); } /* *---------------------------------------------------------------------- * * Tcl_UniCharIsGraph -- * * Test if a character is any Unicode print character except space. | > > > > > > | 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 | int ch) /* Unicode character to test. */ { if (UNICODE_OUT_OF_RANGE(ch)) { return 0; } return (GetCategory(ch) == DECIMAL_DIGIT_NUMBER); } int Tcl_Utf8procIsDigit( int ch) /* Unicode character to test. */ { return (utf8proc_category(ch) == UTF8PROC_CATEGORY_ND); } /* *---------------------------------------------------------------------- * * Tcl_UniCharIsGraph -- * * Test if a character is any Unicode print character except space. |
︙ | ︙ | |||
2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 | int ch) /* Unicode character to test. */ { if (UNICODE_OUT_OF_RANGE(ch)) { return ((unsigned)((ch & 0x1FFFFF) - 0xE0100) <= 0xEF); } return ((GRAPH_BITS >> GetCategory(ch)) & 1); } /* *---------------------------------------------------------------------- * * Tcl_UniCharIsLower -- * * Test if a character is a lowercase Unicode character. | > > > > > > | 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 | int ch) /* Unicode character to test. */ { if (UNICODE_OUT_OF_RANGE(ch)) { return ((unsigned)((ch & 0x1FFFFF) - 0xE0100) <= 0xEF); } return ((GRAPH_BITS >> GetCategory(ch)) & 1); } int Tcl_Utf8procIsGraph( int ch) /* Unicode character to test. */ { return ((1 << utf8proc_category(ch)) & UTF8PROC_GRAPH_BITS) != 0; } /* *---------------------------------------------------------------------- * * Tcl_UniCharIsLower -- * * Test if a character is a lowercase Unicode character. |
︙ | ︙ | |||
2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 | int ch) /* Unicode character to test. */ { if (UNICODE_OUT_OF_RANGE(ch)) { return 0; } return (GetCategory(ch) == LOWERCASE_LETTER); } /* *---------------------------------------------------------------------- * * Tcl_UniCharIsPrint -- * * Test if a character is a Unicode print character. | > > > > > > | 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 | int ch) /* Unicode character to test. */ { if (UNICODE_OUT_OF_RANGE(ch)) { return 0; } return (GetCategory(ch) == LOWERCASE_LETTER); } int Tcl_Utf8procIsLower( int ch) /* Unicode character to test. */ { return (utf8proc_category(ch) == UTF8PROC_CATEGORY_LL); } /* *---------------------------------------------------------------------- * * Tcl_UniCharIsPrint -- * * Test if a character is a Unicode print character. |
︙ | ︙ | |||
2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 | int ch) /* Unicode character to test. */ { if (UNICODE_OUT_OF_RANGE(ch)) { return ((unsigned)((ch & 0x1FFFFF) - 0xE0100) <= 0xEF); } return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1); } /* *---------------------------------------------------------------------- * * Tcl_UniCharIsPunct -- * * Test if a character is a Unicode punctuation character. | > > > > > > | 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 | int ch) /* Unicode character to test. */ { if (UNICODE_OUT_OF_RANGE(ch)) { return ((unsigned)((ch & 0x1FFFFF) - 0xE0100) <= 0xEF); } return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1); } int Tcl_Utf8procIsPrint( int ch) /* Unicode character to test. */ { return ((1 << utf8proc_category(ch)) & (UTF8PROC_SPACE_BITS|UTF8PROC_GRAPH_BITS)) != 0; } /* *---------------------------------------------------------------------- * * Tcl_UniCharIsPunct -- * * Test if a character is a Unicode punctuation character. |
︙ | ︙ | |||
2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 | int ch) /* Unicode character to test. */ { if (UNICODE_OUT_OF_RANGE(ch)) { return 0; } return ((PUNCT_BITS >> GetCategory(ch)) & 1); } /* *---------------------------------------------------------------------- * * Tcl_UniCharIsSpace -- * * Test if a character is a whitespace Unicode character. | > > > > > > | 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 | int ch) /* Unicode character to test. */ { if (UNICODE_OUT_OF_RANGE(ch)) { return 0; } return ((PUNCT_BITS >> GetCategory(ch)) & 1); } int Tcl_Utf8procIsPunct( int ch) /* Unicode character to test. */ { return ((1 << utf8proc_category(ch)) & UTF8PROC_PUNCT_BITS) != 0; } /* *---------------------------------------------------------------------- * * Tcl_UniCharIsSpace -- * * Test if a character is a whitespace Unicode character. |
︙ | ︙ | |||
2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 | } else if (ch == 0x0085 || ch == 0x180E || ch == 0x200B || ch == 0x202F || ch == 0x2060 || ch == 0xFEFF) { return 1; } else { return ((SPACE_BITS >> GetCategory(ch)) & 1); } } /* *---------------------------------------------------------------------- * * Tcl_UniCharIsUpper -- * * Test if a character is a uppercase Unicode character. | > > > > > > > > > > > > > > > > > > > > > > > | 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 | } else if (ch == 0x0085 || ch == 0x180E || ch == 0x200B || ch == 0x202F || ch == 0x2060 || ch == 0xFEFF) { return 1; } else { return ((SPACE_BITS >> GetCategory(ch)) & 1); } } int Tcl_Utf8procIsSpace( int ch) /* Unicode character to test. */ { /* Ignore upper 11 bits. */ ch &= 0x1FFFFF; /* * If the character is within the first 127 characters, just use the * standard C function, otherwise consult the Unicode table. */ if (ch < 0x80) { return TclIsSpaceProcM((char) ch); } else if (UNICODE_OUT_OF_RANGE(ch)) { return 0; } else if (ch == 0x0085 || ch == 0x180E || ch == 0x200B || ch == 0x202F || ch == 0x2060 || ch == 0xFEFF) { return 1; } else { return ((1 << utf8proc_category(ch)) & UTF8PROC_SPACE_BITS) != 0; } } /* *---------------------------------------------------------------------- * * Tcl_UniCharIsUpper -- * * Test if a character is a uppercase Unicode character. |
︙ | ︙ | |||
2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 | int ch) /* Unicode character to test. */ { if (UNICODE_OUT_OF_RANGE(ch)) { return 0; } return (GetCategory(ch) == UPPERCASE_LETTER); } /* *---------------------------------------------------------------------- * * Tcl_UniCharIsWordChar -- * * Test if a character is alphanumeric or a connector punctuation mark. | > > > > > > | 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 | int ch) /* Unicode character to test. */ { if (UNICODE_OUT_OF_RANGE(ch)) { return 0; } return (GetCategory(ch) == UPPERCASE_LETTER); } int Tcl_Utf8procIsUpper( int ch) /* Unicode character to test. */ { return (utf8proc_category(ch) == UTF8PROC_CATEGORY_LU); } /* *---------------------------------------------------------------------- * * Tcl_UniCharIsWordChar -- * * Test if a character is alphanumeric or a connector punctuation mark. |
︙ | ︙ | |||
2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 | int ch) /* Unicode character to test. */ { if (UNICODE_OUT_OF_RANGE(ch)) { return 0; } return ((WORD_BITS >> GetCategory(ch)) & 1); } /* *---------------------------------------------------------------------- * * TclUniCharCaseMatch -- * * See if a particular Unicode string matches a particular pattern. | > > > > > > | 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 | int ch) /* Unicode character to test. */ { if (UNICODE_OUT_OF_RANGE(ch)) { return 0; } return ((WORD_BITS >> GetCategory(ch)) & 1); } int Tcl_Utf8procIsWordChar( int ch) /* Unicode character to test. */ { return ((1 << utf8proc_category(ch)) & UTF8PROC_WORD_BITS) != 0; } /* *---------------------------------------------------------------------- * * TclUniCharCaseMatch -- * * See if a particular Unicode string matches a particular pattern. |
︙ | ︙ |
Changes to tests/unicodeProperties.test.
︙ | ︙ | |||
95 96 97 98 99 100 101 102 103 104 105 106 | } -constraints ucdproperties -result {Upper case mismatches:} test unicode-is-upper-outofrange "unicode is upper out of range" -cleanup { testobj freeallvars } -body { unicode is upper [teststringobj newunicode 1 0x110000] } -constraints teststringobj -result 0 } ::tcltest::cleanupTests namespace delete unicode::test return | > > > > > > > > > > > > > > > > > > > > | 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 | } -constraints ucdproperties -result {Upper case mismatches:} test unicode-is-upper-outofrange "unicode is upper out of range" -cleanup { testobj freeallvars } -body { unicode is upper [teststringobj newunicode 1 0x110000] } -constraints teststringobj -result 0 ### # Compatibility tests between the string and unicode commands. proc testStringUnicodeCompatibility {class} { set mismatches "is $class mismatches:" foreach codePoint [lseq 0 $::tcltests::ucd::maxCodepoint] { set ch [format %c $codePoint] if {[string is $class $ch] != [unicode is $class $ch]} { append mismatches " " U+[format %x $codePoint] break } } return $mismatches } foreach class {alpha alnum control digit graph lower print space upper wordchar} { test string-vs-unicode-is-$class "string is $class vs unicode" -body { testStringUnicodeCompatibility $class } -result "is $class mismatches:" } } ::tcltest::cleanupTests namespace delete unicode::test return |