Index: generic/tclUtil.c ================================================================== --- generic/tclUtil.c +++ generic/tclUtil.c @@ -1916,214 +1916,224 @@ CONST char *str, /* String. */ CONST char *pattern, /* Pattern, which may contain special * characters. */ int nocase) /* 0 for case sensitive, 1 for insensitive */ { - int p, charLen; - CONST char *pstart = pattern; - Tcl_UniChar ch1, ch2; - - while (1) { - p = *pattern; - - /* - * See if we're at the end of both the pattern and the string. If so, - * we succeeded. If we're at the end of the pattern but not at the end - * of the string, we failed. - */ - - if (p == '\0') { - return (*str == '\0'); - } - if ((*str == '\0') && (p != '*')) { - return 0; - } - - /* - * Check for a "*" as the next pattern character. It matches any - * substring. We handle this by calling ourselves recursively for each - * postfix of string, until either we match or we reach the end of the - * string. - */ - - if (p == '*') { - /* - * Skip all successive *'s in the pattern - */ - - while (*(++pattern) == '*') {} - p = *pattern; - if (p == '\0') { - return 1; - } - - /* - * This is a special case optimization for single-byte utf. - */ - - if (UCHAR(*pattern) < 0x80) { - ch2 = (Tcl_UniChar) - (nocase ? tolower(UCHAR(*pattern)) : UCHAR(*pattern)); - } else { - Tcl_UtfToUniChar(pattern, &ch2); - if (nocase) { - ch2 = Tcl_UniCharToLower(ch2); - } - } - - while (1) { - /* - * Optimization for matching - cruise through the string - * quickly if the next char in the pattern isn't a special - * character - */ - - if ((p != '[') && (p != '?') && (p != '\\')) { - if (nocase) { - while (*str) { - charLen = TclUtfToUniChar(str, &ch1); - if (ch2==ch1 || ch2==Tcl_UniCharToLower(ch1)) { - break; - } - str += charLen; - } - } else { - /* - * There's no point in trying to make this code - * shorter, as the number of bytes you want to compare - * each time is non-constant. - */ - - while (*str) { - charLen = TclUtfToUniChar(str, &ch1); - if (ch2 == ch1) { - break; - } - str += charLen; - } - } - } - if (Tcl_StringCaseMatch(str, pattern, nocase)) { - return 1; - } - if (*str == '\0') { - return 0; - } - str += TclUtfToUniChar(str, &ch1); - } - } - - /* - * Check for a "?" as the next pattern character. It matches any - * single character. - */ - - if (p == '?') { - pattern++; - str += TclUtfToUniChar(str, &ch1); - continue; - } - - /* - * Check for a "[" as the next pattern character. It is followed by a - * list of characters that are acceptable, or by a range (two - * characters separated by "-"). - */ - - if (p == '[') { - Tcl_UniChar startChar, endChar; - - pattern++; - if (UCHAR(*str) < 0x80) { - ch1 = (Tcl_UniChar) - (nocase ? tolower(UCHAR(*str)) : UCHAR(*str)); - str++; - } else { - str += Tcl_UtfToUniChar(str, &ch1); - if (nocase) { - ch1 = Tcl_UniCharToLower(ch1); - } - } - while (1) { - if ((*pattern == ']') || (*pattern == '\0')) { - return 0; - } - if (UCHAR(*pattern) < 0x80) { - startChar = (Tcl_UniChar) (nocase - ? tolower(UCHAR(*pattern)) : UCHAR(*pattern)); - pattern++; - } else { - pattern += Tcl_UtfToUniChar(pattern, &startChar); - if (nocase) { - startChar = Tcl_UniCharToLower(startChar); - } - } - if (*pattern == '-') { - pattern++; - if (*pattern == '\0') { - return 0; - } - if (UCHAR(*pattern) < 0x80) { - endChar = (Tcl_UniChar) (nocase - ? tolower(UCHAR(*pattern)) : UCHAR(*pattern)); - pattern++; - } else { - pattern += Tcl_UtfToUniChar(pattern, &endChar); - if (nocase) { - endChar = Tcl_UniCharToLower(endChar); - } - } - if (((startChar <= ch1) && (ch1 <= endChar)) - || ((endChar <= ch1) && (ch1 <= startChar))) { - /* - * Matches ranges of form [a-z] or [z-a]. - */ - - break; - } - } else if (startChar == ch1) { - break; - } - } - while (*pattern != ']') { - if (*pattern == '\0') { - pattern = Tcl_UtfPrev(pattern, pstart); - break; - } - pattern++; - } - pattern++; - continue; - } - - /* - * If the next pattern character is '\', just strip off the '\' so we - * do exact matching on the character that follows. - */ - - if (p == '\\') { - pattern++; - if (*pattern == '\0') { - return 0; - } - } - - /* - * There's no special character. Just make sure that the next bytes of - * each string match. - */ - - str += TclUtfToUniChar(str, &ch1); - pattern += TclUtfToUniChar(pattern, &ch2); - if (nocase) { - if (Tcl_UniCharToLower(ch1) != Tcl_UniCharToLower(ch2)) { - return 0; - } - } else if (ch1 != ch2) { - return 0; - } - } + int charLen; + CONST char *pstart = pattern; + Tcl_UniChar pch, sch; + Tcl_UniChar startChar, endChar; + + CONST char *pnext = 0, *snext; + + while (1) { +matchLoop: + switch (*pattern) { + case '\0': + if (*str == '\0') { + return 1; + } else { + goto matchFail; + } + + case '?': + if (*str == '\0') { + goto matchFail; + } else { + str += TclUtfToUniChar(str, &sch); + ++pattern; + goto matchLoop; + } + + case '*': + /* + * Skip past the '*', and any following '*'s + */ + while (*(++pattern) == '*') {} + + /* + * If the pattern ends after '*', we have a match + */ + if (*pattern == '\0') { + return 1; + } + + /* + * Peek at the next pattern char + */ + if (UCHAR(*pattern) < 0x80) { + pch = (Tcl_UniChar) + (nocase ? tolower(UCHAR(*pattern)) : UCHAR(*pattern)); + } else { + TclUtfToUniChar(pattern, &pch); + if (nocase) { + pch = Tcl_UniCharToLower(pch); + } + } + + /* + * If the next char in pattern is a literal, zoom through str to the next match + */ + switch (*pattern) { + case '[': case '?': case '\\': + break; + default: + while (*str) { + charLen = TclUtfToUniChar(str, &sch); + if ( (pch == sch) + || (nocase && (pch == Tcl_UniCharToLower(sch)))) { + break; + } + str += charLen; + } + } + + /* + * Avoid recursion, using technique described at https://research.swtch.com/glob + * + * `pnext` stores the first pattern char following a (sequence of) '*'. + * `snext` stores the current match position - on resume at matchFail it + * will need to be incremented. + */ + pnext = pattern - 1; + snext = str; + + goto matchLoop; + + case '[': + ++pattern; + + /* + * Take the next char from input string to match + * against '[]' group + */ + if(UCHAR(*str) < 0x80) { + sch = (Tcl_UniChar) + (nocase ? tolower(UCHAR(*str)) : UCHAR(*str)); + ++str; + } else { + str += TclUtfToUniChar(str, &sch); + if (nocase) { + sch = Tcl_UniCharToLower(sch); + } + } + + /* + * '[]' group loop: process single chars and a-z ranges + */ + while (1) { + if (*pattern == ']') { + /* + * End of [] range with no match - fail! + */ + goto matchFail; + } + if (*pattern == '\0') { + return 0; /* Illegal pattern! (incomplete [..) */ + } + if (UCHAR(*pattern) < 0x80) { + pch = (Tcl_UniChar) + (nocase ? tolower(UCHAR(*pattern)) : UCHAR(*pattern)); + ++pattern; + } else { + pattern += TclUtfToUniChar(pattern, &pch); + if (nocase) { + pch = Tcl_UniCharToLower(pch); + } + } + if (*pattern != '-') { + /* + * Try to match a single char in '[]' group + */ + if (pch == sch) break; + continue; + } else { + /* + * Start of a 'a-z' style range. + */ + ++pattern; + if (*pattern == '\0') { + return 0; /* Illegal pattern! (incomplete [..) */ + } + // if (*pattern == ']') { + // return 0; /* Illegal pattern! (incomplete [a-]) */ + // } + if (UCHAR(*pattern) < 0x80) { + pch = (Tcl_UniChar) + (nocase ? tolower(UCHAR(*pattern)) : UCHAR(*pattern)); + ++pattern; + } else { + pattern += TclUtfToUniChar(pattern, &pch); + if (nocase) { + pch = Tcl_UniCharToLower(pch); + } + } + endChar = pch; + /* + * We have a range ('a-z' inside '[]') with + * startChar, endChar as the endpoints. + */ + if (((startChar <= sch) && (sch <= endChar)) + || ((endChar <= sch) && (sch <= startChar))) { + break; + } + /* + * No match with this character or range. Carry + * on with the next part of '[]' + */ + continue; + } + } + /* + * If we get here, a [] group has successfully matched. *pattern + * is still inside the brackets, so advance past next ']' + */ + while (*pattern != ']') { + if (*pattern == '\0') { + return 0; /* Illegal pattern! (incomplete [..) */ + } + pattern++; + } + pattern++; + goto matchLoop; + + case '\\': + ++pattern; + if (*pattern == '\0') { + return 0; /* Illegal pattern! (lone \ at end) */ + } + goto matchLiteral; + + default: +matchLiteral: + /* + * Try to match a literal character in pattern + */ + str += TclUtfToUniChar(str, &sch); + pattern += TclUtfToUniChar(pattern, &pch); + if (nocase) { + if (Tcl_UniCharToLower(sch) != Tcl_UniCharToLower(pch)) { + goto matchFail; + } + } else if (sch != pch) { + goto matchFail; + } + goto matchLoop; + } + } +matchFail: + if (pnext != 0) { + /* + * Backtrack to position following last '*' in `pattern`. + * `snext` points to where we last tried to match, so advance + * it first. + */ + pattern = pnext; + str = snext + TclUtfToUniChar(snext, &sch); + if (*str) goto matchLoop; + } + return 0; } /* *---------------------------------------------------------------------- *