Attachment "quantified-backrefs.patch" to
ticket [0e0e150e49]
added by
tgl
2015-09-19 15:55:52.
diff -pcdr tcl8.6.4.orig/generic/regcomp.c tcl8.6.4/generic/regcomp.c
*** tcl8.6.4.orig/generic/regcomp.c Thu Feb 26 11:57:28 2015
--- tcl8.6.4/generic/regcomp.c Fri Sep 18 12:16:30 2015
*************** parseqatom(
*** 1102,1112 ****
/*
* Prepare a general-purpose state skeleton.
*
! * ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp]
! * / /
! * [lp] ----> [s2] ----bypass---------------------
*
! * where bypass is an empty, and prefix is some repetitions of atom
*/
s = newstate(v->nfa); /* first, new endpoints for the atom */
--- 1102,1118 ----
/*
* Prepare a general-purpose state skeleton.
*
! * In the no-backrefs case, we want this:
*
! * [lp] ---> [s] ---prefix---> [begin] ---atom---> [end] ---rest---> [rp]
! *
! * where prefix is some repetitions of atom. In the general case we need
! *
! * [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp]
! *
! * where the iterator wraps around [begin] ---atom---> [end]
! *
! * We make the s state here for both cases; s2 is made below if needed
*/
s = newstate(v->nfa); /* first, new endpoints for the atom */
*************** parseqatom(
*** 1117,1127 ****
NOERR();
atom->begin = s;
atom->end = s2;
! s = newstate(v->nfa); /* and spots for prefix and bypass */
! s2 = newstate(v->nfa);
NOERR();
EMPTYARC(lp, s);
- EMPTYARC(lp, s2);
NOERR();
/*
--- 1123,1131 ----
NOERR();
atom->begin = s;
atom->end = s2;
! s = newstate(v->nfa); /* set up starting state */
NOERR();
EMPTYARC(lp, s);
NOERR();
/*
*************** parseqatom(
*** 1166,1192 ****
}
/*
! * It's quantifier time; first, turn x{0,...} into x{1,...}|empty
! */
!
! if (m == 0) {
! EMPTYARC(s2, atom->end);/* the bypass */
! assert(PREF(qprefer) != 0);
! f = COMBINE(qprefer, atom->flags);
! t = subre(v, '|', f, lp, atom->end);
! NOERR();
! t->left = atom;
! t->right = subre(v, '|', PREF(f), s2, atom->end);
! NOERR();
! t->right->left = subre(v, '=', 0, s2, atom->end);
! NOERR();
! *atomp = t;
! atomp = &t->left;
! m = 1;
! }
!
! /*
! * Deal with the rest of the quantifier.
*/
if (atomtype == BACKREF) {
--- 1170,1177 ----
}
/*
! * It's quantifier time. If the atom is just a backref, we'll let it deal
! * with quantifiers internally.
*/
if (atomtype == BACKREF) {
*************** parseqatom(
*** 1204,1219 ****
atom->min = (short) m;
atom->max = (short) n;
atom->flags |= COMBINE(qprefer, atom->flags);
} else if (m == 1 && n == 1) {
/*
* No/vacuous quantifier: done.
*/
EMPTYARC(s, atom->begin); /* empty prefix */
! } else {
/*
! * Turn x{m,n} into x{m-1,n-1}x, with capturing parens in only second
! * x
*/
dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin);
--- 1189,1212 ----
atom->min = (short) m;
atom->max = (short) n;
atom->flags |= COMBINE(qprefer, atom->flags);
+ /* rest of branch can be strung starting from atom->end */
+ s2 = atom->end;
} else if (m == 1 && n == 1) {
/*
* No/vacuous quantifier: done.
*/
EMPTYARC(s, atom->begin); /* empty prefix */
! /* rest of branch can be strung starting from atom->end */
! s2 = atom->end;
! } else if (m > 0 && !(atom->flags & BACKR)) {
/*
! * If there's no backrefs involved, we can turn x{m,n} into
! * x{m-1,n-1}x, with capturing parens in only the second x. This
! * is valid because we only care about capturing matches from the
! * final iteration of the quantifier. It's a win because we can
! * implement the backref-free left side as a plain DFA node, since
! * we don't really care where its submatches are.
*/
dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin);
*************** parseqatom(
*** 1226,1231 ****
--- 1219,1242 ----
NOERR();
t->right = atom;
*atomp = t;
+ /* rest of branch can be strung starting from atom->end */
+ s2 = atom->end;
+ } else {
+ /* general case: need an iteration node */
+ s2 = newstate(v->nfa);
+ NOERR();
+ moveouts(v->nfa, atom->end, s2);
+ NOERR();
+ dupnfa(v->nfa, atom->begin, atom->end, s, s2);
+ repeat(v, s, s2, m, n);
+ f = COMBINE(qprefer, atom->flags);
+ t = subre(v, '*', f, s, s2);
+ NOERR();
+ t->min = (short) m;
+ t->max = (short) n;
+ t->left = atom;
+ *atomp = t;
+ /* rest of branch is to be strung from iteration's end state */
}
/*
*************** parseqatom(
*** 1234,1243 ****
t = top->right;
if (!(SEE('|') || SEE(stopper) || SEE(EOS))) {
! t->right = parsebranch(v, stopper, type, atom->end, rp, 1);
} else {
! EMPTYARC(atom->end, rp);
! t->right = subre(v, '=', 0, atom->end, rp);
}
NOERR();
assert(SEE('|') || SEE(stopper) || SEE(EOS));
--- 1245,1254 ----
t = top->right;
if (!(SEE('|') || SEE(stopper) || SEE(EOS))) {
! t->right = parsebranch(v, stopper, type, s2, rp, 1);
} else {
! EMPTYARC(s2, rp);
! t->right = subre(v, '=', 0, s2, rp);
}
NOERR();
assert(SEE('|') || SEE(stopper) || SEE(EOS));
*************** scannum(
*** 1304,1309 ****
--- 1315,1322 ----
/*
- repeat - replicate subNFA for quantifiers
+ * The sub-NFA strung from lp to rp is modified to represent m to n
+ * repetitions of its initial contents.
* The duplication sequences used here are chosen carefully so that any
* pointers starting out pointing into the subexpression end up pointing into
* the last occurrence. (Note that it may not be strung between the same left
*************** subre(
*** 1713,1723 ****
v->treechain = ret;
}
! assert(strchr("|.b(=", op) != NULL);
ret->op = op;
ret->flags = flags;
! ret->retry = 0;
ret->subno = 0;
ret->min = ret->max = 1;
ret->left = NULL;
--- 1726,1736 ----
v->treechain = ret;
}
! assert(strchr("=b|.*(", op) != NULL);
ret->op = op;
ret->flags = flags;
! ret->id = 0; /* will be assigned later */
ret->subno = 0;
ret->min = ret->max = 1;
ret->left = NULL;
*************** optst(
*** 1798,1804 ****
}
/*
! - numst - number tree nodes (assigning retry indexes)
^ static int numst(struct subre *, int);
*/
static int /* next number */
--- 1811,1817 ----
}
/*
! - numst - number tree nodes (assigning "id" indexes)
^ static int numst(struct subre *, int);
*/
static int /* next number */
*************** numst(
*** 1811,1817 ****
assert(t != NULL);
i = start;
! t->retry = (short) i++;
if (t->left != NULL) {
i = numst(t->left, i);
}
--- 1824,1830 ----
assert(t != NULL);
i = start;
! t->id = (short) i++;
if (t->left != NULL) {
i = numst(t->left, i);
}
*************** stid(
*** 2146,2159 ****
size_t bufsize)
{
/*
! * Big enough for hex int or decimal t->retry?
*/
! if (bufsize < sizeof(void*)*2 + 3 || bufsize < sizeof(t->retry)*3 + 1) {
return "unable";
}
! if (t->retry != 0) {
! sprintf(buf, "%d", t->retry);
} else {
sprintf(buf, "%p", t);
}
--- 2159,2172 ----
size_t bufsize)
{
/*
! * Big enough for hex int or decimal t->id?
*/
! if (bufsize < sizeof(void*)*2 + 3 || bufsize < sizeof(t->id)*3 + 1) {
return "unable";
}
! if (t->id != 0) {
! sprintf(buf, "%d", t->id);
} else {
sprintf(buf, "%p", t);
}
diff -pcdr tcl8.6.4.orig/generic/regexec.c tcl8.6.4/generic/regexec.c
*** tcl8.6.4.orig/generic/regexec.c Thu Feb 26 11:57:28 2015
--- tcl8.6.4/generic/regexec.c Fri Sep 18 12:24:14 2015
*************** struct vars {
*** 107,113 ****
chr *start; /* start of string */
chr *stop; /* just past end of string */
int err; /* error code if any (0 none) */
- regoff_t *mem; /* memory vector for backtracking */
struct smalldfa dfa1;
struct smalldfa dfa2;
};
--- 107,112 ----
*************** int exec(regex_t *, const chr *, size_t,
*** 129,146 ****
static int simpleFind(struct vars *const, struct cnfa *const, struct colormap *const);
static int complicatedFind(struct vars *const, struct cnfa *const, struct colormap *const);
static int complicatedFindLoop(struct vars *const, struct cnfa *const, struct colormap *const, struct dfa *const, struct dfa *const, chr **const);
! static void zapSubexpressions(regmatch_t *const, const size_t);
! static void zapSubtree(struct vars *const, struct subre *const);
static void subset(struct vars *const, struct subre *const, chr *const, chr *const);
! static int dissect(struct vars *const, struct subre *, chr *const, chr *const);
! static int concatenationDissect(struct vars *const, struct subre *const, chr *const, chr *const);
! static int alternationDissect(struct vars *const, struct subre *, chr *const, chr *const);
! static inline int complicatedDissect(struct vars *const, struct subre *const, chr *const, chr *const);
! static int complicatedCapturingDissect(struct vars *const, struct subre *const, chr *const, chr *const);
! static int complicatedConcatenationDissect(struct vars *const, struct subre *const, chr *const, chr *const);
! static int complicatedReversedDissect(struct vars *const, struct subre *const, chr *const, chr *const);
! static int complicatedBackrefDissect(struct vars *const, struct subre *const, chr *const, chr *const);
! static int complicatedAlternationDissect(struct vars *const, struct subre *, chr *const, chr *const);
/* === rege_dfa.c === */
static chr *longest(struct vars *const, struct dfa *const, chr *const, chr *const, int *const);
static chr *shortest(struct vars *const, struct dfa *const, chr *const, chr *const, chr *const, chr **const, int *const);
--- 128,143 ----
static int simpleFind(struct vars *const, struct cnfa *const, struct colormap *const);
static int complicatedFind(struct vars *const, struct cnfa *const, struct colormap *const);
static int complicatedFindLoop(struct vars *const, struct cnfa *const, struct colormap *const, struct dfa *const, struct dfa *const, chr **const);
! static void zapallsubs(regmatch_t *const, const size_t);
! static void zaptreesubs(struct vars *const, struct subre *const);
static void subset(struct vars *const, struct subre *const, chr *const, chr *const);
! static int cdissect(struct vars *, struct subre *, chr *, chr *);
! static int ccondissect(struct vars *, struct subre *, chr *, chr *);
! static int crevcondissect(struct vars *, struct subre *, chr *, chr *);
! static int cbrdissect(struct vars *, struct subre *, chr *, chr *);
! static int caltdissect(struct vars *, struct subre *, chr *, chr *);
! static int citerdissect(struct vars *, struct subre *, chr *, chr *);
! static int creviterdissect(struct vars *, struct subre *, chr *, chr *);
/* === rege_dfa.c === */
static chr *longest(struct vars *const, struct dfa *const, chr *const, chr *const, int *const);
static chr *shortest(struct vars *const, struct dfa *const, chr *const, chr *const, chr *const, chr **const, int *const);
*************** exec(
*** 176,183 ****
size_t n;
#define LOCALMAT 20
regmatch_t mat[LOCALMAT];
- #define LOCALMEM 40
- regoff_t mem[LOCALMEM];
/*
* Sanity checks.
--- 173,178 ----
*************** exec(
*** 235,262 ****
v->start = (chr *)string;
v->stop = (chr *)string + len;
v->err = 0;
- if (backref) {
- /*
- * Need retry memory.
- */
-
- assert(v->g->ntree >= 0);
- n = (size_t)v->g->ntree;
- if (n <= LOCALMEM) {
- v->mem = mem;
- } else {
- v->mem = (regoff_t *) MALLOC(n*sizeof(regoff_t));
- }
- if (v->mem == NULL) {
- if (v->pmatch != pmatch && v->pmatch != mat) {
- FREE(v->pmatch);
- }
- FreeVars(v);
- return REG_ESPACE;
- }
- } else {
- v->mem = NULL;
- }
/*
* Do it.
--- 230,235 ----
*************** exec(
*** 274,280 ****
*/
if (st == REG_OKAY && v->pmatch != pmatch && nmatch > 0) {
! zapSubexpressions(pmatch, nmatch);
n = (nmatch < v->nmatch) ? nmatch : v->nmatch;
memcpy(VS(pmatch), VS(v->pmatch), n*sizeof(regmatch_t));
}
--- 247,253 ----
*/
if (st == REG_OKAY && v->pmatch != pmatch && nmatch > 0) {
! zapallsubs(pmatch, nmatch);
n = (nmatch < v->nmatch) ? nmatch : v->nmatch;
memcpy(VS(pmatch), VS(v->pmatch), n*sizeof(regmatch_t));
}
*************** exec(
*** 286,294 ****
if (v->pmatch != pmatch && v->pmatch != mat) {
FREE(v->pmatch);
}
- if (v->mem != NULL && v->mem != mem) {
- FREE(v->mem);
- }
FreeVars(v);
return st;
}
--- 259,264 ----
*************** simpleFind(
*** 388,398 ****
}
/*
! * Submatches.
*/
! zapSubexpressions(v->pmatch, v->nmatch);
! return dissect(v, v->g->tree, begin, end);
}
/*
--- 358,368 ----
}
/*
! * Find submatches.
*/
! zapallsubs(v->pmatch, v->nmatch);
! return cdissect(v, v->g->tree, begin, end);
}
/*
*************** complicatedFindLoop(
*** 488,496 ****
}
MDEBUG(("tentative end %ld\n", LOFF(end)));
! zapSubexpressions(v->pmatch, v->nmatch);
! zapSubtree(v, v->g->tree);
! er = complicatedDissect(v, v->g->tree, begin, end);
if (er == REG_OKAY) {
if (v->nmatch > 0) {
v->pmatch[0].rm_so = OFF(begin);
--- 458,465 ----
}
MDEBUG(("tentative end %ld\n", LOFF(end)));
! zapallsubs(v->pmatch, v->nmatch);
! er = cdissect(v, v->g->tree, begin, end);
if (er == REG_OKAY) {
if (v->nmatch > 0) {
v->pmatch[0].rm_so = OFF(begin);
*************** complicatedFindLoop(
*** 525,535 ****
}
/*
! - zapSubexpressions - initialize the subexpression matches to "no match"
! ^ static void zapSubexpressions(regmatch_t *, size_t);
*/
static void
! zapSubexpressions(
regmatch_t *const p,
const size_t n)
{
--- 494,504 ----
}
/*
! - zapallsubs - initialize all subexpression matches to "no match"
! ^ static void zapallsubs(regmatch_t *, size_t);
*/
static void
! zapallsubs(
regmatch_t *const p,
const size_t n)
{
*************** zapSubexpressions(
*** 542,577 ****
}
/*
! - zapSubtree - initialize the retry memory of a subtree to zeros
! ^ static void zapSubtree(struct vars *, struct subre *);
*/
static void
! zapSubtree(
struct vars *const v,
struct subre *const t)
{
- if (t == NULL) {
- return;
- }
-
- assert(v->mem != NULL);
- v->mem[t->retry] = 0;
if (t->op == '(') {
! assert(t->subno > 0);
! v->pmatch[t->subno].rm_so = -1;
! v->pmatch[t->subno].rm_eo = -1;
}
if (t->left != NULL) {
! zapSubtree(v, t->left);
}
if (t->right != NULL) {
! zapSubtree(v, t->right);
}
}
/*
! - subset - set any subexpression relevant to a successful subre
^ static void subset(struct vars *, struct subre *, chr *, chr *);
*/
static void
--- 511,543 ----
}
/*
! - zaptreesubs - initialize subexpressions within subtree to "no match"
! ^ static void zaptreesubs(struct vars *, struct subre *);
*/
static void
! zaptreesubs(
struct vars *const v,
struct subre *const t)
{
if (t->op == '(') {
! int n = t->subno;
! assert(n > 0);
! if ((size_t) n < v->nmatch) {
! v->pmatch[n].rm_so = -1;
! v->pmatch[n].rm_eo = -1;
! }
}
if (t->left != NULL) {
! zaptreesubs(v, t->left);
}
if (t->right != NULL) {
! zaptreesubs(v, t->right);
}
}
/*
! - subset - set subexpression match data for a successful subre
^ static void subset(struct vars *, struct subre *, chr *, chr *);
*/
static void
*************** subset(
*** 594,836 ****
}
/*
! - dissect - determine subexpression matches (uncomplicated case)
! ^ static int dissect(struct vars *, struct subre *, chr *, chr *);
! */
! static int /* regexec return code */
! dissect(
! struct vars *const v,
! struct subre *t,
! chr *const begin, /* beginning of relevant substring */
! chr *const end) /* end of same */
! {
! #ifndef COMPILER_DOES_TAILCALL_OPTIMIZATION
! restart:
! #endif
! assert(t != NULL);
! MDEBUG(("dissect %ld-%ld\n", LOFF(begin), LOFF(end)));
!
! switch (t->op) {
! case '=': /* terminal node */
! assert(t->left == NULL && t->right == NULL);
! return REG_OKAY; /* no action, parent did the work */
! case '|': /* alternation */
! assert(t->left != NULL);
! return alternationDissect(v, t, begin, end);
! case 'b': /* back ref -- shouldn't be calling us! */
! return REG_ASSERT;
! case '.': /* concatenation */
! assert(t->left != NULL && t->right != NULL);
! return concatenationDissect(v, t, begin, end);
! case '(': /* capturing */
! assert(t->left != NULL && t->right == NULL);
! assert(t->subno > 0);
! subset(v, t, begin, end);
! #ifndef COMPILER_DOES_TAILCALL_OPTIMIZATION
! t = t->left;
! goto restart;
! #else
! return dissect(v, t->left, begin, end);
! #endif
! default:
! return REG_ASSERT;
! }
! }
!
! /*
! - concatenationDissect - determine concatenation subexpression matches
! - (uncomplicated)
! ^ static int concatenationDissect(struct vars *, struct subre *, chr *, chr *);
*/
static int /* regexec return code */
! concatenationDissect(
struct vars *const v,
struct subre *const t,
chr *const begin, /* beginning of relevant substring */
chr *const end) /* end of same */
{
! struct dfa *d, *d2;
! chr *mid;
! int i;
! int shorter = (t->left->flags&SHORTER) ? 1 : 0;
! chr *stop = (shorter) ? end : begin;
!
! assert(t->op == '.');
! assert(t->left != NULL && t->left->cnfa.nstates > 0);
! assert(t->right != NULL && t->right->cnfa.nstates > 0);
!
! d = newDFA(v, &t->left->cnfa, &v->g->cmap, &v->dfa1);
! NOERR();
! d2 = newDFA(v, &t->right->cnfa, &v->g->cmap, &v->dfa2);
! if (ISERR()) {
! assert(d2 == NULL);
! freeDFA(d);
! return v->err;
! }
!
! /*
! * Pick a tentative midpoint.
! */
!
! if (shorter) {
! mid = shortest(v, d, begin, begin, end, NULL, NULL);
! } else {
! mid = longest(v, d, begin, end, NULL);
! }
! if (mid == NULL) {
! freeDFA(d);
! freeDFA(d2);
! return REG_ASSERT;
! }
! MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
!
! /*
! * Iterate until satisfaction or failure.
! */
!
! while (longest(v, d2, mid, end, NULL) != end) {
! /*
! * That midpoint didn't work, find a new one.
! */
!
! if (mid == stop) {
! /*
! * All possibilities exhausted!
! */
!
! MDEBUG(("no midpoint!\n"));
! freeDFA(d);
! freeDFA(d2);
! return REG_ASSERT;
! }
! if (shorter) {
! mid = shortest(v, d, begin, mid+1, end, NULL, NULL);
! } else {
! mid = longest(v, d, begin, mid-1, NULL);
! }
! if (mid == NULL) {
! /*
! * Failed to find a new one!
! */
!
! MDEBUG(("failed midpoint!\n"));
! freeDFA(d);
! freeDFA(d2);
! return REG_ASSERT;
! }
! MDEBUG(("new midpoint %ld\n", LOFF(mid)));
! }
!
! /*
! * Satisfaction.
! */
!
! MDEBUG(("successful\n"));
! freeDFA(d);
! freeDFA(d2);
! i = dissect(v, t->left, begin, mid);
! if (i != REG_OKAY) {
! return i;
! }
! return dissect(v, t->right, mid, end);
! }
!
! /*
! - alternationDissect - determine alternative subexpression matches (uncomplicated)
! ^ static int alternationDissect(struct vars *, struct subre *, chr *, chr *);
! */
! static int /* regexec return code */
! alternationDissect(
! struct vars *const v,
! struct subre *t,
! chr *const begin, /* beginning of relevant substring */
! chr *const end) /* end of same */
! {
! int i;
assert(t != NULL);
! assert(t->op == '|');
!
! for (i = 0; t != NULL; t = t->right, i++) {
! struct dfa *d;
!
! MDEBUG(("trying %dth\n", i));
! assert(t->left != NULL && t->left->cnfa.nstates > 0);
! d = newDFA(v, &t->left->cnfa, &v->g->cmap, &v->dfa1);
! if (ISERR()) {
! return v->err;
! }
! if (longest(v, d, begin, end, NULL) == end) {
! MDEBUG(("success\n"));
! freeDFA(d);
! return dissect(v, t->left, begin, end);
! }
! freeDFA(d);
! }
! return REG_ASSERT; /* none of them matched?!? */
! }
!
! /*
! - complicatedDissect - determine subexpression matches (with complications)
! * The retry memory stores the offset of the trial midpoint from begin, plus 1
! * so that 0 uniquely means "clean slate".
! ^ static int complicatedDissect(struct vars *, struct subre *, chr *, chr *);
! */
! static inline int /* regexec return code */
! complicatedDissect(
! struct vars *const v,
! struct subre *const t,
! chr *const begin, /* beginning of relevant substring */
! chr *const end) /* end of same */
! {
! assert(t != NULL);
! MDEBUG(("complicatedDissect %ld-%ld %c\n", LOFF(begin), LOFF(end), t->op));
switch (t->op) {
case '=': /* terminal node */
assert(t->left == NULL && t->right == NULL);
! return REG_OKAY; /* no action, parent did the work */
! case '|': /* alternation */
! assert(t->left != NULL);
! return complicatedAlternationDissect(v, t, begin, end);
! case 'b': /* back ref -- shouldn't be calling us! */
assert(t->left == NULL && t->right == NULL);
! return complicatedBackrefDissect(v, t, begin, end);
case '.': /* concatenation */
assert(t->left != NULL && t->right != NULL);
! return complicatedConcatenationDissect(v, t, begin, end);
case '(': /* capturing */
assert(t->left != NULL && t->right == NULL);
assert(t->subno > 0);
! return complicatedCapturingDissect(v, t, begin, end);
default:
! return REG_ASSERT;
}
- }
! static int /* regexec return code */
! complicatedCapturingDissect(
! struct vars *const v,
! struct subre *const t,
! chr *const begin, /* beginning of relevant substring */
! chr *const end) /* end of same */
! {
! int er = complicatedDissect(v, t->left, begin, end);
- if (er == REG_OKAY) {
- subset(v, t, begin, end);
- }
return er;
}
/*
! - complicatedConcatenationDissect - concatenation subexpression matches (with complications)
! * The retry memory stores the offset of the trial midpoint from begin, plus 1
! * so that 0 uniquely means "clean slate".
! ^ static int complicatedConcatenationDissect(struct vars *, struct subre *, chr *, chr *);
*/
static int /* regexec return code */
! complicatedConcatenationDissect(
struct vars *const v,
struct subre *const t,
chr *const begin, /* beginning of relevant substring */
--- 560,646 ----
}
/*
! - cdissect - check backrefs and determine subexpression matches
! * cdissect recursively processes a subre tree to check matching of backrefs
! * and/or identify submatch boundaries for capture nodes. The proposed match
! * runs from "begin" to "end" (not including "end"), and we are basically
! * "dissecting" it to see where the submatches are.
! * Before calling any level of cdissect, the caller must have run the node's
! * DFA and found that the proposed substring satisfies the DFA. (We make
! * the caller do that because in concatenation and iteration nodes, it's
! * much faster to check all the substrings against the child DFAs before we
! * recurse.) Also, caller must have cleared subexpression match data via
! * zaptreesubs (or zapallsubs at the top level).
! ^ static int cdissect(struct vars *, struct subre *, chr *, chr *);
*/
static int /* regexec return code */
! cdissect(
struct vars *const v,
struct subre *const t,
chr *const begin, /* beginning of relevant substring */
chr *const end) /* end of same */
{
! int er;
assert(t != NULL);
! MDEBUG(("cdissect %ld-%ld %c\n", LOFF(begin), LOFF(end), t->op));
switch (t->op) {
case '=': /* terminal node */
assert(t->left == NULL && t->right == NULL);
! er = REG_OKAY; /* no action, parent did the work */
! break;
! case 'b': /* back reference */
assert(t->left == NULL && t->right == NULL);
! er = cbrdissect(v, t, begin, end);
! break;
case '.': /* concatenation */
assert(t->left != NULL && t->right != NULL);
! if (t->left->flags & SHORTER) /* reverse scan */
! er = crevcondissect(v, t, begin, end);
! else
! er = ccondissect(v, t, begin, end);
! break;
! case '|': /* alternation */
! assert(t->left != NULL);
! er = caltdissect(v, t, begin, end);
! break;
! case '*': /* iteration */
! assert(t->left != NULL);
! if (t->left->flags & SHORTER) /* reverse scan */
! er = creviterdissect(v, t, begin, end);
! else
! er = citerdissect(v, t, begin, end);
! break;
case '(': /* capturing */
assert(t->left != NULL && t->right == NULL);
assert(t->subno > 0);
! er = cdissect(v, t->left, begin, end);
! if (er == REG_OKAY) {
! subset(v, t, begin, end);
! }
! break;
default:
! er = REG_ASSERT;
! break;
}
! /*
! * We should never have a match failure unless backrefs lurk below;
! * otherwise, either caller failed to check the DFA, or there's some
! * inconsistency between the DFA and the node's innards.
! */
! assert(er != REG_NOMATCH || (t->flags & BACKR));
return er;
}
/*
! - ccondissect - concatenation subexpression matches (with complications)
! ^ static int ccondissect(struct vars *, struct subre *, chr *, chr *);
*/
static int /* regexec return code */
! ccondissect(
struct vars *const v,
struct subre *const t,
chr *const begin, /* beginning of relevant substring */
*************** complicatedConcatenationDissect(
*** 842,851 ****
assert(t->op == '.');
assert(t->left != NULL && t->left->cnfa.nstates > 0);
assert(t->right != NULL && t->right->cnfa.nstates > 0);
!
! if (t->left->flags&SHORTER) { /* reverse scan */
! return complicatedReversedDissect(v, t, begin, end);
! }
d = newDFA(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
if (ISERR()) {
--- 652,658 ----
assert(t->op == '.');
assert(t->left != NULL && t->left->cnfa.nstates > 0);
assert(t->right != NULL && t->right->cnfa.nstates > 0);
! assert(!(t->left->flags & SHORTER));
d = newDFA(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
if (ISERR()) {
*************** complicatedConcatenationDissect(
*** 856,880 ****
freeDFA(d);
return v->err;
}
! MDEBUG(("cConcat %d\n", t->retry));
/*
* Pick a tentative midpoint.
*/
!
! if (v->mem[t->retry] == 0) {
! mid = longest(v, d, begin, end, NULL);
! if (mid == NULL) {
! freeDFA(d);
! freeDFA(d2);
! return REG_NOMATCH;
! }
! MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
! v->mem[t->retry] = (mid - begin) + 1;
! } else {
! mid = begin + (v->mem[t->retry] - 1);
! MDEBUG(("working midpoint %ld\n", LOFF(mid)));
}
/*
* Iterate until satisfaction or failure.
--- 663,680 ----
freeDFA(d);
return v->err;
}
! MDEBUG(("cConcat %d\n", t->id));
/*
* Pick a tentative midpoint.
*/
! mid = longest(v, d, begin, end, (int *) NULL);
! if (mid == NULL) {
! freeDFA(d);
! freeDFA(d2);
! return REG_NOMATCH;
}
+ MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
/*
* Iterate until satisfaction or failure.
*************** complicatedConcatenationDissect(
*** 886,895 ****
*/
if (longest(v, d2, mid, end, NULL) == end) {
! int er = complicatedDissect(v, t->left, begin, mid);
if (er == REG_OKAY) {
! er = complicatedDissect(v, t->right, mid, end);
if (er == REG_OKAY) {
/*
* Satisfaction.
--- 686,695 ----
*/
if (longest(v, d2, mid, end, NULL) == end) {
! int er = cdissect(v, t->left, begin, mid);
if (er == REG_OKAY) {
! er = cdissect(v, t->right, mid, end);
if (er == REG_OKAY) {
/*
* Satisfaction.
*************** complicatedConcatenationDissect(
*** 901,907 ****
return REG_OKAY;
}
}
! if ((er != REG_OKAY) && (er != REG_NOMATCH)) {
freeDFA(d);
freeDFA(d2);
return er;
--- 701,707 ----
return REG_OKAY;
}
}
! if (er != REG_NOMATCH) {
freeDFA(d);
freeDFA(d2);
return er;
*************** complicatedConcatenationDissect(
*** 917,923 ****
* All possibilities exhausted.
*/
! MDEBUG(("%d no midpoint\n", t->retry));
freeDFA(d);
freeDFA(d2);
return REG_NOMATCH;
--- 717,723 ----
* All possibilities exhausted.
*/
! MDEBUG(("%d no midpoint\n", t->id));
freeDFA(d);
freeDFA(d2);
return REG_NOMATCH;
*************** complicatedConcatenationDissect(
*** 928,954 ****
* Failed to find a new one.
*/
! MDEBUG(("%d failed midpoint\n", t->retry));
freeDFA(d);
freeDFA(d2);
return REG_NOMATCH;
}
! MDEBUG(("%d: new midpoint %ld\n", t->retry, LOFF(mid)));
! v->mem[t->retry] = (mid - begin) + 1;
! zapSubtree(v, t->left);
! zapSubtree(v, t->right);
}
}
/*
! - complicatedReversedDissect - determine backref shortest-first subexpression
! - matches
! * The retry memory stores the offset of the trial midpoint from begin, plus 1
! * so that 0 uniquely means "clean slate".
! ^ static int complicatedReversedDissect(struct vars *, struct subre *, chr *, chr *);
*/
static int /* regexec return code */
! complicatedReversedDissect(
struct vars *const v,
struct subre *const t,
chr *const begin, /* beginning of relevant substring */
--- 728,750 ----
* Failed to find a new one.
*/
! MDEBUG(("%d failed midpoint\n", t->id));
freeDFA(d);
freeDFA(d2);
return REG_NOMATCH;
}
! MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
! zaptreesubs(v, t->left);
! zaptreesubs(v, t->right);
}
}
/*
! - crevcondissect - dissect match for concatenation node, shortest-first
! ^ static int crevcondissect(struct vars *, struct subre *, chr *, chr *);
*/
static int /* regexec return code */
! crevcondissect(
struct vars *const v,
struct subre *const t,
chr *const begin, /* beginning of relevant substring */
*************** complicatedReversedDissect(
*** 962,971 ****
assert(t->right != NULL && t->right->cnfa.nstates > 0);
assert(t->left->flags&SHORTER);
- /*
- * Concatenation -- need to split the substring between parts.
- */
-
d = newDFA(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
if (ISERR()) {
return v->err;
--- 758,763 ----
*************** complicatedReversedDissect(
*** 975,999 ****
freeDFA(d);
return v->err;
}
! MDEBUG(("cRev %d\n", t->retry));
/*
* Pick a tentative midpoint.
*/
! if (v->mem[t->retry] == 0) {
! mid = shortest(v, d, begin, begin, end, NULL, NULL);
! if (mid == NULL) {
! freeDFA(d);
! freeDFA(d2);
! return REG_NOMATCH;
! }
! MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
! v->mem[t->retry] = (mid - begin) + 1;
! } else {
! mid = begin + (v->mem[t->retry] - 1);
! MDEBUG(("working midpoint %ld\n", LOFF(mid)));
}
/*
* Iterate until satisfaction or failure.
--- 767,785 ----
freeDFA(d);
return v->err;
}
! MDEBUG(("crevcon %d\n", t->id));
/*
* Pick a tentative midpoint.
*/
! mid = shortest(v, d, begin, begin, end, (chr **) NULL, (int *) NULL);
! if (mid == NULL) {
! freeDFA(d);
! freeDFA(d2);
! return REG_NOMATCH;
}
+ MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
/*
* Iterate until satisfaction or failure.
*************** complicatedReversedDissect(
*** 1005,1014 ****
*/
if (longest(v, d2, mid, end, NULL) == end) {
! int er = complicatedDissect(v, t->left, begin, mid);
if (er == REG_OKAY) {
! er = complicatedDissect(v, t->right, mid, end);
if (er == REG_OKAY) {
/*
* Satisfaction.
--- 791,800 ----
*/
if (longest(v, d2, mid, end, NULL) == end) {
! int er = cdissect(v, t->left, begin, mid);
if (er == REG_OKAY) {
! er = cdissect(v, t->right, mid, end);
if (er == REG_OKAY) {
/*
* Satisfaction.
*************** complicatedReversedDissect(
*** 1020,1026 ****
return REG_OKAY;
}
}
! if (er != REG_OKAY && er != REG_NOMATCH) {
freeDFA(d);
freeDFA(d2);
return er;
--- 806,812 ----
return REG_OKAY;
}
}
! if (er != REG_NOMATCH) {
freeDFA(d);
freeDFA(d2);
return er;
*************** complicatedReversedDissect(
*** 1036,1042 ****
* All possibilities exhausted.
*/
! MDEBUG(("%d no midpoint\n", t->retry));
freeDFA(d);
freeDFA(d2);
return REG_NOMATCH;
--- 822,828 ----
* All possibilities exhausted.
*/
! MDEBUG(("%d no midpoint\n", t->id));
freeDFA(d);
freeDFA(d2);
return REG_NOMATCH;
*************** complicatedReversedDissect(
*** 1047,1210 ****
* Failed to find a new one.
*/
! MDEBUG(("%d failed midpoint\n", t->retry));
freeDFA(d);
freeDFA(d2);
return REG_NOMATCH;
}
! MDEBUG(("%d: new midpoint %ld\n", t->retry, LOFF(mid)));
! v->mem[t->retry] = (mid - begin) + 1;
! zapSubtree(v, t->left);
! zapSubtree(v, t->right);
}
}
/*
! - complicatedBackrefDissect - determine backref subexpression matches
! ^ static int complicatedBackrefDissect(struct vars *, struct subre *, chr *, chr *);
*/
static int /* regexec return code */
! complicatedBackrefDissect(
struct vars *const v,
struct subre *const t,
chr *const begin, /* beginning of relevant substring */
chr *const end) /* end of same */
{
! int i, n = t->subno, min = t->min, max = t->max;
! chr *paren, *p, *stop;
! size_t len;
assert(t != NULL);
assert(t->op == 'b');
assert(n >= 0);
assert((size_t)n < v->nmatch);
! MDEBUG(("cbackref n%d %d{%d-%d}\n", t->retry, n, min, max));
if (v->pmatch[n].rm_so == -1) {
return REG_NOMATCH;
}
! paren = v->start + v->pmatch[n].rm_so;
! len = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
/*
! * No room to maneuver -- retries are pointless.
*/
! if (v->mem[t->retry]) {
return REG_NOMATCH;
}
- v->mem[t->retry] = 1;
! /*
! * Special-case zero-length string.
! */
! if (len == 0) {
! if (begin == end) {
! return REG_OKAY;
}
! return REG_NOMATCH;
}
/*
! * And too-short string.
*/
!
! assert(end >= begin);
! if ((size_t)(end - begin) < len) {
! return REG_NOMATCH;
}
- stop = end - len;
/*
! * Count occurrences.
*/
! i = 0;
! for (p = begin; p <= stop && (i < max || max == DUPINF); p += len) {
! if (v->g->compare(paren, p, len) != 0) {
! break;
! }
! i++;
}
! MDEBUG(("cbackref found %d\n", i));
/*
! * And sort it out.
*/
! if (p != end) { /* didn't consume all of it */
! return REG_NOMATCH;
! }
! if (min <= i && (i <= max || max == DUPINF)) {
! return REG_OKAY;
}
! return REG_NOMATCH; /* out of range */
}
/*
! - complicatedAlternationDissect - determine alternative subexpression matches (w.
! - complications)
! ^ static int complicatedAlternationDissect(struct vars *, struct subre *, chr *, chr *);
*/
static int /* regexec return code */
! complicatedAlternationDissect(
! struct vars *const v,
! struct subre *t,
! chr *const begin, /* beginning of relevant substring */
! chr *const end) /* end of same */
{
! int er;
! #define UNTRIED 0 /* not yet tried at all */
! #define TRYING 1 /* top matched, trying submatches */
! #define TRIED 2 /* top didn't match or submatches exhausted */
! #ifndef COMPILER_DOES_TAILCALL_OPTIMIZATION
! if (0) {
! doRight:
! t = t->right;
! }
! #endif
! if (t == NULL) {
! return REG_NOMATCH;
}
! assert(t->op == '|');
! if (v->mem[t->retry] == TRIED) {
! goto doRight;
}
! MDEBUG(("cAlt n%d\n", t->retry));
! assert(t->left != NULL);
! if (v->mem[t->retry] == UNTRIED) {
! struct dfa *d = newDFA(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
! if (ISERR()) {
! return v->err;
}
! if (longest(v, d, begin, end, NULL) != end) {
freeDFA(d);
! v->mem[t->retry] = TRIED;
! goto doRight;
}
- freeDFA(d);
- MDEBUG(("cAlt matched\n"));
- v->mem[t->retry] = TRYING;
- }
! er = complicatedDissect(v, t->left, begin, end);
! if (er != REG_NOMATCH) {
! return er;
}
! v->mem[t->retry] = TRIED;
! #ifndef COMPILER_DOES_TAILCALL_OPTIMIZATION
! goto doRight;
! #else
! doRight:
! return complicatedAlternationDissect(v, t->right, begin, end);
! #endif
}
#include "rege_dfa.c"
--- 833,1316 ----
* Failed to find a new one.
*/
! MDEBUG(("%d failed midpoint\n", t->id));
freeDFA(d);
freeDFA(d2);
return REG_NOMATCH;
}
! MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
! zaptreesubs(v, t->left);
! zaptreesubs(v, t->right);
}
}
/*
! - cbrdissect - dissect match for backref node
! ^ static int cbrdissect(struct vars *, struct subre *, chr *, chr *);
*/
static int /* regexec return code */
! cbrdissect(
struct vars *const v,
struct subre *const t,
chr *const begin, /* beginning of relevant substring */
chr *const end) /* end of same */
{
! int n = t->subno, min = t->min, max = t->max;
! size_t numreps;
! size_t tlen;
! size_t brlen;
! chr *brstring;
! chr *p;
assert(t != NULL);
assert(t->op == 'b');
assert(n >= 0);
assert((size_t)n < v->nmatch);
! MDEBUG(("cbackref n%d %d{%d-%d}\n", t->id, n, min, max));
+ /* get the backreferenced string */
if (v->pmatch[n].rm_so == -1) {
return REG_NOMATCH;
}
! brstring = v->start + v->pmatch[n].rm_so;
! brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
!
! /* special cases for zero-length strings */
! if (brlen == 0) {
! /*
! * matches only if target is zero length, but any number of
! * repetitions can be considered to be present
! */
! if (begin == end && min <= max) {
! MDEBUG(("cbackref matched trivially\n"));
! return REG_OKAY;
! }
! return REG_NOMATCH;
! }
! if (begin == end) {
! /* matches only if zero repetitions are okay */
! if (min == 0) {
! MDEBUG(("cbackref matched trivially\n"));
! return REG_OKAY;
! }
! return REG_NOMATCH;
! }
/*
! * check target length to see if it could possibly be an allowed number of
! * repetitions of brstring
*/
! assert(end > begin);
! tlen = end - begin;
! if (tlen % brlen != 0)
! return REG_NOMATCH;
! numreps = tlen / brlen;
! if (numreps < min || (numreps > max && max != DUPINF))
return REG_NOMATCH;
+
+ /* okay, compare the actual string contents */
+ p = begin;
+ while (numreps-- > 0) {
+ if ((*v->g->compare) (brstring, p, brlen) != 0)
+ return REG_NOMATCH;
+ p += brlen;
}
! MDEBUG(("cbackref matched\n"));
! return REG_OKAY;
! }
!
! /*
! - caltdissect - dissect match for alternation node
! ^ static int caltdissect(struct vars *, struct subre *, chr *, chr *);
! */
! static int /* regexec return code */
! caltdissect(
! struct vars *const v,
! struct subre *t,
! chr *const begin, /* beginning of relevant substring */
! chr *const end) /* end of same */
! {
! struct dfa *d;
! int er;
! /* We loop, rather than tail-recurse, to handle a chain of alternatives */
! while (t != NULL) {
! assert(t->op == '|');
! assert(t->left != NULL && t->left->cnfa.nstates > 0);
!
! MDEBUG(("calt n%d\n", t->id));
!
! d = newDFA(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
! NOERR();
! if (longest(v, d, begin, end, (int *) NULL) == end) {
! freeDFA(d);
! MDEBUG(("calt matched\n"));
! er = cdissect(v, t->left, begin, end);
! if (er != REG_NOMATCH) {
! return er;
! }
}
! freeDFA(d);
!
! t = t->right;
}
+ return REG_NOMATCH;
+ }
+
+ /*
+ - citerdissect - dissect match for iteration node
+ ^ static int citerdissect(struct vars *, struct subre *, chr *, chr *);
+ */
+ static int /* regexec return code */
+ citerdissect(struct vars * v,
+ struct subre * t,
+ chr *begin, /* beginning of relevant substring */
+ chr *end) /* end of same */
+ {
+ struct dfa *d;
+ chr **endpts;
+ chr *limit;
+ int min_matches;
+ size_t max_matches;
+ int nverified;
+ int k;
+ int i;
+ int er;
+
+ assert(t->op == '*');
+ assert(t->left != NULL && t->left->cnfa.nstates > 0);
+ assert(!(t->left->flags & SHORTER));
+ assert(begin <= end);
+
/*
! * If zero matches are allowed, and target string is empty, just declare
! * victory. OTOH, if target string isn't empty, zero matches can't work
! * so we pretend the min is 1.
*/
! min_matches = t->min;
! if (min_matches <= 0) {
! if (begin == end)
! return REG_OKAY;
! min_matches = 1;
}
/*
! * We need workspace to track the endpoints of each sub-match. Normally
! * we consider only nonzero-length sub-matches, so there can be at most
! * end-begin of them. However, if min is larger than that, we will also
! * consider zero-length sub-matches in order to find enough matches.
! *
! * For convenience, endpts[0] contains the "begin" pointer and we store
! * sub-match endpoints in endpts[1..max_matches].
*/
+ max_matches = end - begin;
+ if (max_matches > t->max && t->max != DUPINF)
+ max_matches = t->max;
+ if (max_matches < min_matches)
+ max_matches = min_matches;
+ endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
+ if (endpts == NULL)
+ return REG_ESPACE;
+ endpts[0] = begin;
! d = newDFA(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
! if (ISERR()) {
! FREE(endpts);
! return v->err;
}
! MDEBUG(("citer %d\n", t->id));
/*
! * Our strategy is to first find a set of sub-match endpoints that are
! * valid according to the child node's DFA, and then recursively dissect
! * each sub-match to confirm validity. If any validity check fails,
! * backtrack the last sub-match and try again. And, when we next try for
! * a validity check, we need not recheck any successfully verified
! * sub-matches that we didn't move the endpoints of. nverified remembers
! * how many sub-matches are currently known okay.
*/
! /* initialize to consider first sub-match */
! nverified = 0;
! k = 1;
! limit = end;
!
! /* iterate until satisfaction or failure */
! while (k > 0) {
! /* try to find an endpoint for the k'th sub-match */
! endpts[k] = longest(v, d, endpts[k - 1], limit, (int *) NULL);
! if (endpts[k] == NULL) {
! /* no match possible, so see if we can shorten previous one */
! k--;
! goto backtrack;
! }
! MDEBUG(("%d: working endpoint %d: %ld\n",
! t->id, k, LOFF(endpts[k])));
!
! /* k'th sub-match can no longer be considered verified */
! if (nverified >= k)
! nverified = k - 1;
!
! if (endpts[k] != end) {
! /* haven't reached end yet, try another iteration if allowed */
! if (k >= max_matches) {
! /* must try to shorten some previous match */
! k--;
! goto backtrack;
! }
!
! /* reject zero-length match unless necessary to achieve min */
! if (endpts[k] == endpts[k - 1] &&
! (k >= min_matches || min_matches - k < end - endpts[k]))
! goto backtrack;
!
! k++;
! limit = end;
! continue;
! }
!
! /*
! * We've identified a way to divide the string into k sub-matches
! * that works so far as the child DFA can tell. If k is an allowed
! * number of matches, start the slow part: recurse to verify each
! * sub-match. We always have k <= max_matches, needn't check that.
! */
! if (k < min_matches)
! goto backtrack;
!
! MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k));
!
! for (i = nverified + 1; i <= k; i++) {
! zaptreesubs(v, t->left);
! er = cdissect(v, t->left, endpts[i - 1], endpts[i]);
! if (er == REG_OKAY) {
! nverified = i;
! continue;
! }
! if (er == REG_NOMATCH)
! break;
! /* oops, something failed */
! freeDFA(d);
! FREE(endpts);
! return er;
! }
!
! if (i > k) {
! /* satisfaction */
! MDEBUG(("%d successful\n", t->id));
! freeDFA(d);
! FREE(endpts);
! return REG_OKAY;
! }
!
! /* match failed to verify, so backtrack */
!
! backtrack:
! /*
! * Must consider shorter versions of the current sub-match. However,
! * we'll only ask for a zero-length match if necessary.
! */
! while (k > 0) {
! chr *prev_end = endpts[k - 1];
!
! if (endpts[k] > prev_end) {
! limit = endpts[k] - 1;
! if (limit > prev_end ||
! (k < min_matches && min_matches - k >= end - prev_end)) {
! /* break out of backtrack loop, continue the outer one */
! break;
! }
! }
! /* can't shorten k'th sub-match any more, consider previous one */
! k--;
! }
}
!
! /* all possibilities exhausted */
! MDEBUG(("%d failed\n", t->id));
! freeDFA(d);
! FREE(endpts);
! return REG_NOMATCH;
}
/*
! - creviterdissect - dissect match for iteration node, shortest-first
! ^ static int creviterdissect(struct vars *, struct subre *, chr *, chr *);
*/
static int /* regexec return code */
! creviterdissect(struct vars * v,
! struct subre * t,
! chr *begin, /* beginning of relevant substring */
! chr *end) /* end of same */
{
! struct dfa *d;
! chr **endpts;
! chr *limit;
! int min_matches;
! size_t max_matches;
! int nverified;
! int k;
! int i;
! int er;
! assert(t->op == '*');
! assert(t->left != NULL && t->left->cnfa.nstates > 0);
! assert(t->left->flags & SHORTER);
! assert(begin <= end);
!
! /*
! * If zero matches are allowed, and target string is empty, just declare
! * victory. OTOH, if target string isn't empty, zero matches can't work
! * so we pretend the min is 1.
! */
! min_matches = t->min;
! if (min_matches <= 0) {
! if (begin == end)
! return REG_OKAY;
! min_matches = 1;
}
!
! /*
! * We need workspace to track the endpoints of each sub-match. Normally
! * we consider only nonzero-length sub-matches, so there can be at most
! * end-begin of them. However, if min is larger than that, we will also
! * consider zero-length sub-matches in order to find enough matches.
! *
! * For convenience, endpts[0] contains the "begin" pointer and we store
! * sub-match endpoints in endpts[1..max_matches].
! */
! max_matches = end - begin;
! if (max_matches > t->max && t->max != DUPINF)
! max_matches = t->max;
! if (max_matches < min_matches)
! max_matches = min_matches;
! endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
! if (endpts == NULL)
! return REG_ESPACE;
! endpts[0] = begin;
!
! d = newDFA(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
! if (ISERR()) {
! FREE(endpts);
! return v->err;
}
+ MDEBUG(("creviter %d\n", t->id));
! /*
! * Our strategy is to first find a set of sub-match endpoints that are
! * valid according to the child node's DFA, and then recursively dissect
! * each sub-match to confirm validity. If any validity check fails,
! * backtrack the last sub-match and try again. And, when we next try for
! * a validity check, we need not recheck any successfully verified
! * sub-matches that we didn't move the endpoints of. nverified remembers
! * how many sub-matches are currently known okay.
! */
! /* initialize to consider first sub-match */
! nverified = 0;
! k = 1;
! limit = begin;
! /* iterate until satisfaction or failure */
! while (k > 0) {
! /* disallow zero-length match unless necessary to achieve min */
! if (limit == endpts[k - 1] &&
! limit != end &&
! (k >= min_matches || min_matches - k < end - limit))
! limit++;
!
! /* if this is the last allowed sub-match, it must reach to the end */
! if (k >= max_matches)
! limit = end;
!
! /* try to find an endpoint for the k'th sub-match */
! endpts[k] = shortest(v, d, endpts[k - 1], limit, end,
! (chr **) NULL, (int *) NULL);
! if (endpts[k] == NULL) {
! /* no match possible, so see if we can lengthen previous one */
! k--;
! goto backtrack;
}
! MDEBUG(("%d: working endpoint %d: %ld\n",
! t->id, k, LOFF(endpts[k])));
!
! /* k'th sub-match can no longer be considered verified */
! if (nverified >= k)
! nverified = k - 1;
!
! if (endpts[k] != end) {
! /* haven't reached end yet, try another iteration if allowed */
! if (k >= max_matches) {
! /* must try to lengthen some previous match */
! k--;
! goto backtrack;
! }
!
! k++;
! limit = endpts[k - 1];
! continue;
! }
!
! /*
! * We've identified a way to divide the string into k sub-matches
! * that works so far as the child DFA can tell. If k is an allowed
! * number of matches, start the slow part: recurse to verify each
! * sub-match. We always have k <= max_matches, needn't check that.
! */
! if (k < min_matches)
! goto backtrack;
!
! MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k));
!
! for (i = nverified + 1; i <= k; i++) {
! zaptreesubs(v, t->left);
! er = cdissect(v, t->left, endpts[i - 1], endpts[i]);
! if (er == REG_OKAY) {
! nverified = i;
! continue;
! }
! if (er == REG_NOMATCH)
! break;
! /* oops, something failed */
freeDFA(d);
! FREE(endpts);
! return er;
}
! if (i > k) {
! /* satisfaction */
! MDEBUG(("%d successful\n", t->id));
! freeDFA(d);
! FREE(endpts);
! return REG_OKAY;
! }
!
! /* match failed to verify, so backtrack */
!
! backtrack:
! /*
! * Must consider longer versions of the current sub-match.
! */
! while (k > 0) {
! if (endpts[k] < end) {
! limit = endpts[k] + 1;
! /* break out of backtrack loop, continue the outer one */
! break;
! }
! /* can't lengthen k'th sub-match any more, consider previous one */
! k--;
! }
}
! /* all possibilities exhausted */
! MDEBUG(("%d failed\n", t->id));
! freeDFA(d);
! FREE(endpts);
! return REG_NOMATCH;
}
#include "rege_dfa.c"
diff -pcdr tcl8.6.4.orig/generic/regguts.h tcl8.6.4/generic/regguts.h
*** tcl8.6.4.orig/generic/regguts.h Thu Feb 26 11:57:28 2015
--- tcl8.6.4/generic/regguts.h Thu Sep 17 18:52:18 2015
*************** struct cnfa {
*** 329,339 ****
/*
* subexpression tree
*/
struct subre {
! char op; /* '|', '.' (concat), 'b' (backref), '(',
! * '=' */
char flags;
#define LONGER 01 /* prefers longer match */
#define SHORTER 02 /* prefers shorter match */
--- 329,356 ----
/*
* subexpression tree
+ *
+ * "op" is one of:
+ * '=' plain regex without interesting substructure (implemented as DFA)
+ * 'b' back-reference (has no substructure either)
+ * '(' capture node: captures the match of its single child
+ * '.' concatenation: matches a match for left, then a match for right
+ * '|' alternation: matches a match for left or a match for right
+ * '*' iteration: matches some number of matches of its single child
+ *
+ * Note: the right child of an alternation must be another alternation or
+ * NULL; hence, an N-way branch requires N alternation nodes, not N-1 as you
+ * might expect. This could stand to be changed. Actually I'd rather see
+ * a single alternation node with N children, but that will take revising
+ * the representation of struct subre.
+ *
+ * Note: when a backref is directly quantified, we stick the min/max counts
+ * into the backref rather than plastering an iteration node on top. This is
+ * for efficiency: there is no need to search for possible division points.
*/
struct subre {
! char op; /* see type codes above */
char flags;
#define LONGER 01 /* prefers longer match */
#define SHORTER 02 /* prefers shorter match */
*************** struct subre {
*** 349,358 ****
#define PREF(f) ((f)&NOPROP)
#define PREF2(f1, f2) ((PREF(f1) != 0) ? PREF(f1) : PREF(f2))
#define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2))
! short retry; /* index into retry memory */
int subno; /* subexpression number (for 'b' and '(') */
! short min; /* min repetitions, for backref only */
! short max; /* max repetitions, for backref only */
struct subre *left; /* left child, if any (also freelist chain) */
struct subre *right; /* right child, if any */
struct state *begin; /* outarcs from here... */
--- 366,375 ----
#define PREF(f) ((f)&NOPROP)
#define PREF2(f1, f2) ((PREF(f1) != 0) ? PREF(f1) : PREF(f2))
#define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2))
! short id; /* ID of subre (1..ntree) */
int subno; /* subexpression number (for 'b' and '(') */
! short min; /* min repetitions for iteration or backref */
! short max; /* max repetitions for iteration or backref */
struct subre *left; /* left child, if any (also freelist chain) */
struct subre *right; /* right child, if any */
struct state *begin; /* outarcs from here... */
diff -pcdr tcl8.6.4.orig/tests/reg.test tcl8.6.4/tests/reg.test
*** tcl8.6.4.orig/tests/reg.test Thu Feb 26 11:57:28 2015
--- tcl8.6.4/tests/reg.test Fri Sep 18 12:26:33 2015
*************** expectError 14.19 - {a(b)c\2} ESUBREG
*** 669,675 ****
expectMatch 14.20 bR {a\(b*\)c\1} abbcbb abbcbb bb
expectMatch 14.21 RP {^([bc])\1*$} bbb bbb b
expectMatch 14.22 RP {^([bc])\1*$} ccc ccc c
! knownBug expectNomatch 14.23 R {^([bc])\1*$} bcb
doing 15 "octal escapes vs back references"
--- 669,681 ----
expectMatch 14.20 bR {a\(b*\)c\1} abbcbb abbcbb bb
expectMatch 14.21 RP {^([bc])\1*$} bbb bbb b
expectMatch 14.22 RP {^([bc])\1*$} ccc ccc c
! expectNomatch 14.23 RP {^([bc])\1*$} bcb
! expectMatch 14.24 LRP {^(\w+)( \1)+$} {abc abc abc} {abc abc abc} abc { abc}
! expectNomatch 14.25 LRP {^(\w+)( \1)+$} {abc abd abc}
! expectNomatch 14.26 LRP {^(\w+)( \1)+$} {abc abc abd}
! expectMatch 14.27 RP {^(.+)( \1)+$} {abc abc abc} {abc abc abc} abc { abc}
! expectNomatch 14.28 RP {^(.+)( \1)+$} {abc abd abc}
! expectNomatch 14.29 RP {^(.+)( \1)+$} {abc abc abd}
doing 15 "octal escapes vs back references"
*************** expectMatch 21.31 LP "\\y(\\w+)\\y" "--
*** 796,801 ****
--- 802,808 ----
expectMatch 21.32 - a((b|c)d+)+ abacdbd acdbd bd b
expectMatch 21.33 N (.*).* abc abc abc
expectMatch 21.34 N (a*)* bc "" ""
+ expectMatch 21.35 M { TO (([a-z0-9._]+|"([^"]+|"")+")+)} {asd TO foo} { TO foo} foo o {}
doing 22 "multicharacter collating elements"
*************** expectMatch 24.9 - 3z* 123zzzz456 3zz
*** 848,853 ****
--- 855,861 ----
expectMatch 24.10 PT 3z*? 123zzzz456 3
expectMatch 24.11 - z*4 123zzzz456 zzzz4
expectMatch 24.12 PT z*?4 123zzzz456 zzzz4
+ expectMatch 24.13 PT {^([^/]+?)(?:/([^/]+?))(?:/([^/]+?))?$} {foo/bar/baz} {foo/bar/baz} {foo} {bar} {baz}
doing 25 "mixed quantifiers"