From 8357fa551c8a5a4f14540b250bc2485c4812a234 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Fri, 17 Apr 2020 10:12:01 +0900 Subject: [PATCH] dfa: build auxiliary indexes before remove epsilon closure When remove epsilon closure, so far searched nodes including epsilon closure in all nodes sequentially, but it is slow for some cases. Now build auxiliary indexes before search. Problem reported in: https://debbugs.gnu.org/cgi/bugreport.cgi?bug=40634 * lib/dfa.c (epsclosure): build auxiliary indexes before remove epsilon closure. --- lib/dfa.c | 129 +++++++++++++++++++++++++++++++++++++++++++++---------------- 1 files changed, 95 insertions(+), 34 deletions(-) diff --git a/lib/dfa.c b/lib/dfa.c index 9939d22..5f90a92 100644 --- a/lib/dfa.c +++ b/lib/dfa.c @@ -2298,45 +2298,106 @@ static void epsclosure (struct dfa const *d) { position_set tmp; + idx_t *currs, *backs; + idx_t ncurr = 0; + position_set *prevs; + alloc_position_set (&tmp, d->nleaves); + currs = xnmalloc (d->nleaves, sizeof *currs); + backs = xnmalloc (d->tindex, sizeof *backs); + for (idx_t i = 0; i < d->tindex; i++) - if (d->follows[i].nelem > 0 && d->tokens[i] >= NOTCHAR - && d->tokens[i] != BACKREF && d->tokens[i] != ANYCHAR - && d->tokens[i] != MBCSET && d->tokens[i] < CSET) - { - unsigned int constraint; - switch (d->tokens[i]) - { - case BEGLINE: - constraint = BEGLINE_CONSTRAINT; - break; - case ENDLINE: - constraint = ENDLINE_CONSTRAINT; - break; - case BEGWORD: - constraint = BEGWORD_CONSTRAINT; - break; - case ENDWORD: - constraint = ENDWORD_CONSTRAINT; - break; - case LIMWORD: - constraint = LIMWORD_CONSTRAINT; - break; - case NOTLIMWORD: - constraint = NOTLIMWORD_CONSTRAINT; - break; - default: - constraint = NO_CONSTRAINT; - break; - } + { + if (d->follows[i].nelem > 0 && d->tokens[i] >= NOTCHAR + && d->tokens[i] != ANYCHAR && d->tokens[i] != BEG + && d->tokens[i] != BACKREF && d->tokens[i] != MBCSET + && d->tokens[i] < CSET) + { + currs[ncurr] = i; + backs[i] = ncurr++; + } + else + backs[i] = -1; + } - delete (i, &d->follows[i]); + prevs = xnmalloc (ncurr, sizeof *prevs); - for (idx_t j = 0; j < d->tindex; j++) - if (i != j && d->follows[j].nelem > 0) - replace (&d->follows[j], i, &d->follows[i], constraint, &tmp); - } + for (idx_t i = 0; i < ncurr; i++) + { + prevs[i].elems = NULL; + prevs[i].nelem = 0; + } + + for (idx_t i = 0; i < d->tindex; i++) + { + for (idx_t j = 0, k = 0; j < d->follows[i].nelem && k < ncurr;) + { + if (d->follows[i].elems[j].index < currs[k]) + j++; + else if (currs[k] < d->follows[i].elems[j].index) + k++; + else + { + if (currs[k] != i) + { + position p; + p.index = i; + p.constraint = NO_CONSTRAINT; + if (prevs[k].nelem == 0) + alloc_position_set (&prevs[k], 1); + insert (p, &prevs[k]); + } + j++; + k++; + } + } + } + + for (idx_t i = 0; i < ncurr; i++) + { + unsigned int constraint; + switch (d->tokens[currs[i]]) + { + case BEGLINE: + constraint = BEGLINE_CONSTRAINT; + break; + case ENDLINE: + constraint = ENDLINE_CONSTRAINT; + break; + case BEGWORD: + constraint = BEGWORD_CONSTRAINT; + break; + case ENDWORD: + constraint = ENDWORD_CONSTRAINT; + break; + case LIMWORD: + constraint = LIMWORD_CONSTRAINT; + break; + case NOTLIMWORD: + constraint = NOTLIMWORD_CONSTRAINT; + break; + default: + constraint = NO_CONSTRAINT; + break; + } + + delete (currs[i], &d->follows[currs[i]]); + + for (idx_t j = 0; j < prevs[i].nelem; j++) + replace (&d->follows[prevs[i].elems[j].index], currs[i], + &d->follows[currs[i]], constraint, &tmp); + for (idx_t j = 0; j < d->follows[currs[i]].nelem; j++) + if (backs[d->follows[currs[i]].elems[j].index] >= 0) + replace (&prevs[backs[d->follows[currs[i]].elems[j].index]], + currs[i], &prevs[i], NO_CONSTRAINT, &tmp); + } + + for (idx_t i = 0; i < ncurr; i++) + free (prevs[i].elems); + free (prevs); free (tmp.elems); + free (currs); + free (backs); } /* Returns the set of contexts for which there is at least one -- 1.7.1