[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 4/4] dfa: optimize wide characters in a bracket expression
From: Paolo Bonzini
Subject: [PATCH 4/4] dfa: optimize wide characters in a bracket expression
Date: Tue, 7 Jun 2011 13:03:40 +0200
* src/dfa.c (addtok): Compile the wide characters of a non-inverted
MBCSET to an alternation of single-character tokens.  Handle the case
where nothing else remains in the MBCSET after they are extracted.
---
src/dfa.c | 36 ++++++++++++++++++++++++++++++++++--
1 files changed, 34 insertions(+), 2 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index 8fc6ed0..aecaad9 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -1449,6 +1449,8 @@ addtok_mb (token t, int mbprop)
dfa->depth = depth;
}
+static void addtok_wc (wint_t wc);
+
/* Add the given token to the parse tree, maintaining the depth count and
updating the maximum depth if necessary. */
static void
@@ -1457,8 +1459,24 @@ addtok (token t)
#if MBS_SUPPORT
if (MB_CUR_MAX > 1 && t == MBCSET)
{
+ bool need_or = false;
struct mb_char_classes *work_mbc = &dfa->mbcsets[dfa->nmbcsets - 1];
+ /* Extract wide characters into alternations if possible (for
+ better performance). This does not require UTF-8. */
+ if (!work_mbc->invert)
+ {
+ int i;
+ for (i = 0; i < work_mbc->nchars; i++)
+ {
+ addtok_wc (work_mbc->chars[i]);
+ if (need_or)
+ addtok (OR);
+ need_or = true;
+ }
+ work_mbc->nchars = 0;
+ }
+
/* UTF-8 allows treating a simple, non-inverted MBCSET like a CSET. */
if (work_mbc->invert
|| (!using_utf8() && work_mbc->cset != -1)
@@ -1467,9 +1485,23 @@ addtok (token t)
|| work_mbc->nranges != 0
|| work_mbc->nequivs != 0
|| work_mbc->ncoll_elems != 0)
- addtok_mb (MBCSET, ((dfa->nmbcsets - 1) << 2) + 3);
+ {
+ addtok_mb (MBCSET, ((dfa->nmbcsets - 1) << 2) + 3);
+ if (need_or)
+ addtok (OR);
+ }
else
- addtok (CSET + work_mbc->cset);
+ {
+ /* Characters have been handled above, so it is possible
+ that the mbcset is empty now. Do nothing in that case. */
+ if (work_mbc->cset != -1)
+ {
+ assert (using_utf8 ());
+ addtok (CSET + work_mbc->cset);
+ if (need_or)
+ addtok (OR);
+ }
+ }
}
else
#endif
--
1.7.4.4