From 7c68757561c595a5966da718f9bc0bffcd84a373 Mon Sep 17 00:00:00 2001
From: Paul Eggert
Date: Thu, 1 Sep 2016 08:43:48 -0700
Subject: [PATCH 2/2] grep: avoid code duplication with -iF
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
* NEWS: Simplify description of -iF improvement.
* src/dfa.c: Do not include wctype.h.
(lonesome_lower, case_folded_counterparts): Move to localeinfo.c.
(CASE_FOLDED_BUFSIZE): Move to localeinfo.h.
* src/grep.c: Do not include wctype.h.
(lonesome_lower): Remove.
(fgrep_icase_available): Use case_folded_counterparts instead.
Do not call it for the same character twice.
Return false on wcrtomb failures (which should never happen).
(fgrep_to_grep_pattern, main): Simplify. Let fgrep_to_grep’s
caller fiddle with the global variables.
* src/localeinfo.c: Include
(lonesome_lower, case_folded_counterparts):
Move here from src/dfa.c. Return int, not unsigned int.
Verify that CASE_FOLDED_BUFSIZE is big enough.
* src/localeinfo.h (CASE_FOLDED_BUFSIZE): Now 32, so that
we don’t expose lonesome_lower’s size.
* src/searchutils.c (kwsinit): Return new kwset instead of
storing it via a pointer. All callers changed. Simplify a bit.
---
NEWS | 9 ++--
src/dfa.c | 48 -------------------
src/dfasearch.c | 2 +-
src/grep.c | 135 +++++++++++++++++++-----------------------------------
src/kwsearch.c | 2 +-
src/localeinfo.c | 47 +++++++++++++++++++
src/localeinfo.h | 7 +++
src/search.h | 2 +-
src/searchutils.c | 24 ++++------
9 files changed, 115 insertions(+), 161 deletions(-)
diff --git a/NEWS b/NEWS
index 65e663d..91f1bfc 100644
--- a/NEWS
+++ b/NEWS
@@ -10,18 +10,15 @@ GNU grep NEWS -*- outline -*-
as it now uses the Aho-Corasick algorithm instead of the
Commentz-Walter algorithm in that case.
+ grep -iF is typically much faster in a multibyte locale, if the
+ pattern and its case counterparts contain only single byte characters.
+
In multibyte locales that are not UTF-8, grep now handles leading
"." in patterns more efficiently.
grep now prints a "FILENAME:LINENO: " prefix when diagnosing an
invalid regular expression that was read from an '-f'-specified file.
- grep -F now tries fgrep matcher for case insensitive matching in
- multibyte locale. It improves performance for a case which a pattern
- is composed of only single byte characters and their all counterparts
- are also single byte characters and the pattern does not have invalid
- sequences.
-
* Noteworthy changes in release 2.25 (2016-04-21) [stable]
diff --git a/src/dfa.c b/src/dfa.c
index cc96740..0f5109e 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -50,7 +50,6 @@
#define _(str) gettext (str)
#include
-#include
/* HPUX defines these as macros in sys/param.h. */
#ifdef setbit
@@ -857,53 +856,6 @@ using_simple_locale (bool multibyte)
# define MIN(a,b) ((a) < (b) ? (a) : (b))
#endif
-/* The set of wchar_t values C such that there's a useful locale
- somewhere where C != towupper (C) && C != towlower (towupper (C)).
- For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
- towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
- towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU). */
-static short const lonesome_lower[] =
- {
- 0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
- 0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,
-
- /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
- counterpart in locales predating Unicode 4.0.0 (April 2003). */
- 0x03F2,
-
- 0x03F5, 0x1E9B, 0x1FBE,
- };
-
-/* Maximum number of characters that can be the case-folded
- counterparts of a single character, not counting the character
- itself. This is 1 for towupper, 1 for towlower, and 1 for each
- entry in LONESOME_LOWER. */
-enum
-{ CASE_FOLDED_BUFSIZE = 2 + sizeof lonesome_lower / sizeof *lonesome_lower };
-
-/* Find the characters equal to C after case-folding, other than C
- itself, and store them into FOLDED. Return the number of characters
- stored. */
-static unsigned int
-case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE])
-{
- unsigned int i;
- unsigned int n = 0;
- wint_t uc = towupper (c);
- wint_t lc = towlower (uc);
- if (uc != c)
- folded[n++] = uc;
- if (lc != uc && lc != c && towupper (lc) == uc)
- folded[n++] = lc;
- for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++)
- {
- wint_t li = lonesome_lower[i];
- if (li != lc && li != uc && li != c && towupper (li) == uc)
- folded[n++] = li;
- }
- return n;
-}
-
typedef int predicate (int);
/* The following list maps the names of the Posix named character classes
diff --git a/src/dfasearch.c b/src/dfasearch.c
index 548ef08..61b1f70 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -89,7 +89,7 @@ kwsmusts (void)
struct dfamust *dm = dfamust (dfa);
if (!dm)
return;
- kwsinit (&kwset, false);
+ kwset = kwsinit (false);
if (dm->exact)
{
/* Prepare a substring whose presence implies a match.
diff --git a/src/grep.c b/src/grep.c
index ae3b6e7..15c6dc6 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -22,7 +22,6 @@
#include
#include
#include
-#include
#include
#include
#include
@@ -2265,91 +2264,53 @@ contains_encoding_error (char const *pat, size_t patlen)
return false;
}
-/* The set of wchar_t values C such that there's a useful locale
- somewhere where C != towupper (C) && C != towlower (towupper (C)).
- For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
- towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
- towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU). */
-static short const lonesome_lower[] =
- {
- 0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
- 0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,
-
- /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
- counterpart in locales predating Unicode 4.0.0 (April 2003). */
- 0x03F2,
-
- 0x03F5, 0x1E9B, 0x1FBE
- };
+/* Return true if the fgrep pattern PAT, of size PATLEN, matches only
+ single-byte characters including case folding, and so is suitable
+ to be converted to a grep pattern. */
static bool
fgrep_icase_available (char const *pat, size_t patlen)
{
- for (size_t i = 0; i < patlen; ++i)
- {
- unsigned char c = pat[i];
- if (localeinfo.sbclen[c] > 1)
- return false;
- }
+ bool used[UCHAR_MAX + 1] = { 0, };
- for (size_t i = 0; i < patlen; ++i)
+ for (size_t i = 0; i < patlen; i++)
{
unsigned char c = pat[i];
-
- wint_t wc = localeinfo.sbctowc[c];
- if (wc == WEOF)
+ if (localeinfo.sbctowc[c] == WEOF)
return false;
+ used[c] = true;
+ }
- wint_t uwc = towupper (wc);
- if (uwc != wc)
- {
- char s[MB_LEN_MAX];
- mbstate_t mb_state = { 0 };
- size_t len = wcrtomb (s, uwc, &mb_state);
- if (len > 1 && len != (size_t) -1)
- return false;
- }
-
- wint_t lwc = towlower (uwc);
- if (lwc != uwc && lwc != wc && towupper (lwc) == uwc)
- {
- char s[MB_LEN_MAX];
- mbstate_t mb_state = { 0 };
- size_t len = wcrtomb (s, lwc, &mb_state);
- if (len > 1 && len != (size_t) -1)
- return false;
- }
+ for (int c = 0; c <= UCHAR_MAX; c++)
+ if (used[c])
+ {
+ wint_t wc = localeinfo.sbctowc[c];
+ wchar_t folded[CASE_FOLDED_BUFSIZE];
+ size_t nfolded = case_folded_counterparts (wc, folded);
- for (size_t j = 0; lonesome_lower[j]; j++)
- {
- wint_t li = lonesome_lower[j];
- if (li != lwc && li != uwc && li != wc && towupper (li) == uwc)
- {
- char s[MB_LEN_MAX];
- mbstate_t mb_state = { 0 };
- size_t len = wcrtomb (s, li, &mb_state);
- if (len > 1 && len != (size_t) -1)
- return false;
- }
- }
- }
+ for (size_t i = 0; i < nfolded; i++)
+ {
+ char s[MB_LEN_MAX];
+ mbstate_t mb_state = { 0 };
+ if (1 < wcrtomb (s, folded[i], &mb_state))
+ return false;
+ }
+ }
return true;
}
-/* Change a pattern for fgrep into grep. */
+/* Change the pattern *KEYS_P, of size *LEN_P, from fgrep to grep style. */
+
static void
fgrep_to_grep_pattern (char **keys_p, size_t *len_p)
{
- char *keys, *new_keys, *p;
+ size_t len = *len_p;
+ char *keys = *keys_p;
mbstate_t mb_state = { 0 };
- size_t len, n;
-
- len = *len_p;
- keys = *keys_p;
-
- new_keys = xnmalloc (len + 1, 2);
- p = new_keys;
+ char *new_keys = xnmalloc (len + 1, 2);
+ char *p = new_keys;
+ size_t n;
for (; len; keys += n, len -= n)
{
@@ -2365,14 +2326,15 @@ fgrep_to_grep_pattern (char **keys_p, size_t *len_p)
case (size_t) -1:
memset (&mb_state, 0, sizeof mb_state);
+ n = 1;
/* Fall through. */
case 1:
- *p = '\\';
- p += strchr ("$*.[\\^", *keys) != NULL;
- /* Fall through. */
- case 0:
+ switch (*keys)
+ {
+ case '$': case '*': case '.': case '[': case '\\': case '^':
+ *p++ = '\\'; break;
+ }
*p++ = *keys;
- n = 1;
break;
}
}
@@ -2380,10 +2342,6 @@ fgrep_to_grep_pattern (char **keys_p, size_t *len_p)
free (*keys_p);
*keys_p = new_keys;
*len_p = p - new_keys;
-
- matcher = "grep";
- compile = Gcompile;
- execute = EGexecute;
}
int
@@ -2814,19 +2772,18 @@ main (int argc, char **argv)
/* In a unibyte locale, switch from fgrep to grep if
the pattern matches words (where grep is typically faster).
In a multibyte locale, switch from fgrep to grep if either
- (1) case is ignored (where grep is typically faster), or
- (2) the pattern has an encoding error (where fgrep might not work). */
- if (compile == Fcompile)
+ (1) the pattern has an encoding error (where fgrep might not work), or
+ (2) case is ignored and a fast fgrep ignore-case is not available. */
+ if (compile == Fcompile
+ && (MB_CUR_MAX <= 1
+ ? match_words
+ : (contains_encoding_error (keys, keycc)
+ || (match_icase && !fgrep_icase_available (keys, keycc)))))
{
- if (MB_CUR_MAX > 1)
- {
- if (contains_encoding_error (keys, keycc))
- fgrep_to_grep_pattern (&keys, &keycc);
- else if (match_icase && !fgrep_icase_available (keys, keycc))
- fgrep_to_grep_pattern (&keys, &keycc);
- }
- else if (match_words)
- fgrep_to_grep_pattern (&keys, &keycc);
+ fgrep_to_grep_pattern (&keys, &keycc);
+ matcher = "grep";
+ compile = Gcompile;
+ execute = EGexecute;
}
compile (keys, keycc);
diff --git a/src/kwsearch.c b/src/kwsearch.c
index 7fe08aa..29d140c 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -38,7 +38,7 @@ Fcompile (char const *pattern, size_t size)
{
size_t total = size;
- kwsinit (&kwset, true);
+ kwset = kwsinit (true);
char const *p = pattern;
do
diff --git a/src/localeinfo.c b/src/localeinfo.c
index 329d431..ca96afc 100644
--- a/src/localeinfo.c
+++ b/src/localeinfo.c
@@ -29,6 +29,7 @@
#include
#include
#include
+#include
/* The sbclen implementation relies on this. */
verify (MB_LEN_MAX <= SCHAR_MAX);
@@ -64,3 +65,49 @@ init_localeinfo (struct localeinfo *localeinfo)
localeinfo->sbctowc[uc] = len <= 1 ? wc : WEOF;
}
}
+
+/* The set of wchar_t values C such that there's a useful locale
+ somewhere where C != towupper (C) && C != towlower (towupper (C)).
+ For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
+ towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
+ towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU). */
+static short const lonesome_lower[] =
+ {
+ 0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
+ 0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,
+
+ /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
+ counterpart in locales predating Unicode 4.0.0 (April 2003). */
+ 0x03F2,
+
+ 0x03F5, 0x1E9B, 0x1FBE,
+ };
+
+/* Verify that the worst case fits. This is 1 for towupper, 1 for
+ towlower, and 1 for each entry in LONESOME_LOWER. */
+verify (1 + 1 + sizeof lonesome_lower / sizeof *lonesome_lower
+ <= CASE_FOLDED_BUFSIZE);
+
+/* Find the characters equal to C after case-folding, other than C
+ itself, and store them into FOLDED. Return the number of characters
+ stored. */
+
+int
+case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE])
+{
+ int i;
+ int n = 0;
+ wint_t uc = towupper (c);
+ wint_t lc = towlower (uc);
+ if (uc != c)
+ folded[n++] = uc;
+ if (lc != uc && lc != c && towupper (lc) == uc)
+ folded[n++] = lc;
+ for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++)
+ {
+ wint_t li = lonesome_lower[i];
+ if (li != lc && li != uc && li != c && towupper (li) == uc)
+ folded[n++] = li;
+ }
+ return n;
+}
diff --git a/src/localeinfo.h b/src/localeinfo.h
index 70b55a8..cf2f9a6 100644
--- a/src/localeinfo.h
+++ b/src/localeinfo.h
@@ -45,3 +45,10 @@ struct localeinfo
};
extern void init_localeinfo (struct localeinfo *);
+
+/* Maximum number of characters that can be the case-folded
+ counterparts of a single character, not counting the character
+ itself. This is a generous upper bound. */
+enum { CASE_FOLDED_BUFSIZE = 32 };
+
+extern int case_folded_counterparts (wchar_t, wchar_t[CASE_FOLDED_BUFSIZE]);
diff --git a/src/search.h b/src/search.h
index 534a49e..fb4e5c8 100644
--- a/src/search.h
+++ b/src/search.h
@@ -47,7 +47,7 @@ _GL_INLINE_HEADER_BEGIN
typedef signed char mb_len_map_t;
/* searchutils.c */
-extern void kwsinit (kwset_t *, bool);
+extern kwset_t kwsinit (bool);
extern ptrdiff_t mb_goback (char const **, char const *, char const *);
extern wint_t mb_prev_wc (char const *, char const *, char const *);
extern wint_t mb_next_wc (char const *, char const *);
diff --git a/src/searchutils.c b/src/searchutils.c
index 87f51a4..73d6c1c 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -24,42 +24,36 @@
#define NCHAR (UCHAR_MAX + 1)
-void
-kwsinit (kwset_t *kwset, bool mb_trans)
+kwset_t
+kwsinit (bool mb_trans)
{
static char trans[NCHAR];
- int i;
+ char *transptr = NULL;
if (match_icase && (MB_CUR_MAX == 1 || mb_trans))
{
if (MB_CUR_MAX == 1)
- for (i = 0; i < NCHAR; ++i)
+ for (int i = 0; i < NCHAR; i++)
trans[i] = toupper (i);
else
- for (i = 0; i < NCHAR; ++i)
+ for (int i = 0; i < NCHAR; i++)
{
wint_t wc = localeinfo.sbctowc[i];
wint_t uwc = towupper (wc);
if (uwc != wc)
{
- char s[MB_LEN_MAX];
mbstate_t mbs = { 0 };
- size_t len = wcrtomb (s, uwc, &mbs);
- if (len > 1)
+ size_t len = wcrtomb (&trans[i], uwc, &mbs);
+ if (len != 1)
abort ();
- trans[i] = s[0];
}
else
trans[i] = i;
}
-
- *kwset = kwsalloc (trans, false);
+ transptr = trans;
}
- else
- *kwset = kwsalloc (NULL, false);
- if (!*kwset)
- xalloc_die ();
+ return kwsalloc (transptr, false);
}
/* In the buffer *MB_START, return the number of bytes needed to go
--
2.7.4