From 07a4f69da701abfdee047f26c603002c20d4c7d4 Mon Sep 17 00:00:00 2001 From: Jim Meyering Date: Wed, 19 Feb 2014 19:22:24 -0800 Subject: [PATCH 1/2] maint: factor out using_utf8 function for use in main.c * src/searchutils.c (is_mb_middle): Use using_utf8 rather than rolling our own. (using_utf8): New function (copy of the one in dfa.c). * src/search.h (using_utf8): Declare it. --- src/search.h | 2 ++ src/searchutils.c | 26 +++++++++++++++++++------- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/src/search.h b/src/search.h index 12d0822..167e0e7 100644 --- a/src/search.h +++ b/src/search.h @@ -80,4 +80,6 @@ mb_case_map_apply (mb_len_map_t const *map, size_t *off, size_t *len) } } +int using_utf8 (void); + #endif /* GREP_SEARCH_H */ diff --git a/src/searchutils.c b/src/searchutils.c index 3478417..51bba59 100644 --- a/src/searchutils.c +++ b/src/searchutils.c @@ -234,13 +234,8 @@ is_mb_middle (const char **good, const char *buf, const char *end, const char *p = *good; const char *prev = p; mbstate_t cur_state; -#if HAVE_LANGINFO_CODESET - static int is_utf8 = -1; - - if (is_utf8 == -1) - is_utf8 = STREQ (nl_langinfo (CODESET), "UTF-8"); - if (is_utf8 && buf - p > MB_CUR_MAX) + if (using_utf8 () && buf - p > MB_CUR_MAX) { for (p = buf; buf - p > MB_CUR_MAX; p--) if (mbclen_cache[to_uchar (*p)] != (size_t) -1) @@ -249,7 +244,6 @@ is_mb_middle (const char **good, const char *buf, const char *end, if (buf - p == MB_CUR_MAX) p = buf; } -#endif memset (&cur_state, 0, sizeof cur_state); @@ -283,3 +277,21 @@ is_mb_middle (const char **good, const char *buf, const char *end, return 0 < match_len && match_len < mbrlen (p, end - p, &cur_state); } #endif /* MBS_SUPPORT */ + +/* UTF-8 encoding allows some optimizations that we can't otherwise + assume in a multibyte encoding. */ +int +using_utf8 (void) +{ + static int utf8 = -1; + if (utf8 == -1) + { +#if defined HAVE_LANGINFO_CODESET && MBS_SUPPORT + utf8 = (STREQ (nl_langinfo (CODESET), "UTF-8")); +#else + utf8 = 0; +#endif + } + + return utf8; +} -- 1.9.0 From 6053c388d4f56fae2b639f566f2bd0f9830f0276 Mon Sep 17 00:00:00 2001 From: Jim Meyering Date: Wed, 19 Feb 2014 19:31:43 -0800 Subject: [PATCH 2/2] grep -i: avoid 200x perf. regression in multibyte non-UTF8 locales * src/main.c (trivial_case_ignore): Perform this optimization only for UTF8 locales. This rectifies a 200x performance regression in multi-byte non-UTF8 locales like ja_JP.eucJP. The regression was introduced by the 10x UTF8/grep-i speedup, commit v2.16-4-g97318f5. Reported by Norihiro Tanaka in http://debbugs.gnu.org/16232#50 * NEWS (Bug fixes): Mention it. --- NEWS | 5 +++++ src/main.c | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/NEWS b/NEWS index 6785a96..49a17b0 100644 --- a/NEWS +++ b/NEWS @@ -2,6 +2,11 @@ GNU grep NEWS -*- outline -*- * Noteworthy changes in release ?.? (????-??-??) [?] +** Bug fixes + + grep -i in a multibyte, non-UTF8 locale could be up to 200 times slower + than in 2.16. [bug introduced in grep-2.17] + * Noteworthy changes in release 2.17 (2014-02-17) [stable] diff --git a/src/main.c b/src/main.c index bd20297..ca7c7b3 100644 --- a/src/main.c +++ b/src/main.c @@ -1883,6 +1883,11 @@ static bool trivial_case_ignore (size_t len, char const *keys, size_t *new_len, char **new_keys) { + /* Perform this translation only for UTF-8. Otherwise, this would induce + a 100-200x performance penalty for non-UTF8 multibyte locales. */ + if ( ! using_utf8 ()) + return false; + /* FIXME: consider removing the following restriction: Reject if KEYS contain ASCII '\\' or '['. */ if (memchr (keys, '\\', len) || memchr (keys, '[', len)) -- 1.9.0