[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
grep branch, master, updated. v3.3-27-g449f1c5
From: Jim Meyering
Subject: grep branch, master, updated. v3.3-27-g449f1c5
Date: Sun, 17 Nov 2019 10:16:00 -0500 (EST)
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "grep".
The branch, master has been updated
via 449f1c5805adba00ddd6edad30d96dbaeb8a91a3 (commit)
via cea97a849038754933dadce9db4ab9761b681c92 (commit)
from 0172bf6825710b510b05c56136aee2d5f8d400e4 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
http://git.savannah.gnu.org/cgit/grep.git/commit/?id=449f1c5805adba00ddd6edad30d96dbaeb8a91a3
commit 449f1c5805adba00ddd6edad30d96dbaeb8a91a3
Author: Norihiro Tanaka <address@hidden>
Date: Sun Nov 17 07:29:15 2019 +0900
grep: improve grep -Fw performance in non-UTF8 multibyte locales
* src/searchutils.c (mb_goback): New parameter. All callers changed.
* src/search.h (mb_goback): Update prototype.
* src/kwsearch.c (Fexecute): Use mb_goback's MBCLEN to detect a
word-boundary even more efficiently.
diff --git a/src/dfasearch.c b/src/dfasearch.c
index 3ebd25e..6c95d8c 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -279,7 +279,7 @@ EGexecute (void *vdc, char const *buf, size_t size, size_t
*match_size,
goto success;
if (mb_start < beg)
mb_start = beg;
- if (mb_goback (&mb_start, match, buflim) == 0)
+ if (mb_goback (&mb_start, NULL, match, buflim) == 0)
goto success;
/* The matched line starts in the middle of a multibyte
character. Perform the DFA search starting from the
diff --git a/src/kwsearch.c b/src/kwsearch.c
index f590d19..f121816 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -161,6 +161,7 @@ Fexecute (void *vcp, char const *buf, size_t size, size_t
*match_size,
bool longest;
struct kwsearch *kwsearch = vcp;
kwset_t kwset = kwsearch->kwset;
+ size_t mbclen;
if (match_lines)
mb_check = longest = false;
@@ -194,7 +195,9 @@ Fexecute (void *vcp, char const *buf, size_t size, size_t
*match_size,
return EGexecute (kwsearch->re, buf, size, match_size, start_ptr);
}
- if (mb_check && mb_goback (&mb_start, beg + offset, buf + size) != 0)
+ mbclen = 0;
+ if (mb_check
+ && mb_goback (&mb_start, &mbclen, beg + offset, buf + size) != 0)
{
/* We have matched a single byte that is not at the beginning of a
multibyte character. mb_goback has advanced MB_START past that
@@ -225,22 +228,19 @@ Fexecute (void *vcp, char const *buf, size_t size, size_t
*match_size,
/* We need a preceding mb_start pointer. Use the beginning of line
if there is a preceding newline. */
- if (mb_check)
+ if (mbclen == 0)
{
- char const *nl = memrchr (buf, eol, beg - buf);
- mb_start = nl ? nl + 1 : buf;
- }
- else
- {
- char const *nl = memrchr (mb_start, eol, beg - mb_start);
- if (nl)
- mb_start = nl + 1;
+ char const *nl = memrchr (mb_start, eol, beg - mb_start);
+ if (nl)
+ mb_start = nl + 1;
}
/* Succeed if neither the preceding nor the following character is a
word constituent. If the preceding is not, yet the following
character IS a word constituent, keep trying with shorter matches. */
- if (! wordchar_prev (mb_start, beg, buf + size))
+ if (mbclen > 0
+ ? ! wordchar_next (beg - mbclen, buf + size)
+ : ! wordchar_prev (mb_start, beg, buf + size))
for (;;)
{
if (! wordchar_next (beg + len, buf + size))
diff --git a/src/search.h b/src/search.h
index a782a0c..d6010b9 100644
--- a/src/search.h
+++ b/src/search.h
@@ -52,7 +52,8 @@ extern size_t wordchars_size (char const *, char const *)
_GL_ATTRIBUTE_PURE;
extern size_t wordchar_next (char const *, char const *) _GL_ATTRIBUTE_PURE;
extern size_t wordchar_prev (char const *, char const *, char const *)
_GL_ATTRIBUTE_PURE;
-extern ptrdiff_t mb_goback (char const **, char const *, char const *);
+extern ptrdiff_t mb_goback (char const **, size_t *, char const *,
+ char const *);
/* dfasearch.c */
extern void *GEAcompile (char *, size_t, reg_syntax_t);
diff --git a/src/searchutils.c b/src/searchutils.c
index 9bb35fd..d6a36f1 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -75,18 +75,21 @@ kwsinit (bool mb_trans)
back from CUR to the previous boundary, where a "boundary" is the
start of a multibyte character or is an error-encoding byte. The
buffer ends at END (i.e., one past the address of the buffer's last
- byte). If CUR is already at a boundary, return 0. If *MB_START is
- greater than CUR, return the negative value CUR - *MB_START.
+ byte). If CUR is already at a boundary, return 0. If CUR is no
+ larger than *MB_START, return CUR - *MB_START without modifying
+ *MB_START or *MBCLEN.
When returning zero, set *MB_START to CUR. When returning a
- positive value, set *MB_START to the next boundary after CUR, or to
- END if there is no such boundary. When returning a negative value,
- leave *MB_START alone. */
+ positive value, set *MB_START to the next boundary after CUR,
+ or to END if there is no such boundary, and set *MBCLEN to the
+ length of the preceding character. */
ptrdiff_t
-mb_goback (char const **mb_start, char const *cur, char const *end)
+mb_goback (char const **mb_start, size_t *mbclen, char const *cur,
+ char const *end)
{
const char *p = *mb_start;
const char *p0 = p;
+ size_t clen;
if (cur <= p)
return cur - p;
@@ -94,13 +97,14 @@ mb_goback (char const **mb_start, char const *cur, char
const *end)
if (localeinfo.using_utf8)
{
p = cur;
+ clen = 1;
if (cur < end && (*cur & 0xc0) == 0x80)
for (int i = 1; i <= 3; i++)
if ((cur[-i] & 0xc0) != 0x80)
{
mbstate_t mbs = { 0 };
- size_t clen = mb_clen (cur - i, end - (cur - i), &mbs);
+ clen = mb_clen (cur - i, end - (cur - i), &mbs);
if (i < clen && clen < (size_t) -2)
{
p0 = cur - i;
@@ -114,7 +118,7 @@ mb_goback (char const **mb_start, char const *cur, char
const *end)
mbstate_t mbs = { 0 };
do
{
- size_t clen = mb_clen (p, end - p, &mbs);
+ clen = mb_clen (p, end - p, &mbs);
if ((size_t) -2 <= clen)
{
@@ -130,6 +134,8 @@ mb_goback (char const **mb_start, char const *cur, char
const *end)
}
*mb_start = p;
+ if (mbclen)
+ *mbclen = clen;
return p == cur ? 0 : cur - p0;
}
@@ -192,6 +198,6 @@ wordchar_prev (char const *buf, char const *cur, char const
*end)
|| (localeinfo.using_utf8 && localeinfo.sbclen[b] != -2))
return sbwordchar[b];
char const *p = buf;
- cur -= mb_goback (&p, cur, end);
+ cur -= mb_goback (&p, NULL, cur, end);
return wordchar_next (cur, end);
}
http://git.savannah.gnu.org/cgit/grep.git/commit/?id=cea97a849038754933dadce9db4ab9761b681c92
commit 449f1c5805adba00ddd6edad30d96dbaeb8a91a3
Author: Norihiro Tanaka <address@hidden>
Date: Sun Nov 17 07:29:15 2019 +0900
grep: improve grep -Fw performance in non-UTF8 multibyte locales
* src/searchutils.c (mb_goback): New parameter. All callers changed.
* src/search.h (mb_goback): Update prototype.
* src/kwsearch.c (Fexecute): Use mb_goback's MBCLEN to detect a
word-boundary even more efficiently.
diff --git a/src/dfasearch.c b/src/dfasearch.c
index 3ebd25e..6c95d8c 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -279,7 +279,7 @@ EGexecute (void *vdc, char const *buf, size_t size, size_t
*match_size,
goto success;
if (mb_start < beg)
mb_start = beg;
- if (mb_goback (&mb_start, match, buflim) == 0)
+ if (mb_goback (&mb_start, NULL, match, buflim) == 0)
goto success;
/* The matched line starts in the middle of a multibyte
character. Perform the DFA search starting from the
diff --git a/src/kwsearch.c b/src/kwsearch.c
index f590d19..f121816 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -161,6 +161,7 @@ Fexecute (void *vcp, char const *buf, size_t size, size_t
*match_size,
bool longest;
struct kwsearch *kwsearch = vcp;
kwset_t kwset = kwsearch->kwset;
+ size_t mbclen;
if (match_lines)
mb_check = longest = false;
@@ -194,7 +195,9 @@ Fexecute (void *vcp, char const *buf, size_t size, size_t
*match_size,
return EGexecute (kwsearch->re, buf, size, match_size, start_ptr);
}
- if (mb_check && mb_goback (&mb_start, beg + offset, buf + size) != 0)
+ mbclen = 0;
+ if (mb_check
+ && mb_goback (&mb_start, &mbclen, beg + offset, buf + size) != 0)
{
/* We have matched a single byte that is not at the beginning of a
multibyte character. mb_goback has advanced MB_START past that
@@ -225,22 +228,19 @@ Fexecute (void *vcp, char const *buf, size_t size, size_t
*match_size,
/* We need a preceding mb_start pointer. Use the beginning of line
if there is a preceding newline. */
- if (mb_check)
+ if (mbclen == 0)
{
- char const *nl = memrchr (buf, eol, beg - buf);
- mb_start = nl ? nl + 1 : buf;
- }
- else
- {
- char const *nl = memrchr (mb_start, eol, beg - mb_start);
- if (nl)
- mb_start = nl + 1;
+ char const *nl = memrchr (mb_start, eol, beg - mb_start);
+ if (nl)
+ mb_start = nl + 1;
}
/* Succeed if neither the preceding nor the following character is a
word constituent. If the preceding is not, yet the following
character IS a word constituent, keep trying with shorter matches. */
- if (! wordchar_prev (mb_start, beg, buf + size))
+ if (mbclen > 0
+ ? ! wordchar_next (beg - mbclen, buf + size)
+ : ! wordchar_prev (mb_start, beg, buf + size))
for (;;)
{
if (! wordchar_next (beg + len, buf + size))
diff --git a/src/search.h b/src/search.h
index a782a0c..d6010b9 100644
--- a/src/search.h
+++ b/src/search.h
@@ -52,7 +52,8 @@ extern size_t wordchars_size (char const *, char const *)
_GL_ATTRIBUTE_PURE;
extern size_t wordchar_next (char const *, char const *) _GL_ATTRIBUTE_PURE;
extern size_t wordchar_prev (char const *, char const *, char const *)
_GL_ATTRIBUTE_PURE;
-extern ptrdiff_t mb_goback (char const **, char const *, char const *);
+extern ptrdiff_t mb_goback (char const **, size_t *, char const *,
+ char const *);
/* dfasearch.c */
extern void *GEAcompile (char *, size_t, reg_syntax_t);
diff --git a/src/searchutils.c b/src/searchutils.c
index 9bb35fd..d6a36f1 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -75,18 +75,21 @@ kwsinit (bool mb_trans)
back from CUR to the previous boundary, where a "boundary" is the
start of a multibyte character or is an error-encoding byte. The
buffer ends at END (i.e., one past the address of the buffer's last
- byte). If CUR is already at a boundary, return 0. If *MB_START is
- greater than CUR, return the negative value CUR - *MB_START.
+ byte). If CUR is already at a boundary, return 0. If CUR is no
+ larger than *MB_START, return CUR - *MB_START without modifying
+ *MB_START or *MBCLEN.
When returning zero, set *MB_START to CUR. When returning a
- positive value, set *MB_START to the next boundary after CUR, or to
- END if there is no such boundary. When returning a negative value,
- leave *MB_START alone. */
+ positive value, set *MB_START to the next boundary after CUR,
+ or to END if there is no such boundary, and set *MBCLEN to the
+ length of the preceding character. */
ptrdiff_t
-mb_goback (char const **mb_start, char const *cur, char const *end)
+mb_goback (char const **mb_start, size_t *mbclen, char const *cur,
+ char const *end)
{
const char *p = *mb_start;
const char *p0 = p;
+ size_t clen;
if (cur <= p)
return cur - p;
@@ -94,13 +97,14 @@ mb_goback (char const **mb_start, char const *cur, char
const *end)
if (localeinfo.using_utf8)
{
p = cur;
+ clen = 1;
if (cur < end && (*cur & 0xc0) == 0x80)
for (int i = 1; i <= 3; i++)
if ((cur[-i] & 0xc0) != 0x80)
{
mbstate_t mbs = { 0 };
- size_t clen = mb_clen (cur - i, end - (cur - i), &mbs);
+ clen = mb_clen (cur - i, end - (cur - i), &mbs);
if (i < clen && clen < (size_t) -2)
{
p0 = cur - i;
@@ -114,7 +118,7 @@ mb_goback (char const **mb_start, char const *cur, char
const *end)
mbstate_t mbs = { 0 };
do
{
- size_t clen = mb_clen (p, end - p, &mbs);
+ clen = mb_clen (p, end - p, &mbs);
if ((size_t) -2 <= clen)
{
@@ -130,6 +134,8 @@ mb_goback (char const **mb_start, char const *cur, char
const *end)
}
*mb_start = p;
+ if (mbclen)
+ *mbclen = clen;
return p == cur ? 0 : cur - p0;
}
@@ -192,6 +198,6 @@ wordchar_prev (char const *buf, char const *cur, char const
*end)
|| (localeinfo.using_utf8 && localeinfo.sbclen[b] != -2))
return sbwordchar[b];
char const *p = buf;
- cur -= mb_goback (&p, cur, end);
+ cur -= mb_goback (&p, NULL, cur, end);
return wordchar_next (cur, end);
}
-----------------------------------------------------------------------
Summary of changes:
src/dfasearch.c | 2 +-
src/kwsearch.c | 19 ++++++++++++++-----
src/search.h | 3 ++-
src/searchutils.c | 24 +++++++++++++++---------
4 files changed, 32 insertions(+), 16 deletions(-)
hooks/post-receive
--
grep
[Prev in Thread] | Current Thread | [Next in Thread]
- grep branch, master, updated. v3.3-27-g449f1c5, Jim Meyering <=