[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
grep branch, master, updated. v2.27-20-gfdf9fad
From: Paul Eggert
Subject: grep branch, master, updated. v2.27-20-gfdf9fad
Date: Sat, 24 Dec 2016 01:22:58 +0000 (UTC)
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "grep".
The branch, master has been updated
via fdf9fadd03160680e4d3edc886b1e31e424b535f (commit)
via d628657e87f4a7d7e242129b318a903d1ea26b74 (commit)
via 192d61e2828e13c4a2f1a81cd128721a229c88f9 (commit)
via 4fa1971d98c79b56b466eff57117351dc395ee2a (commit)
via 4dd5274d6a8519d08ede792baafb0f9415cf4f9f (commit)
via a01232744cb71fdde44a3c8b50dcbf34181e16a1 (commit)
via 3cec0670866aaffbd4506ad934029f21983c99b9 (commit)
from 290ca116c9172d97b2b026951fac722d3bd3ced9 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
http://git.savannah.gnu.org/cgit/grep.git/commit/?id=fdf9fadd03160680e4d3edc886b1e31e424b535f
commit fdf9fadd03160680e4d3edc886b1e31e424b535f
Author: Paul Eggert <address@hidden>
Date: Fri Dec 23 16:16:01 2016 -0800
grep: improve word checking with UTF-8
* src/searchutils.c: Do not include <verify.h>.
(word_start): Remove, replacing with ...
(sbwordchar): New static var. All uses changed.
(wordchar_prev): Return size_t, not bool, as this generates
slightly better code. Go back faster if UTF-8.
diff --git a/src/search.h b/src/search.h
index 1def4d6..b700ed5 100644
--- a/src/search.h
+++ b/src/search.h
@@ -50,7 +50,7 @@ extern void wordinit (void);
extern kwset_t kwsinit (bool);
extern size_t wordchars_size (char const *, char const *);
extern size_t wordchar_next (char const *, char const *);
-extern bool wordchar_prev (char const *, char const *, char const *);
+extern size_t wordchar_prev (char const *, char const *, char const *);
extern ptrdiff_t mb_goback (char const **, char const *, char const *);
/* dfasearch.c */
diff --git a/src/searchutils.c b/src/searchutils.c
index 6f6ae0b..3ba3cdb 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -22,14 +22,9 @@
#define SYSTEM_INLINE _GL_EXTERN_INLINE
#include "search.h"
-#include <verify.h>
-
-/* For each byte B, word_start[B] is 1 if B is a single-byte character
- that is a word constituent, 0 if B cannot start a word constituent,
- and -1 if B might be or might not be the start of a word
- constituent. */
-static wint_t word_start[NCHAR];
-verify (WEOF != 0 && WEOF != 1);
+/* For each byte B, sbwordchar[B] is true if B is a single-byte
+ character that is a word constituent, and is false otherwise. */
+static bool sbwordchar[NCHAR];
/* Whether -w considers WC to be a word constituent. */
static bool
@@ -42,8 +37,7 @@ void
wordinit (void)
{
for (int i = 0; i < NCHAR; i++)
- word_start[i] = (localeinfo.sbclen[i] == -2 ? WEOF
- : wordchar (localeinfo.sbctowc[i]));
+ sbwordchar[i] = wordchar (localeinfo.sbctowc[i]);
}
kwset_t
@@ -94,23 +88,46 @@ mb_goback (char const **mb_start, char const *cur, char
const *end)
{
const char *p = *mb_start;
const char *p0 = p;
- mbstate_t cur_state;
- memset (&cur_state, 0, sizeof cur_state);
+ if (cur <= p)
+ return cur - p;
- while (p < cur)
+ if (localeinfo.using_utf8)
{
- size_t clen = mb_clen (p, end - p, &cur_state);
-
- if ((size_t) -2 <= clen)
+ p = cur;
+
+ if (cur < end && (*cur & 0xc0) == 0x80)
+ for (int i = 1; i <= 3; i++)
+ if ((cur[-i] & 0xc0) != 0x80)
+ {
+ mbstate_t mbs = { 0 };
+ size_t clen = mb_clen (cur - i, end - (cur - i), &mbs);
+ if (i < clen && clen < (size_t) -2)
+ {
+ p0 = cur - i;
+ p = p0 + clen;
+ }
+ break;
+ }
+ }
+ else
+ {
+ mbstate_t mbs = { 0 };
+ do
{
- /* An invalid sequence, or a truncated multibyte character.
- Treat it as a single byte character. */
- clen = 1;
- memset (&cur_state, 0, sizeof cur_state);
+ size_t clen = mb_clen (p, end - p, &mbs);
+
+ if ((size_t) -2 <= clen)
+ {
+ /* An invalid sequence, or a truncated multibyte character.
+ Treat it as a single byte character. */
+ clen = 1;
+ memset (&mbs, 0, sizeof mbs);
+ }
+ p0 = p;
+ p += clen;
}
- p0 = p;
- p += clen;
+ while (p < cur);
}
*mb_start = p;
@@ -127,11 +144,11 @@ wordchars_count (char const *buf, char const *end, bool
countall)
mbstate_t mbs = { 0 };
while (n < end - buf)
{
- wint_t ws = word_start[to_uchar (buf[n])];
- if (ws == 0)
- break;
- else if (ws == 1)
+ unsigned char b = buf[n];
+ if (sbwordchar[b])
n++;
+ else if (localeinfo.sbclen[b] != -2)
+ break;
else
{
wchar_t wc = 0;
@@ -163,19 +180,19 @@ wordchar_next (char const *buf, char const *end)
return wordchars_count (buf, end, false);
}
-/* In the buffer BUF, return true if the character whose encoding
+/* In the buffer BUF, return nonzero if the character whose encoding
contains the byte before CUR is a word constituent. The buffer
ends at END. */
-bool
+size_t
wordchar_prev (char const *buf, char const *cur, char const *end)
{
if (buf == cur)
- return false;
- cur--;
- wint_t ws = word_start[to_uchar (*cur)];
- if (! localeinfo.multibyte)
- return ws == 1;
+ return 0;
+ unsigned char b = *--cur;
+ if (! localeinfo.multibyte
+ || (localeinfo.using_utf8 && localeinfo.sbclen[b] != -2))
+ return sbwordchar[b];
char const *p = buf;
cur -= mb_goback (&p, cur, end);
- return wordchar_next (cur, end) != 0;
+ return wordchar_next (cur, end);
}
http://git.savannah.gnu.org/cgit/grep.git/commit/?id=d628657e87f4a7d7e242129b318a903d1ea26b74
commit d628657e87f4a7d7e242129b318a903d1ea26b74
Author: Paul Eggert <address@hidden>
Date: Fri Dec 23 16:16:01 2016 -0800
grep: improve word checking with UTF-8
* src/searchutils.c: Do not include <verify.h>.
(word_start): Remove, replacing with ...
(sbwordchar): New static var. All uses changed.
(wordchar_prev): Return size_t, not bool, as this generates
slightly better code. Go back faster if UTF-8.
diff --git a/src/search.h b/src/search.h
index 1def4d6..b700ed5 100644
--- a/src/search.h
+++ b/src/search.h
@@ -50,7 +50,7 @@ extern void wordinit (void);
extern kwset_t kwsinit (bool);
extern size_t wordchars_size (char const *, char const *);
extern size_t wordchar_next (char const *, char const *);
-extern bool wordchar_prev (char const *, char const *, char const *);
+extern size_t wordchar_prev (char const *, char const *, char const *);
extern ptrdiff_t mb_goback (char const **, char const *, char const *);
/* dfasearch.c */
diff --git a/src/searchutils.c b/src/searchutils.c
index 6f6ae0b..3ba3cdb 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -22,14 +22,9 @@
#define SYSTEM_INLINE _GL_EXTERN_INLINE
#include "search.h"
-#include <verify.h>
-
-/* For each byte B, word_start[B] is 1 if B is a single-byte character
- that is a word constituent, 0 if B cannot start a word constituent,
- and -1 if B might be or might not be the start of a word
- constituent. */
-static wint_t word_start[NCHAR];
-verify (WEOF != 0 && WEOF != 1);
+/* For each byte B, sbwordchar[B] is true if B is a single-byte
+ character that is a word constituent, and is false otherwise. */
+static bool sbwordchar[NCHAR];
/* Whether -w considers WC to be a word constituent. */
static bool
@@ -42,8 +37,7 @@ void
wordinit (void)
{
for (int i = 0; i < NCHAR; i++)
- word_start[i] = (localeinfo.sbclen[i] == -2 ? WEOF
- : wordchar (localeinfo.sbctowc[i]));
+ sbwordchar[i] = wordchar (localeinfo.sbctowc[i]);
}
kwset_t
@@ -94,23 +88,46 @@ mb_goback (char const **mb_start, char const *cur, char
const *end)
{
const char *p = *mb_start;
const char *p0 = p;
- mbstate_t cur_state;
- memset (&cur_state, 0, sizeof cur_state);
+ if (cur <= p)
+ return cur - p;
- while (p < cur)
+ if (localeinfo.using_utf8)
{
- size_t clen = mb_clen (p, end - p, &cur_state);
-
- if ((size_t) -2 <= clen)
+ p = cur;
+
+ if (cur < end && (*cur & 0xc0) == 0x80)
+ for (int i = 1; i <= 3; i++)
+ if ((cur[-i] & 0xc0) != 0x80)
+ {
+ mbstate_t mbs = { 0 };
+ size_t clen = mb_clen (cur - i, end - (cur - i), &mbs);
+ if (i < clen && clen < (size_t) -2)
+ {
+ p0 = cur - i;
+ p = p0 + clen;
+ }
+ break;
+ }
+ }
+ else
+ {
+ mbstate_t mbs = { 0 };
+ do
{
- /* An invalid sequence, or a truncated multibyte character.
- Treat it as a single byte character. */
- clen = 1;
- memset (&cur_state, 0, sizeof cur_state);
+ size_t clen = mb_clen (p, end - p, &mbs);
+
+ if ((size_t) -2 <= clen)
+ {
+ /* An invalid sequence, or a truncated multibyte character.
+ Treat it as a single byte character. */
+ clen = 1;
+ memset (&mbs, 0, sizeof mbs);
+ }
+ p0 = p;
+ p += clen;
}
- p0 = p;
- p += clen;
+ while (p < cur);
}
*mb_start = p;
@@ -127,11 +144,11 @@ wordchars_count (char const *buf, char const *end, bool
countall)
mbstate_t mbs = { 0 };
while (n < end - buf)
{
- wint_t ws = word_start[to_uchar (buf[n])];
- if (ws == 0)
- break;
- else if (ws == 1)
+ unsigned char b = buf[n];
+ if (sbwordchar[b])
n++;
+ else if (localeinfo.sbclen[b] != -2)
+ break;
else
{
wchar_t wc = 0;
@@ -163,19 +180,19 @@ wordchar_next (char const *buf, char const *end)
return wordchars_count (buf, end, false);
}
-/* In the buffer BUF, return true if the character whose encoding
+/* In the buffer BUF, return nonzero if the character whose encoding
contains the byte before CUR is a word constituent. The buffer
ends at END. */
-bool
+size_t
wordchar_prev (char const *buf, char const *cur, char const *end)
{
if (buf == cur)
- return false;
- cur--;
- wint_t ws = word_start[to_uchar (*cur)];
- if (! localeinfo.multibyte)
- return ws == 1;
+ return 0;
+ unsigned char b = *--cur;
+ if (! localeinfo.multibyte
+ || (localeinfo.using_utf8 && localeinfo.sbclen[b] != -2))
+ return sbwordchar[b];
char const *p = buf;
cur -= mb_goback (&p, cur, end);
- return wordchar_next (cur, end) != 0;
+ return wordchar_next (cur, end);
}
http://git.savannah.gnu.org/cgit/grep.git/commit/?id=192d61e2828e13c4a2f1a81cd128721a229c88f9
commit 192d61e2828e13c4a2f1a81cd128721a229c88f9
Author: Paul Eggert <address@hidden>
Date: Fri Dec 23 16:16:01 2016 -0800
grep: improve word checking with UTF-8
* src/searchutils.c: Do not include <verify.h>.
(word_start): Remove, replacing with ...
(sbwordchar): New static var. All uses changed.
(wordchar_prev): Return size_t, not bool, as this generates
slightly better code. Go back faster if UTF-8.
diff --git a/src/search.h b/src/search.h
index 1def4d6..b700ed5 100644
--- a/src/search.h
+++ b/src/search.h
@@ -50,7 +50,7 @@ extern void wordinit (void);
extern kwset_t kwsinit (bool);
extern size_t wordchars_size (char const *, char const *);
extern size_t wordchar_next (char const *, char const *);
-extern bool wordchar_prev (char const *, char const *, char const *);
+extern size_t wordchar_prev (char const *, char const *, char const *);
extern ptrdiff_t mb_goback (char const **, char const *, char const *);
/* dfasearch.c */
diff --git a/src/searchutils.c b/src/searchutils.c
index 6f6ae0b..3ba3cdb 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -22,14 +22,9 @@
#define SYSTEM_INLINE _GL_EXTERN_INLINE
#include "search.h"
-#include <verify.h>
-
-/* For each byte B, word_start[B] is 1 if B is a single-byte character
- that is a word constituent, 0 if B cannot start a word constituent,
- and -1 if B might be or might not be the start of a word
- constituent. */
-static wint_t word_start[NCHAR];
-verify (WEOF != 0 && WEOF != 1);
+/* For each byte B, sbwordchar[B] is true if B is a single-byte
+ character that is a word constituent, and is false otherwise. */
+static bool sbwordchar[NCHAR];
/* Whether -w considers WC to be a word constituent. */
static bool
@@ -42,8 +37,7 @@ void
wordinit (void)
{
for (int i = 0; i < NCHAR; i++)
- word_start[i] = (localeinfo.sbclen[i] == -2 ? WEOF
- : wordchar (localeinfo.sbctowc[i]));
+ sbwordchar[i] = wordchar (localeinfo.sbctowc[i]);
}
kwset_t
@@ -94,23 +88,46 @@ mb_goback (char const **mb_start, char const *cur, char
const *end)
{
const char *p = *mb_start;
const char *p0 = p;
- mbstate_t cur_state;
- memset (&cur_state, 0, sizeof cur_state);
+ if (cur <= p)
+ return cur - p;
- while (p < cur)
+ if (localeinfo.using_utf8)
{
- size_t clen = mb_clen (p, end - p, &cur_state);
-
- if ((size_t) -2 <= clen)
+ p = cur;
+
+ if (cur < end && (*cur & 0xc0) == 0x80)
+ for (int i = 1; i <= 3; i++)
+ if ((cur[-i] & 0xc0) != 0x80)
+ {
+ mbstate_t mbs = { 0 };
+ size_t clen = mb_clen (cur - i, end - (cur - i), &mbs);
+ if (i < clen && clen < (size_t) -2)
+ {
+ p0 = cur - i;
+ p = p0 + clen;
+ }
+ break;
+ }
+ }
+ else
+ {
+ mbstate_t mbs = { 0 };
+ do
{
- /* An invalid sequence, or a truncated multibyte character.
- Treat it as a single byte character. */
- clen = 1;
- memset (&cur_state, 0, sizeof cur_state);
+ size_t clen = mb_clen (p, end - p, &mbs);
+
+ if ((size_t) -2 <= clen)
+ {
+ /* An invalid sequence, or a truncated multibyte character.
+ Treat it as a single byte character. */
+ clen = 1;
+ memset (&mbs, 0, sizeof mbs);
+ }
+ p0 = p;
+ p += clen;
}
- p0 = p;
- p += clen;
+ while (p < cur);
}
*mb_start = p;
@@ -127,11 +144,11 @@ wordchars_count (char const *buf, char const *end, bool
countall)
mbstate_t mbs = { 0 };
while (n < end - buf)
{
- wint_t ws = word_start[to_uchar (buf[n])];
- if (ws == 0)
- break;
- else if (ws == 1)
+ unsigned char b = buf[n];
+ if (sbwordchar[b])
n++;
+ else if (localeinfo.sbclen[b] != -2)
+ break;
else
{
wchar_t wc = 0;
@@ -163,19 +180,19 @@ wordchar_next (char const *buf, char const *end)
return wordchars_count (buf, end, false);
}
-/* In the buffer BUF, return true if the character whose encoding
+/* In the buffer BUF, return nonzero if the character whose encoding
contains the byte before CUR is a word constituent. The buffer
ends at END. */
-bool
+size_t
wordchar_prev (char const *buf, char const *cur, char const *end)
{
if (buf == cur)
- return false;
- cur--;
- wint_t ws = word_start[to_uchar (*cur)];
- if (! localeinfo.multibyte)
- return ws == 1;
+ return 0;
+ unsigned char b = *--cur;
+ if (! localeinfo.multibyte
+ || (localeinfo.using_utf8 && localeinfo.sbclen[b] != -2))
+ return sbwordchar[b];
char const *p = buf;
cur -= mb_goback (&p, cur, end);
- return wordchar_next (cur, end) != 0;
+ return wordchar_next (cur, end);
}
http://git.savannah.gnu.org/cgit/grep.git/commit/?id=4fa1971d98c79b56b466eff57117351dc395ee2a
commit 4fa1971d98c79b56b466eff57117351dc395ee2a
Author: Paul Eggert <address@hidden>
Date: Fri Dec 23 16:16:01 2016 -0800
grep: improve word checking with UTF-8
* src/searchutils.c: Do not include <verify.h>.
(word_start): Remove, replacing with ...
(sbwordchar): New static var. All uses changed.
(wordchar_prev): Return size_t, not bool, as this generates
slightly better code. Go back faster if UTF-8.
diff --git a/src/search.h b/src/search.h
index 1def4d6..b700ed5 100644
--- a/src/search.h
+++ b/src/search.h
@@ -50,7 +50,7 @@ extern void wordinit (void);
extern kwset_t kwsinit (bool);
extern size_t wordchars_size (char const *, char const *);
extern size_t wordchar_next (char const *, char const *);
-extern bool wordchar_prev (char const *, char const *, char const *);
+extern size_t wordchar_prev (char const *, char const *, char const *);
extern ptrdiff_t mb_goback (char const **, char const *, char const *);
/* dfasearch.c */
diff --git a/src/searchutils.c b/src/searchutils.c
index 6f6ae0b..3ba3cdb 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -22,14 +22,9 @@
#define SYSTEM_INLINE _GL_EXTERN_INLINE
#include "search.h"
-#include <verify.h>
-
-/* For each byte B, word_start[B] is 1 if B is a single-byte character
- that is a word constituent, 0 if B cannot start a word constituent,
- and -1 if B might be or might not be the start of a word
- constituent. */
-static wint_t word_start[NCHAR];
-verify (WEOF != 0 && WEOF != 1);
+/* For each byte B, sbwordchar[B] is true if B is a single-byte
+ character that is a word constituent, and is false otherwise. */
+static bool sbwordchar[NCHAR];
/* Whether -w considers WC to be a word constituent. */
static bool
@@ -42,8 +37,7 @@ void
wordinit (void)
{
for (int i = 0; i < NCHAR; i++)
- word_start[i] = (localeinfo.sbclen[i] == -2 ? WEOF
- : wordchar (localeinfo.sbctowc[i]));
+ sbwordchar[i] = wordchar (localeinfo.sbctowc[i]);
}
kwset_t
@@ -94,23 +88,46 @@ mb_goback (char const **mb_start, char const *cur, char
const *end)
{
const char *p = *mb_start;
const char *p0 = p;
- mbstate_t cur_state;
- memset (&cur_state, 0, sizeof cur_state);
+ if (cur <= p)
+ return cur - p;
- while (p < cur)
+ if (localeinfo.using_utf8)
{
- size_t clen = mb_clen (p, end - p, &cur_state);
-
- if ((size_t) -2 <= clen)
+ p = cur;
+
+ if (cur < end && (*cur & 0xc0) == 0x80)
+ for (int i = 1; i <= 3; i++)
+ if ((cur[-i] & 0xc0) != 0x80)
+ {
+ mbstate_t mbs = { 0 };
+ size_t clen = mb_clen (cur - i, end - (cur - i), &mbs);
+ if (i < clen && clen < (size_t) -2)
+ {
+ p0 = cur - i;
+ p = p0 + clen;
+ }
+ break;
+ }
+ }
+ else
+ {
+ mbstate_t mbs = { 0 };
+ do
{
- /* An invalid sequence, or a truncated multibyte character.
- Treat it as a single byte character. */
- clen = 1;
- memset (&cur_state, 0, sizeof cur_state);
+ size_t clen = mb_clen (p, end - p, &mbs);
+
+ if ((size_t) -2 <= clen)
+ {
+ /* An invalid sequence, or a truncated multibyte character.
+ Treat it as a single byte character. */
+ clen = 1;
+ memset (&mbs, 0, sizeof mbs);
+ }
+ p0 = p;
+ p += clen;
}
- p0 = p;
- p += clen;
+ while (p < cur);
}
*mb_start = p;
@@ -127,11 +144,11 @@ wordchars_count (char const *buf, char const *end, bool
countall)
mbstate_t mbs = { 0 };
while (n < end - buf)
{
- wint_t ws = word_start[to_uchar (buf[n])];
- if (ws == 0)
- break;
- else if (ws == 1)
+ unsigned char b = buf[n];
+ if (sbwordchar[b])
n++;
+ else if (localeinfo.sbclen[b] != -2)
+ break;
else
{
wchar_t wc = 0;
@@ -163,19 +180,19 @@ wordchar_next (char const *buf, char const *end)
return wordchars_count (buf, end, false);
}
-/* In the buffer BUF, return true if the character whose encoding
+/* In the buffer BUF, return nonzero if the character whose encoding
contains the byte before CUR is a word constituent. The buffer
ends at END. */
-bool
+size_t
wordchar_prev (char const *buf, char const *cur, char const *end)
{
if (buf == cur)
- return false;
- cur--;
- wint_t ws = word_start[to_uchar (*cur)];
- if (! localeinfo.multibyte)
- return ws == 1;
+ return 0;
+ unsigned char b = *--cur;
+ if (! localeinfo.multibyte
+ || (localeinfo.using_utf8 && localeinfo.sbclen[b] != -2))
+ return sbwordchar[b];
char const *p = buf;
cur -= mb_goback (&p, cur, end);
- return wordchar_next (cur, end) != 0;
+ return wordchar_next (cur, end);
}
http://git.savannah.gnu.org/cgit/grep.git/commit/?id=4dd5274d6a8519d08ede792baafb0f9415cf4f9f
commit 4dd5274d6a8519d08ede792baafb0f9415cf4f9f
Author: Paul Eggert <address@hidden>
Date: Fri Dec 23 16:16:01 2016 -0800
grep: improve word checking with UTF-8
* src/searchutils.c: Do not include <verify.h>.
(word_start): Remove, replacing with ...
(sbwordchar): New static var. All uses changed.
(wordchar_prev): Return size_t, not bool, as this generates
slightly better code. Go back faster if UTF-8.
diff --git a/src/search.h b/src/search.h
index 1def4d6..b700ed5 100644
--- a/src/search.h
+++ b/src/search.h
@@ -50,7 +50,7 @@ extern void wordinit (void);
extern kwset_t kwsinit (bool);
extern size_t wordchars_size (char const *, char const *);
extern size_t wordchar_next (char const *, char const *);
-extern bool wordchar_prev (char const *, char const *, char const *);
+extern size_t wordchar_prev (char const *, char const *, char const *);
extern ptrdiff_t mb_goback (char const **, char const *, char const *);
/* dfasearch.c */
diff --git a/src/searchutils.c b/src/searchutils.c
index 6f6ae0b..3ba3cdb 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -22,14 +22,9 @@
#define SYSTEM_INLINE _GL_EXTERN_INLINE
#include "search.h"
-#include <verify.h>
-
-/* For each byte B, word_start[B] is 1 if B is a single-byte character
- that is a word constituent, 0 if B cannot start a word constituent,
- and -1 if B might be or might not be the start of a word
- constituent. */
-static wint_t word_start[NCHAR];
-verify (WEOF != 0 && WEOF != 1);
+/* For each byte B, sbwordchar[B] is true if B is a single-byte
+ character that is a word constituent, and is false otherwise. */
+static bool sbwordchar[NCHAR];
/* Whether -w considers WC to be a word constituent. */
static bool
@@ -42,8 +37,7 @@ void
wordinit (void)
{
for (int i = 0; i < NCHAR; i++)
- word_start[i] = (localeinfo.sbclen[i] == -2 ? WEOF
- : wordchar (localeinfo.sbctowc[i]));
+ sbwordchar[i] = wordchar (localeinfo.sbctowc[i]);
}
kwset_t
@@ -94,23 +88,46 @@ mb_goback (char const **mb_start, char const *cur, char
const *end)
{
const char *p = *mb_start;
const char *p0 = p;
- mbstate_t cur_state;
- memset (&cur_state, 0, sizeof cur_state);
+ if (cur <= p)
+ return cur - p;
- while (p < cur)
+ if (localeinfo.using_utf8)
{
- size_t clen = mb_clen (p, end - p, &cur_state);
-
- if ((size_t) -2 <= clen)
+ p = cur;
+
+ if (cur < end && (*cur & 0xc0) == 0x80)
+ for (int i = 1; i <= 3; i++)
+ if ((cur[-i] & 0xc0) != 0x80)
+ {
+ mbstate_t mbs = { 0 };
+ size_t clen = mb_clen (cur - i, end - (cur - i), &mbs);
+ if (i < clen && clen < (size_t) -2)
+ {
+ p0 = cur - i;
+ p = p0 + clen;
+ }
+ break;
+ }
+ }
+ else
+ {
+ mbstate_t mbs = { 0 };
+ do
{
- /* An invalid sequence, or a truncated multibyte character.
- Treat it as a single byte character. */
- clen = 1;
- memset (&cur_state, 0, sizeof cur_state);
+ size_t clen = mb_clen (p, end - p, &mbs);
+
+ if ((size_t) -2 <= clen)
+ {
+ /* An invalid sequence, or a truncated multibyte character.
+ Treat it as a single byte character. */
+ clen = 1;
+ memset (&mbs, 0, sizeof mbs);
+ }
+ p0 = p;
+ p += clen;
}
- p0 = p;
- p += clen;
+ while (p < cur);
}
*mb_start = p;
@@ -127,11 +144,11 @@ wordchars_count (char const *buf, char const *end, bool
countall)
mbstate_t mbs = { 0 };
while (n < end - buf)
{
- wint_t ws = word_start[to_uchar (buf[n])];
- if (ws == 0)
- break;
- else if (ws == 1)
+ unsigned char b = buf[n];
+ if (sbwordchar[b])
n++;
+ else if (localeinfo.sbclen[b] != -2)
+ break;
else
{
wchar_t wc = 0;
@@ -163,19 +180,19 @@ wordchar_next (char const *buf, char const *end)
return wordchars_count (buf, end, false);
}
-/* In the buffer BUF, return true if the character whose encoding
+/* In the buffer BUF, return nonzero if the character whose encoding
contains the byte before CUR is a word constituent. The buffer
ends at END. */
-bool
+size_t
wordchar_prev (char const *buf, char const *cur, char const *end)
{
if (buf == cur)
- return false;
- cur--;
- wint_t ws = word_start[to_uchar (*cur)];
- if (! localeinfo.multibyte)
- return ws == 1;
+ return 0;
+ unsigned char b = *--cur;
+ if (! localeinfo.multibyte
+ || (localeinfo.using_utf8 && localeinfo.sbclen[b] != -2))
+ return sbwordchar[b];
char const *p = buf;
cur -= mb_goback (&p, cur, end);
- return wordchar_next (cur, end) != 0;
+ return wordchar_next (cur, end);
}
http://git.savannah.gnu.org/cgit/grep.git/commit/?id=a01232744cb71fdde44a3c8b50dcbf34181e16a1
commit a01232744cb71fdde44a3c8b50dcbf34181e16a1
Author: Paul Eggert <address@hidden>
Date: Fri Dec 23 16:16:01 2016 -0800
grep: improve word checking with UTF-8
* src/searchutils.c: Do not include <verify.h>.
(word_start): Remove, replacing with ...
(sbwordchar): New static var. All uses changed.
(wordchar_prev): Return size_t, not bool, as this generates
slightly better code. Go back faster if UTF-8.
diff --git a/src/search.h b/src/search.h
index 1def4d6..b700ed5 100644
--- a/src/search.h
+++ b/src/search.h
@@ -50,7 +50,7 @@ extern void wordinit (void);
extern kwset_t kwsinit (bool);
extern size_t wordchars_size (char const *, char const *);
extern size_t wordchar_next (char const *, char const *);
-extern bool wordchar_prev (char const *, char const *, char const *);
+extern size_t wordchar_prev (char const *, char const *, char const *);
extern ptrdiff_t mb_goback (char const **, char const *, char const *);
/* dfasearch.c */
diff --git a/src/searchutils.c b/src/searchutils.c
index 6f6ae0b..3ba3cdb 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -22,14 +22,9 @@
#define SYSTEM_INLINE _GL_EXTERN_INLINE
#include "search.h"
-#include <verify.h>
-
-/* For each byte B, word_start[B] is 1 if B is a single-byte character
- that is a word constituent, 0 if B cannot start a word constituent,
- and -1 if B might be or might not be the start of a word
- constituent. */
-static wint_t word_start[NCHAR];
-verify (WEOF != 0 && WEOF != 1);
+/* For each byte B, sbwordchar[B] is true if B is a single-byte
+ character that is a word constituent, and is false otherwise. */
+static bool sbwordchar[NCHAR];
/* Whether -w considers WC to be a word constituent. */
static bool
@@ -42,8 +37,7 @@ void
wordinit (void)
{
for (int i = 0; i < NCHAR; i++)
- word_start[i] = (localeinfo.sbclen[i] == -2 ? WEOF
- : wordchar (localeinfo.sbctowc[i]));
+ sbwordchar[i] = wordchar (localeinfo.sbctowc[i]);
}
kwset_t
@@ -94,23 +88,46 @@ mb_goback (char const **mb_start, char const *cur, char
const *end)
{
const char *p = *mb_start;
const char *p0 = p;
- mbstate_t cur_state;
- memset (&cur_state, 0, sizeof cur_state);
+ if (cur <= p)
+ return cur - p;
- while (p < cur)
+ if (localeinfo.using_utf8)
{
- size_t clen = mb_clen (p, end - p, &cur_state);
-
- if ((size_t) -2 <= clen)
+ p = cur;
+
+ if (cur < end && (*cur & 0xc0) == 0x80)
+ for (int i = 1; i <= 3; i++)
+ if ((cur[-i] & 0xc0) != 0x80)
+ {
+ mbstate_t mbs = { 0 };
+ size_t clen = mb_clen (cur - i, end - (cur - i), &mbs);
+ if (i < clen && clen < (size_t) -2)
+ {
+ p0 = cur - i;
+ p = p0 + clen;
+ }
+ break;
+ }
+ }
+ else
+ {
+ mbstate_t mbs = { 0 };
+ do
{
- /* An invalid sequence, or a truncated multibyte character.
- Treat it as a single byte character. */
- clen = 1;
- memset (&cur_state, 0, sizeof cur_state);
+ size_t clen = mb_clen (p, end - p, &mbs);
+
+ if ((size_t) -2 <= clen)
+ {
+ /* An invalid sequence, or a truncated multibyte character.
+ Treat it as a single byte character. */
+ clen = 1;
+ memset (&mbs, 0, sizeof mbs);
+ }
+ p0 = p;
+ p += clen;
}
- p0 = p;
- p += clen;
+ while (p < cur);
}
*mb_start = p;
@@ -127,11 +144,11 @@ wordchars_count (char const *buf, char const *end, bool
countall)
mbstate_t mbs = { 0 };
while (n < end - buf)
{
- wint_t ws = word_start[to_uchar (buf[n])];
- if (ws == 0)
- break;
- else if (ws == 1)
+ unsigned char b = buf[n];
+ if (sbwordchar[b])
n++;
+ else if (localeinfo.sbclen[b] != -2)
+ break;
else
{
wchar_t wc = 0;
@@ -163,19 +180,19 @@ wordchar_next (char const *buf, char const *end)
return wordchars_count (buf, end, false);
}
-/* In the buffer BUF, return true if the character whose encoding
+/* In the buffer BUF, return nonzero if the character whose encoding
contains the byte before CUR is a word constituent. The buffer
ends at END. */
-bool
+size_t
wordchar_prev (char const *buf, char const *cur, char const *end)
{
if (buf == cur)
- return false;
- cur--;
- wint_t ws = word_start[to_uchar (*cur)];
- if (! localeinfo.multibyte)
- return ws == 1;
+ return 0;
+ unsigned char b = *--cur;
+ if (! localeinfo.multibyte
+ || (localeinfo.using_utf8 && localeinfo.sbclen[b] != -2))
+ return sbwordchar[b];
char const *p = buf;
cur -= mb_goback (&p, cur, end);
- return wordchar_next (cur, end) != 0;
+ return wordchar_next (cur, end);
}
http://git.savannah.gnu.org/cgit/grep.git/commit/?id=3cec0670866aaffbd4506ad934029f21983c99b9
commit 3cec0670866aaffbd4506ad934029f21983c99b9
Author: Paul Eggert <address@hidden>
Date: Fri Dec 23 16:16:01 2016 -0800
grep: improve word checking with UTF-8
* src/searchutils.c: Do not include <verify.h>.
(word_start): Remove, replacing with ...
(sbwordchar): New static var. All uses changed.
(wordchar_prev): Return size_t, not bool, as this generates
slightly better code. Go back faster if UTF-8.
diff --git a/src/search.h b/src/search.h
index 1def4d6..b700ed5 100644
--- a/src/search.h
+++ b/src/search.h
@@ -50,7 +50,7 @@ extern void wordinit (void);
extern kwset_t kwsinit (bool);
extern size_t wordchars_size (char const *, char const *);
extern size_t wordchar_next (char const *, char const *);
-extern bool wordchar_prev (char const *, char const *, char const *);
+extern size_t wordchar_prev (char const *, char const *, char const *);
extern ptrdiff_t mb_goback (char const **, char const *, char const *);
/* dfasearch.c */
diff --git a/src/searchutils.c b/src/searchutils.c
index 6f6ae0b..3ba3cdb 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -22,14 +22,9 @@
#define SYSTEM_INLINE _GL_EXTERN_INLINE
#include "search.h"
-#include <verify.h>
-
-/* For each byte B, word_start[B] is 1 if B is a single-byte character
- that is a word constituent, 0 if B cannot start a word constituent,
- and -1 if B might be or might not be the start of a word
- constituent. */
-static wint_t word_start[NCHAR];
-verify (WEOF != 0 && WEOF != 1);
+/* For each byte B, sbwordchar[B] is true if B is a single-byte
+ character that is a word constituent, and is false otherwise. */
+static bool sbwordchar[NCHAR];
/* Whether -w considers WC to be a word constituent. */
static bool
@@ -42,8 +37,7 @@ void
wordinit (void)
{
for (int i = 0; i < NCHAR; i++)
- word_start[i] = (localeinfo.sbclen[i] == -2 ? WEOF
- : wordchar (localeinfo.sbctowc[i]));
+ sbwordchar[i] = wordchar (localeinfo.sbctowc[i]);
}
kwset_t
@@ -94,23 +88,46 @@ mb_goback (char const **mb_start, char const *cur, char
const *end)
{
const char *p = *mb_start;
const char *p0 = p;
- mbstate_t cur_state;
- memset (&cur_state, 0, sizeof cur_state);
+ if (cur <= p)
+ return cur - p;
- while (p < cur)
+ if (localeinfo.using_utf8)
{
- size_t clen = mb_clen (p, end - p, &cur_state);
-
- if ((size_t) -2 <= clen)
+ p = cur;
+
+ if (cur < end && (*cur & 0xc0) == 0x80)
+ for (int i = 1; i <= 3; i++)
+ if ((cur[-i] & 0xc0) != 0x80)
+ {
+ mbstate_t mbs = { 0 };
+ size_t clen = mb_clen (cur - i, end - (cur - i), &mbs);
+ if (i < clen && clen < (size_t) -2)
+ {
+ p0 = cur - i;
+ p = p0 + clen;
+ }
+ break;
+ }
+ }
+ else
+ {
+ mbstate_t mbs = { 0 };
+ do
{
- /* An invalid sequence, or a truncated multibyte character.
- Treat it as a single byte character. */
- clen = 1;
- memset (&cur_state, 0, sizeof cur_state);
+ size_t clen = mb_clen (p, end - p, &mbs);
+
+ if ((size_t) -2 <= clen)
+ {
+ /* An invalid sequence, or a truncated multibyte character.
+ Treat it as a single byte character. */
+ clen = 1;
+ memset (&mbs, 0, sizeof mbs);
+ }
+ p0 = p;
+ p += clen;
}
- p0 = p;
- p += clen;
+ while (p < cur);
}
*mb_start = p;
@@ -127,11 +144,11 @@ wordchars_count (char const *buf, char const *end, bool
countall)
mbstate_t mbs = { 0 };
while (n < end - buf)
{
- wint_t ws = word_start[to_uchar (buf[n])];
- if (ws == 0)
- break;
- else if (ws == 1)
+ unsigned char b = buf[n];
+ if (sbwordchar[b])
n++;
+ else if (localeinfo.sbclen[b] != -2)
+ break;
else
{
wchar_t wc = 0;
@@ -163,19 +180,19 @@ wordchar_next (char const *buf, char const *end)
return wordchars_count (buf, end, false);
}
-/* In the buffer BUF, return true if the character whose encoding
+/* In the buffer BUF, return nonzero if the character whose encoding
contains the byte before CUR is a word constituent. The buffer
ends at END. */
-bool
+size_t
wordchar_prev (char const *buf, char const *cur, char const *end)
{
if (buf == cur)
- return false;
- cur--;
- wint_t ws = word_start[to_uchar (*cur)];
- if (! localeinfo.multibyte)
- return ws == 1;
+ return 0;
+ unsigned char b = *--cur;
+ if (! localeinfo.multibyte
+ || (localeinfo.using_utf8 && localeinfo.sbclen[b] != -2))
+ return sbwordchar[b];
char const *p = buf;
cur -= mb_goback (&p, cur, end);
- return wordchar_next (cur, end) != 0;
+ return wordchar_next (cur, end);
}
-----------------------------------------------------------------------
Summary of changes:
src/dfasearch.c | 13 ++---
src/grep.c | 11 +++--
src/kwsearch.c | 64 ++++++++++++------------
src/kwset.c | 95 +++++++++++++++++-------------------
src/pcresearch.c | 2 +-
src/search.h | 6 ++-
src/searchutils.c | 141 +++++++++++++++++++++++++++++++++++++++++------------
src/system.h | 1 +
8 files changed, 201 insertions(+), 132 deletions(-)
hooks/post-receive
--
grep
[Prev in Thread] Current Thread [Next in Thread]
- grep branch, master, updated. v2.27-20-gfdf9fad,
Paul Eggert <=