From ccea0b637cca101e09ebcb722472641cfb7f9cae Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Mon, 19 Sep 2016 11:22:34 +0900 Subject: [PATCH] sed: handle the patterns which consist of ^ or $ manually * sed/regexp.c (compile_regex_1): Mark the patterns which consist of ^ or $. (match_regex): Handle the patterns which consist of ^ or $ manually. * sed/sed.h (struct regex): New members 'begline' and 'endline'. --- sed/regexp.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ sed/sed.h | 2 + 2 files changed, 65 insertions(+), 0 deletions(-) diff --git a/sed/regexp.c b/sed/regexp.c index 1eecd73..bd680d0 100644 --- a/sed/regexp.c +++ b/sed/regexp.c @@ -146,6 +146,18 @@ compile_regex_1 (struct regex *new_regex, int needed_sub) new_regex->dfa = dfaalloc (); dfasyntax (new_regex->dfa, &localeinfo, syntax, dfaopts); dfacomp (new_regex->re, new_regex->sz, new_regex->dfa, 1); + + /* The patterns which consist of only ^ or $ often appear in + substitution, but regex and dfa are not good at them, as regex does + not build fastmap, and as all in buffer must be scanned for $. So + we mark them to handle manually. */ + if (new_regex->sz == 1) + { + if (new_regex->re[0] == '^') + new_regex->begline = true; + if (new_regex->re[0] == '$') + new_regex->endline = true; + } } struct regex * @@ -256,6 +268,57 @@ match_regex(struct regex *regex, char *buf, size_t buflen, regex->pattern.regs_allocated = REGS_REALLOCATE; + if (regex->begline || regex->endline) + { + size_t offset; + + if (regex->endline) + { + const char *p = NULL; + + if (regex->pattern.newline_anchor) + p = memchr (buf, '\n', buflen); + + offset = p ? 
p - buf : buflen; + } + else if (buf_start_offset == 0) + offset = 0; + else if (!regex->pattern.newline_anchor) + return 0; + else if (buf[buf_start_offset - 1] == '\n') + offset = buf_start_offset; + else + { + const char *p = memchr (buf + buf_start_offset, '\n', + buflen - buf_start_offset); + + if (p == NULL) + return 0; + + offset = p - buf + 1; + } + + if (regsize) + { + size_t i; + + if (!regarray->start) + { + regarray->start = MALLOC (1, regoff_t); + regarray->end = MALLOC (1, regoff_t); + regarray->num_regs = 1; + } + + regarray->start[0] = offset; + regarray->end[0] = offset; + + for (i = 1 ; i < regarray->num_regs; ++i) + regarray->start[i] = regarray->end[i] = -1; + } + + return 1; + } + if (buf_start_offset == 0) { struct dfa *superset = dfasuperset (regex->dfa); diff --git a/sed/sed.h b/sed/sed.h index 083baae..6f1591c 100644 --- a/sed/sed.h +++ b/sed/sed.h @@ -53,6 +53,8 @@ struct regex { int flags; size_t sz; struct dfa *dfa; + bool begline; + bool endline; char re[1]; }; -- 1.7.1