From c394a3d4ec1eef203d55c0294fac97769bcf9764 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Mon, 19 Sep 2016 11:22:34 +0900 Subject: [PATCH 1/3] sed: handle the patterns which consist of ^ or $ manually * sed/regexp.c (compile_regex_1): Mark the patterns which consist of ^ or $. (match_regex): Handle the patterns which consist of ^ or $ manually. * sed/sed.h (struct regex): New members 'begline' and 'endline'. * testsuite/newline-anchor.good, testsuite/newline-anchor.inp, testsuite/newline-anchor.sed: New test. * testsuite/Makefile.tests: Add the test. * testsuite/local.mk: Add the test. --- sed/regexp.c | 63 +++++++++++++++++++++++++++++++++++++++++ sed/sed.h | 2 + testsuite/Makefile.tests | 2 +- testsuite/local.mk | 6 +++- testsuite/newline-anchor.good | 3 ++ testsuite/newline-anchor.inp | 3 ++ testsuite/newline-anchor.sed | 3 ++ 7 files changed, 80 insertions(+), 2 deletions(-) create mode 100644 testsuite/newline-anchor.good create mode 100644 testsuite/newline-anchor.inp create mode 100644 testsuite/newline-anchor.sed diff --git a/sed/regexp.c b/sed/regexp.c index cf4f8a0..0543fe6 100644 --- a/sed/regexp.c +++ b/sed/regexp.c @@ -147,6 +147,18 @@ compile_regex_1 (struct regex *new_regex, int needed_sub) new_regex->dfa = dfaalloc (); dfasyntax (new_regex->dfa, &localeinfo, syntax, dfaopts); dfacomp (new_regex->re, new_regex->sz, new_regex->dfa, 1); + + /* The patterns which consist of only ^ or $ often appear in + substitution, but regex and dfa are not good at them, as regex does + not build fastmap, and as all in buffer must be scanned for $. So + we mark them to handle manually. 
*/ + if (new_regex->sz == 1) + { + if (new_regex->re[0] == '^') + new_regex->begline = true; + if (new_regex->re[0] == '$') + new_regex->endline = true; + } } struct regex * @@ -257,6 +269,57 @@ match_regex(struct regex *regex, char *buf, size_t buflen, regex->pattern.regs_allocated = REGS_REALLOCATE; + if (regex->begline || regex->endline) + { + size_t offset; + + if (regex->endline) + { + const char *p = NULL; + + if (regex->flags & REG_NEWLINE) + p = memchr (buf + buf_start_offset, buffer_delimiter, buflen); + + offset = p ? p - buf : buflen; + } + else if (buf_start_offset == 0) + offset = 0; + else if (!(regex->flags & REG_NEWLINE)) + return 0; + else if (buf[buf_start_offset - 1] == buffer_delimiter) + offset = buf_start_offset; + else + { + const char *p = memchr (buf + buf_start_offset, buffer_delimiter, + buflen - buf_start_offset); + + if (p == NULL) + return 0; + + offset = p - buf + 1; + } + + if (regsize) + { + size_t i; + + if (!regarray->start) + { + regarray->start = MALLOC (1, regoff_t); + regarray->end = MALLOC (1, regoff_t); + regarray->num_regs = 1; + } + + regarray->start[0] = offset; + regarray->end[0] = offset; + + for (i = 1 ; i < regarray->num_regs; ++i) + regarray->start[i] = regarray->end[i] = -1; + } + + return 1; + } + if (buf_start_offset == 0) { struct dfa *superset = dfasuperset (regex->dfa); diff --git a/sed/sed.h b/sed/sed.h index 083baae..6f1591c 100644 --- a/sed/sed.h +++ b/sed/sed.h @@ -53,6 +53,8 @@ struct regex { int flags; size_t sz; struct dfa *dfa; + bool begline; + bool endline; char re[1]; }; diff --git a/testsuite/Makefile.tests b/testsuite/Makefile.tests index ab48bc1..5605fd4 100644 --- a/testsuite/Makefile.tests +++ b/testsuite/Makefile.tests @@ -21,7 +21,7 @@ SKIP = :>address@hidden; exit 77 enable sep inclib 8bit 8to7 newjis xabcx dollar noeol bkslashes \ numsub head madding mac-mf empty xbxcx xbxcx3 recall recall2 xemacs \ appquit fasts uniq manis linecnt khadafy allsub flipcase space modulo \ -y-bracket y-newline 
y-zero insert brackets amp-escape:: +y-bracket y-newline y-zero insert brackets amp-escape newline-anchor:: $(SEDENV) $(SED) -f $(srcdir)/address@hidden \ < $(srcdir)/address@hidden | $(TR) -d \\r > address@hidden $(CMP) $(srcdir)/address@hidden address@hidden diff --git a/testsuite/local.mk b/testsuite/local.mk index 085a415..e1ecf2f 100644 --- a/testsuite/local.mk +++ b/testsuite/local.mk @@ -108,7 +108,8 @@ SEDTESTS += testsuite/appquit testsuite/enable testsuite/sep \ testsuite/badenc testsuite/inplace-hold testsuite/brackets \ testsuite/amp-escape testsuite/help testsuite/file \ testsuite/quiet testsuite/factor testsuite/binary3 \ - testsuite/binary2 testsuite/binary testsuite/dc + testsuite/binary2 testsuite/binary testsuite/dc \ + testsuite/newline-anchor # Note that the first lines are statements. They ensure that environment # variables that can perturb tests are unset or set to expected values. @@ -274,6 +275,9 @@ EXTRA_DIST += \ testsuite/newjis.good \ testsuite/newjis.inp \ testsuite/newjis.sed \ + testsuite/newline-anchor.good \ + testsuite/newline-anchor.inp \ + testsuite/newline-anchor.sed \ testsuite/noeol.good \ testsuite/noeol.inp \ testsuite/noeol.sed \ diff --git a/testsuite/newline-anchor.good b/testsuite/newline-anchor.good new file mode 100644 index 0000000..f237c7c --- /dev/null +++ b/testsuite/newline-anchor.good @@ -0,0 +1,3 @@ +XaY +XbY +XcY diff --git a/testsuite/newline-anchor.inp b/testsuite/newline-anchor.inp new file mode 100644 index 0000000..de98044 --- /dev/null +++ b/testsuite/newline-anchor.inp @@ -0,0 +1,3 @@ +a +b +c diff --git a/testsuite/newline-anchor.sed b/testsuite/newline-anchor.sed new file mode 100644 index 0000000..63af5d2 --- /dev/null +++ b/testsuite/newline-anchor.sed @@ -0,0 +1,3 @@ +N +N +s/^/X/mg;s/$/Y/mg -- 1.7.1