From 463261a86552c02fa5145422f498db889547bc0b Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Thu, 6 Oct 2016 08:14:10 +0900 Subject: [PATCH 2/3] sed: fix matching with multi-line option * NEWS: Mention it. * sed/regexp.c (compile_regex_1): Don't use newline_anchor of regex, if the buffer delimiter is not newline character. (match_regex): Do above case line-by-line. * testsuite/zero-anchor.good, testsuite/zero-anchor.inp, testsuite/zero-anchor.sed: New test. * testsuite/local.mk: Add the test. * testsuite/Makefile.tests: Add the test. --- NEWS | 3 ++ sed/regexp.c | 71 ++++++++++++++++++++++++++++++++++++++++---- testsuite/Makefile.tests | 6 ++++ testsuite/local.mk | 7 +++- testsuite/zero-anchor.good | Bin 0 -> 12 bytes testsuite/zero-anchor.inp | Bin 0 -> 6 bytes testsuite/zero-anchor.sed | 3 ++ 7 files changed, 82 insertions(+), 8 deletions(-) create mode 100644 testsuite/zero-anchor.good create mode 100644 testsuite/zero-anchor.inp create mode 100644 testsuite/zero-anchor.sed diff --git a/NEWS b/NEWS index e9b4584..367f2bc 100644 --- a/NEWS +++ b/NEWS @@ -11,6 +11,9 @@ GNU sed NEWS -*- outline -*- ** Bug fixes + sed no longer substitutes input with multi-line option even if the + buffer delimiter is not newline character. + sed no longer accepts a ":" command without a label; before, it would treat that as defining a label whose name is empty, and subsequent label-free "t" and "b" commands would use that label. 
Now, sed emits diff --git a/sed/regexp.c b/sed/regexp.c index 0543fe6..34d25cb 100644 --- a/sed/regexp.c +++ b/sed/regexp.c @@ -113,7 +113,8 @@ compile_regex_1 (struct regex *new_regex, int needed_sub) re_set_syntax (syntax); error = re_compile_pattern (new_regex->re, new_regex->sz, &new_regex->pattern); - new_regex->pattern.newline_anchor = (new_regex->flags & REG_NEWLINE) != 0; + new_regex->pattern.newline_anchor = + buffer_delimiter == '\n' && (new_regex->flags & REG_NEWLINE) != 0; new_regex->pattern.translate = NULL; #ifndef RE_ICASE @@ -327,7 +328,7 @@ match_regex(struct regex *regex, char *buf, size_t buflen, if (superset && !dfaexec (superset, buf, buf + buflen, true, NULL, NULL)) return 0; - if ((!regsize && regex->pattern.newline_anchor) + if ((!regsize && (regex->flags & REG_NEWLINE)) || (!superset && dfaisfast (regex->dfa))) { bool backref = false; @@ -335,14 +336,72 @@ match_regex(struct regex *regex, char *buf, size_t buflen, if (!dfaexec (regex->dfa, buf, buf + buflen, true, NULL, &backref)) return 0; - if (!regsize && regex->pattern.newline_anchor && !backref) + if (!regsize && (regex->flags & REG_NEWLINE) && !backref) return 1; } } - ret = re_search (&regex->pattern, buf, buflen, buf_start_offset, - buflen - buf_start_offset, - regsize ? regarray : NULL); + /* If the buffer delimiter is not newline character, we can not use + newline_anchor flag of regex. So do it line-by-line, and add offset + value to results. */ + if ((regex->flags & REG_NEWLINE) && buffer_delimiter != '\n') + { + const char *beg, *end; + const char *start; + + beg = buf; + + if (buf_start_offset > 0) + { + const char *eol = memrchr (buf, buffer_delimiter, buf_start_offset); + + if (eol != NULL) + beg = eol + 1; + } + + start = buf + buf_start_offset; + + for (;;) + { + end = memchr (beg, buffer_delimiter, buf + buflen - beg); + + if (end == NULL) + end = buf + buflen; + + ret = re_search (&regex->pattern, beg, end - beg, + start - beg, end - start, + regsize ? 
regarray : NULL); + + if (ret > -1) + { + size_t i; + + ret += beg - buf; + + if (regsize) + { + for (i = 0; i < regarray->num_regs; ++i) + { + if (regarray->start[i] > -1) + regarray->start[i] += beg - buf; + if (regarray->end[i] > -1) + regarray->end[i] += beg - buf; + } + } + + break; + } + + if (end == buf + buflen) + break; + + beg = start = end + 1; + } + } + else + ret = re_search (&regex->pattern, buf, buflen, buf_start_offset, + buflen - buf_start_offset, + regsize ? regarray : NULL); return (ret > -1); #endif diff --git a/testsuite/Makefile.tests b/testsuite/Makefile.tests index 5605fd4..2c2704a 100644 --- a/testsuite/Makefile.tests +++ b/testsuite/Makefile.tests @@ -33,6 +33,12 @@ y-bracket y-newline y-zero insert brackets amp-escape newline-anchor:: $(CMP) $(srcdir)/address@hidden address@hidden @$(RM) address@hidden +zero-anchor:: + $(SEDENV) $(SED) -z -f $(srcdir)/address@hidden \ + < $(srcdir)/address@hidden | $(TR) -d \\r > address@hidden + $(CMP) $(srcdir)/address@hidden address@hidden + @$(RM) address@hidden + badenc:: LC_ALL=ru_RU.UTF-8 $(TIME) $(SED) -nf $(srcdir)/address@hidden \ < $(srcdir)/address@hidden | $(TR) -d \\r > address@hidden diff --git a/testsuite/local.mk b/testsuite/local.mk index e1ecf2f..ae1f06c 100644 --- a/testsuite/local.mk +++ b/testsuite/local.mk @@ -109,7 +109,7 @@ SEDTESTS += testsuite/appquit testsuite/enable testsuite/sep \ testsuite/amp-escape testsuite/help testsuite/file \ testsuite/quiet testsuite/factor testsuite/binary3 \ testsuite/binary2 testsuite/binary testsuite/dc \ - testsuite/newline-anchor + testsuite/newline-anchor testsuite/zero-anchor # Note that the first lines are statements. They ensure that environment # variables that can perturb tests are unset or set to expected values. 
@@ -358,7 +358,10 @@ EXTRA_DIST += \ testsuite/y-zero.inp \ testsuite/y-newline.good \ testsuite/y-newline.sed \ - testsuite/y-newline.inp + testsuite/y-newline.inp \ + testsuite/zero-anchor.good \ + testsuite/zero-anchor.sed \ + testsuite/zero-anchor.inp # automake makes `check' depend on $(TESTS). Declare # dummy targets for $(TESTS) so that make does not complain. diff --git a/testsuite/zero-anchor.good b/testsuite/zero-anchor.good new file mode 100644 index 0000000000000000000000000000000000000000..b085ed4529df593ffa060d3dec3ffb88c2b4fc3f GIT binary patch literal 12 address@hidden;OI1^^b213CZz literal 0 HcmV?d00001 diff --git a/testsuite/zero-anchor.inp b/testsuite/zero-anchor.inp new file mode 100644 index 0000000000000000000000000000000000000000..7207ec48b9862c834457031abf3a8bffc831fb11 GIT binary patch literal 6 NcmYdfNMcB4000D<0Ve