>From 29c2f2238ed58ceb4101687f3aae7265f6839025 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlo=20Marcelo=20Arenas=20Bel=C3=B3n?= Date: Mon, 8 Nov 2021 21:27:03 -0800 Subject: [PATCH v2] pcre: migrate to pcre2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mostly a bug-for-bug translation of the original code to the PCRE2 API. Code still could do with some optimizations but should be good as a starting point. The API changes the sign of some types and therefore some ugly casts were needed; some of the changes are just to make sure all variables fit into the newer types better. Includes backward compatibility and could be made to build all the way to 10.00, but assumes a recent enough version and has been tested with 10.23 (from CentOS 7, the oldest). Performance seems equivalent, and it also seems functionally complete. Signed-off-by: Carlo Marcelo Arenas Belón --- configure.ac | 2 +- doc/grep.in.1 | 8 +- doc/grep.texi | 2 +- m4/{pcre.m4 => pcre2.m4} | 23 ++-- src/pcresearch.c | 243 +++++++++++++++++++-------------------- tests/filename-lineno.pl | 4 +- 6 files changed, 139 insertions(+), 143 deletions(-) rename m4/{pcre.m4 => pcre2.m4} (67%) diff --git a/configure.ac b/configure.ac index c49ec4a..9291cee 100644 --- a/configure.ac +++ b/configure.ac @@ -197,7 +197,7 @@ if test "$ac_use_included_regex" = no; then AC_MSG_WARN([Included lib/regex.c not used]) fi -gl_FUNC_PCRE +gl_FUNC_PCRE2 AM_CONDITIONAL([USE_PCRE], [test $use_pcre = yes]) case $host_os in diff --git a/doc/grep.in.1 b/doc/grep.in.1 index b014f65..208cb76 100644 --- a/doc/grep.in.1 +++ b/doc/grep.in.1 @@ -756,7 +756,7 @@ In other implementations, basic regular expressions are less powerful. The following description applies to extended regular expressions; differences for basic regular expressions are summarized afterwards.
Perl-compatible regular expressions give additional functionality, and are -documented in B<pcresyntax>(3) and B<pcrepattern>(3), but work only if +documented in B<pcre2syntax>(3) and B<pcre2pattern>(3), but work only if PCRE support is enabled. .PP The fundamental building blocks are the regular expressions @@ -1360,9 +1360,9 @@ from the globbing syntax that the shell uses to match file names. .BR sort (1), .BR xargs (1), .BR read (2), -.BR pcre (3), -.BR pcresyntax (3), -.BR pcrepattern (3), +.BR pcre2 (3), +.BR pcre2syntax (3), +.BR pcre2pattern (3), .BR terminfo (5), .BR glob (7), .BR regex (7) diff --git a/doc/grep.texi b/doc/grep.texi index e5b9fd8..c3c4bbf 100644 --- a/doc/grep.texi +++ b/doc/grep.texi @@ -1168,7 +1168,7 @@ In other implementations, basic regular expressions are less powerful. The following description applies to extended regular expressions; differences for basic regular expressions are summarized afterwards. Perl-compatible regular expressions give additional functionality, and -are documented in the @i{pcresyntax}(3) and @i{pcrepattern}(3) manual +are documented in the @i{pcre2syntax}(3) and @i{pcre2pattern}(3) manual pages, but work only if PCRE is available in the system. @menu diff --git a/m4/pcre.m4 b/m4/pcre2.m4 similarity index 67% rename from m4/pcre.m4 rename to m4/pcre2.m4 index 78b7fda..7970c4e 100644 --- a/m4/pcre.m4 +++ b/m4/pcre2.m4 @@ -1,15 +1,15 @@ -# pcre.m4 - check for libpcre support +# pcre2.m4 - check for libpcre2 support # Copyright (C) 2010-2021 Free Software Foundation, Inc. # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved.
-AC_DEFUN([gl_FUNC_PCRE], +AC_DEFUN([gl_FUNC_PCRE2], [ AC_ARG_ENABLE([perl-regexp], AS_HELP_STRING([--disable-perl-regexp], - [disable perl-regexp (pcre) support]), + [disable perl-regexp (pcre2) support]), [case $enableval in yes|no) test_pcre=$enableval;; *) AC_MSG_ERROR([invalid value $enableval for --disable-perl-regexp]);; @@ -21,24 +21,25 @@ AC_DEFUN([gl_FUNC_PCRE], use_pcre=no if test $test_pcre != no; then - PKG_CHECK_MODULES([PCRE], [libpcre], [], [: ${PCRE_LIBS=-lpcre}]) + PKG_CHECK_MODULES([PCRE], [libpcre2-8], [], [: ${PCRE_LIBS=-lpcre2-8}]) - AC_CACHE_CHECK([for pcre_compile], [pcre_cv_have_pcre_compile], + AC_CACHE_CHECK([for pcre2_compile], [pcre_cv_have_pcre2_compile], [pcre_saved_CFLAGS=$CFLAGS pcre_saved_LIBS=$LIBS CFLAGS="$CFLAGS $PCRE_CFLAGS" LIBS="$PCRE_LIBS $LIBS" AC_LINK_IFELSE( - [AC_LANG_PROGRAM([[#include <pcre.h> + [AC_LANG_PROGRAM([[#define PCRE2_CODE_UNIT_WIDTH 8 + #include <pcre2.h> ]], - [[pcre *p = pcre_compile (0, 0, 0, 0, 0); + [[pcre2_code *p = pcre2_compile (0, 0, 0, 0, 0, 0); return !p;]])], - [pcre_cv_have_pcre_compile=yes], - [pcre_cv_have_pcre_compile=no]) + [pcre_cv_have_pcre2_compile=yes], + [pcre_cv_have_pcre2_compile=no]) CFLAGS=$pcre_saved_CFLAGS LIBS=$pcre_saved_LIBS]) - if test "$pcre_cv_have_pcre_compile" = yes; then + if test "$pcre_cv_have_pcre2_compile" = yes; then use_pcre=yes elif test $test_pcre = maybe; then AC_MSG_WARN([AC_PACKAGE_NAME will be built without pcre support.]) @@ -50,7 +51,7 @@ AC_DEFUN([gl_FUNC_PCRE], if test $use_pcre = yes; then AC_DEFINE([HAVE_LIBPCRE], [1], [Define to 1 if you have the Perl Compatible Regular Expressions - library (-lpcre).]) + library (-lpcre2).]) else PCRE_CFLAGS= PCRE_LIBS= diff --git a/src/pcresearch.c b/src/pcresearch.c index 3bdaee9..e83f371 100644 --- a/src/pcresearch.c +++ b/src/pcresearch.c @@ -17,41 +17,32 @@ 02110-1301, USA. */ /* Written August 1992 by Mike Haertel. */ +/* Updated for PCRE2 by Carlo Arenas.
*/ #include <config.h> #include "search.h" #include "die.h" -#include <pcre.h> +#define PCRE2_CODE_UNIT_WIDTH 8 +#include <pcre2.h> -/* This must be at least 2; everything after that is for performance - in pcre_exec. */ -enum { NSUB = 300 }; - -#ifndef PCRE_EXTRA_MATCH_LIMIT_RECURSION -# define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0 -#endif -#ifndef PCRE_STUDY_JIT_COMPILE -# define PCRE_STUDY_JIT_COMPILE 0 -#endif -#ifndef PCRE_STUDY_EXTRA_NEEDED -# define PCRE_STUDY_EXTRA_NEEDED 0 +/* Needed for backward compatibility for PCRE2 < 10.30 */ +#ifndef PCRE2_CONFIG_DEPTHLIMIT +#define PCRE2_CONFIG_DEPTHLIMIT PCRE2_CONFIG_RECURSIONLIMIT +#define PCRE2_ERROR_DEPTHLIMIT PCRE2_ERROR_RECURSIONLIMIT +#define pcre2_set_depth_limit pcre2_set_recursion_limit #endif struct pcre_comp { - /* Compiled internal form of a Perl regular expression. */ - pcre *cre; - - /* Additional information about the pattern. */ - pcre_extra *extra; - -#if PCRE_STUDY_JIT_COMPILE /* The JIT stack and its maximum size. */ - pcre_jit_stack *jit_stack; - int jit_stack_size; -#endif + pcre2_jit_stack *jit_stack; + PCRE2_SIZE jit_stack_size; + /* Compiled internal form of a Perl regular expression. */ + pcre2_code *cre; + pcre2_match_context *mcontext; + pcre2_match_data *data; /* Table, indexed by ! (flag & PCRE_NOTBOL), of whether the empty string matches when that flag is used. */ int empty_match[2]; @@ -60,51 +51,50 @@ struct pcre_comp /* Match the already-compiled PCRE pattern against the data in SUBJECT, of size SEARCH_BYTES and starting with offset SEARCH_OFFSET, with - options OPTIONS, and storing resulting matches into SUB. Return - the (nonnegative) match location or a (negative) error number. */ + options OPTIONS. + Return the (nonnegative) match count or a (negative) error number.
*/ static int -jit_exec (struct pcre_comp *pc, char const *subject, int search_bytes, - int search_offset, int options, int *sub) +jit_exec (struct pcre_comp *pc, char const *subject, PCRE2_SIZE search_bytes, + PCRE2_SIZE search_offset, int options) { while (true) { - int e = pcre_exec (pc->cre, pc->extra, subject, search_bytes, - search_offset, options, sub, NSUB); + int e = pcre2_match (pc->cre, (PCRE2_SPTR)subject, search_bytes, + search_offset, options, pc->data, pc->mcontext); -#if PCRE_STUDY_JIT_COMPILE - if (e == PCRE_ERROR_JIT_STACKLIMIT + if (e == PCRE2_ERROR_JIT_STACKLIMIT && 0 < pc->jit_stack_size && pc->jit_stack_size <= INT_MAX / 2) { - int old_size = pc->jit_stack_size; - int new_size = pc->jit_stack_size = old_size * 2; + PCRE2_SIZE old_size = pc->jit_stack_size; + PCRE2_SIZE new_size = pc->jit_stack_size = old_size * 2; + if (pc->jit_stack) - pcre_jit_stack_free (pc->jit_stack); - pc->jit_stack = pcre_jit_stack_alloc (old_size, new_size); - if (!pc->jit_stack) + pcre2_jit_stack_free (pc->jit_stack); + pc->jit_stack = pcre2_jit_stack_create (old_size, new_size, NULL); + + if (!pc->mcontext) + pc->mcontext = pcre2_match_context_create (NULL); + + if (!pc->jit_stack || !pc->mcontext) die (EXIT_TROUBLE, 0, _("failed to allocate memory for the PCRE JIT stack")); - pcre_assign_jit_stack (pc->extra, NULL, pc->jit_stack); + pcre2_jit_stack_assign (pc->mcontext, NULL, pc->jit_stack); continue; } -#endif - -#if PCRE_EXTRA_MATCH_LIMIT_RECURSION - if (e == PCRE_ERROR_RECURSIONLIMIT - && (PCRE_STUDY_EXTRA_NEEDED || pc->extra)) + if (e == PCRE2_ERROR_DEPTHLIMIT) { - unsigned long lim - = (pc->extra->flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION - ? pc->extra->match_limit_recursion - : 0); - if (lim <= ULONG_MAX / 2) - { - pc->extra->match_limit_recursion = lim ? 
2 * lim : (1 << 24) - 1; - pc->extra->flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION; - continue; - } - } -#endif + uint32_t lim; + pcre2_config (PCRE2_CONFIG_DEPTHLIMIT, &lim); + if (lim >= UINT32_MAX / 2) + return e; + + lim <<= 1; + if (!pc->mcontext) + pc->mcontext = pcre2_match_context_create (NULL); + pcre2_set_depth_limit (pc->mcontext, lim); + continue; + } return e; } } @@ -115,27 +105,35 @@ jit_exec (struct pcre_comp *pc, char const *subject, int search_bytes, void * Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact) { - int e; - char const *ep; + PCRE2_SIZE e; + int ec; + PCRE2_UCHAR8 ep[128]; /* 120 code units is suggested to avoid truncation */ static char const wprefix[] = "(?cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ()); + pcre2_set_character_tables (ccontext, pcre2_maketables (NULL)); + pc->cre = pcre2_compile (re, n - (char *)re, flags, &ec, &e, ccontext); if (!pc->cre) - die (EXIT_TROUBLE, 0, "%s", ep); - - int pcre_study_flags = PCRE_STUDY_EXTRA_NEEDED | PCRE_STUDY_JIT_COMPILE; - pc->extra = pcre_study (pc->cre, pcre_study_flags, &ep); - if (ep) - die (EXIT_TROUBLE, 0, "%s", ep); + { + pcre2_get_error_message (ec, ep, sizeof (ep)); + die (EXIT_TROUBLE, 0, "%s", ep); + } -#if PCRE_STUDY_JIT_COMPILE - if (pcre_fullinfo (pc->cre, pc->extra, PCRE_INFO_JIT, &e)) - die (EXIT_TROUBLE, 0, _("internal error (should never happen)")); + pc->data = pcre2_match_data_create_from_pattern (pc->cre, NULL); - /* The PCRE documentation says that a 32 KiB stack is the default. */ - if (e) - pc->jit_stack_size = 32 << 10; -#endif + ec = pcre2_jit_compile (pc->cre, PCRE2_JIT_COMPLETE); + if (ec && ec != PCRE2_ERROR_JIT_BADOPTION && ec != PCRE2_ERROR_NOMEMORY) + die (EXIT_TROUBLE, 0, _("JIT internal error: %d"), ec); + else + { + /* The PCRE documentation says that a 32 KiB stack is the default. 
*/ + pc->jit_stack_size = 32 << 10; + } free (re); - int sub[NSUB]; - pc->empty_match[false] = pcre_exec (pc->cre, pc->extra, "", 0, 0, - PCRE_NOTBOL, sub, NSUB); - pc->empty_match[true] = pcre_exec (pc->cre, pc->extra, "", 0, 0, 0, sub, - NSUB); + pc->empty_match[false] = jit_exec (pc, "", 0, 0, PCRE2_NOTBOL); + pc->empty_match[true] = jit_exec (pc, "", 0, 0, 0); return pc; } @@ -206,15 +190,15 @@ ptrdiff_t Pexecute (void *vcp, char const *buf, idx_t size, idx_t *match_size, char const *start_ptr) { - int sub[NSUB]; char const *p = start_ptr ? start_ptr : buf; bool bol = p[-1] == eolbyte; char const *line_start = buf; - int e = PCRE_ERROR_NOMATCH; + int e = PCRE2_ERROR_NOMATCH; char const *line_end; struct pcre_comp *pc = vcp; + PCRE2_SIZE *sub = pcre2_get_ovector_pointer (pc->data); - /* The search address to pass to pcre_exec. This is the start of + /* The search address to pass to PCRE. This is the start of the buffer, or just past the most-recently discovered encoding error or line end. */ char const *subject = buf; @@ -226,14 +210,14 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t *match_size, better and the correctness issues were too puzzling. See Bug#22655. */ line_end = rawmemchr (p, eolbyte); - if (INT_MAX < line_end - p) + if (PCRE2_SIZE_MAX < line_end - p) die (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit")); for (;;) { /* Skip past bytes that are easily determined to be encoding errors, treating them as data that cannot match. This is - faster than having pcre_exec check them. */ + faster than having PCRE check them. */ while (localeinfo.sbclen[to_uchar (*p)] == -1) { p++; @@ -241,10 +225,10 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t *match_size, bol = false; } - int search_offset = p - subject; + PCRE2_SIZE search_offset = p - subject; /* Check for an empty match; this is faster than letting - pcre_exec do it. */ + PCRE do it. 
*/ if (p == line_end) { sub[0] = sub[1] = search_offset; @@ -254,13 +238,14 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t *match_size, int options = 0; if (!bol) - options |= PCRE_NOTBOL; + options |= PCRE2_NOTBOL; - e = jit_exec (pc, subject, line_end - subject, search_offset, - options, sub); - if (e != PCRE_ERROR_BADUTF8) + e = jit_exec (pc, subject, line_end - subject, + search_offset, options); + /* PCRE2 provides 22 different error codes for bad UTF-8 */ + if (! (PCRE2_ERROR_UTF8_ERR21 <= e && e < PCRE2_ERROR_UTF8_ERR1)) break; - int valid_bytes = sub[0]; + PCRE2_SIZE valid_bytes = pcre2_get_startchar (pc->data); if (search_offset <= valid_bytes) { @@ -270,14 +255,15 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t *match_size, /* Handle the empty-match case specially, for speed. This optimization is valid if VALID_BYTES is zero, which means SEARCH_OFFSET is also zero. */ + sub[0] = valid_bytes; sub[1] = 0; e = pc->empty_match[bol]; } else e = jit_exec (pc, subject, valid_bytes, search_offset, - options | PCRE_NO_UTF8_CHECK | PCRE_NOTEOL, sub); + options | PCRE2_NO_UTF_CHECK | PCRE2_NOTEOL); - if (e != PCRE_ERROR_NOMATCH) + if (e != PCRE2_ERROR_NOMATCH) break; /* Treat the encoding error as data that cannot match. 
*/ @@ -288,7 +274,7 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t *match_size, subject += valid_bytes + 1; } - if (e != PCRE_ERROR_NOMATCH) + if (e != PCRE2_ERROR_NOMATCH) break; bol = true; p = subject = line_start = line_end + 1; @@ -299,26 +285,35 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t *match_size, { switch (e) { - case PCRE_ERROR_NOMATCH: + case PCRE2_ERROR_NOMATCH: break; - case PCRE_ERROR_NOMEMORY: + case PCRE2_ERROR_NOMEMORY: die (EXIT_TROUBLE, 0, _("%s: memory exhausted"), input_filename ()); -#if PCRE_STUDY_JIT_COMPILE - case PCRE_ERROR_JIT_STACKLIMIT: + case PCRE2_ERROR_JIT_STACKLIMIT: die (EXIT_TROUBLE, 0, _("%s: exhausted PCRE JIT stack"), input_filename ()); -#endif - case PCRE_ERROR_MATCHLIMIT: + case PCRE2_ERROR_MATCHLIMIT: die (EXIT_TROUBLE, 0, _("%s: exceeded PCRE's backtracking limit"), input_filename ()); - case PCRE_ERROR_RECURSIONLIMIT: - die (EXIT_TROUBLE, 0, _("%s: exceeded PCRE's recursion limit"), + case PCRE2_ERROR_DEPTHLIMIT: + die (EXIT_TROUBLE, 0, + _("%s: exceeded PCRE's nested backtracking limit"), input_filename ()); + case PCRE2_ERROR_RECURSELOOP: + die (EXIT_TROUBLE, 0, _("%s: PCRE detected recurse loop"), + input_filename ()); + +#ifdef PCRE2_ERROR_HEAPLIMIT + case PCRE2_ERROR_HEAPLIMIT: + die (EXIT_TROUBLE, 0, _("%s: exceeded PCRE's heap limit"), + input_filename ()); +#endif + default: /* For now, we lump all remaining PCRE failures into this basket. If anyone cares to provide sample grep usage that can trigger diff --git a/tests/filename-lineno.pl b/tests/filename-lineno.pl index 1e84b45..1ff3d6a 100755 --- a/tests/filename-lineno.pl +++ b/tests/filename-lineno.pl @@ -101,13 +101,13 @@ my @Tests = ], ['invalid-re-P-paren', '-P ")"', {EXIT=>2}, {ERR => $ENV{PCRE_WORKS} == 1 - ? "$prog: unmatched parentheses\n" + ? "$prog: unmatched closing parenthesis\n" : $no_pcre }, ], ['invalid-re-P-star-paren', '-P "a.*)"', {EXIT=>2}, {ERR => $ENV{PCRE_WORKS} == 1 - ? "$prog: unmatched parentheses\n" + ? 
"$prog: unmatched closing parenthesis\n" : $no_pcre }, ], -- 2.34.0.rc1.349.g8f33748433