sed-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH] new feature: add PCRE2 regexps with libpcre2 (https://pcre2project.github.io/pcre2)


From: Gergely Szasz
Subject: [PATCH] new feature: add PCRE2 regexps with libpcre2 (https://pcre2project.github.io/pcre2)
Date: Thu, 20 Feb 2025 22:35:50 +0000

Hi,

Here is a patch to use libpcre2 for PCRE regexps in sed.

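For example, the following command prints only the lines that contain a "q"
(in either case) that is not immediately followed by a "u":

echo -ne 'Qingdao\nQuezon City\nQueens\nQuito\nQom\nQiqihar\nQinhuangdao\nQuetta\nQuebec\nQuilmes\nQionghai\nQazvin\nQarchak\nQina\nQuthbullapur\nQarshi\nQui Nhon\nQuelimane\nQo`qon\nQuanzhou\nQianjiang\nQapqal\n' | sed/sed -nRe '/q(?!u)/Ip'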

Best regards,
Gergely

------------------------------------------------------------------------------
* configure.ac: add --with-pcre2 option
* sed/sed.h: conditionally include pcre2.h and define PCRE2-related macros;
extend the regex structure with PCRE2 fields (compiled code, compile context);
extend the subst structure with PCRE2 fields (replacement string and length)
* sed/sed.c (usage): add some text about PCRE2 to the help output
(main): add a new command-line switch -R (--regexp-pcre2) to select PCRE2-style regexps
* sed/compile.c (mark_subst_opts): add new s/// flags for PCRE2: s, x, X, l, u, U, J,
corresponding to the (?.) options (X corresponds to (?xx))
(compile_address): add same flags for address regexps //
(compile_program): prepare PCRE2 replacements
(finish_program): free PCRE2 replacements
* sed/execute.c (do_subst): add PCRE2 substitution
the output buffer handling of pcre2_substitute() is far from perfect,
so we allocate a fairly large (at least 128 KiB) output buffer the first
time to avoid a second scan of the line
(process_files): free PCRE2 substitute output buffer
* sed/regexp.c (compile_regex): compiling PCRE2 regexp
(match_regex): PCRE2 match
(release_regex): free PCRE2 compiled program and context
Replacing only the Nth match of a regexp in a line is not implemented for PCRE2.
---
 configure.ac  |  20 +++++++++
 sed/compile.c | 111 ++++++++++++++++++++++++++++++++++++++++++++++++++
 sed/execute.c |  56 +++++++++++++++++++++++++
 sed/regexp.c  |  49 ++++++++++++++++++++++
 sed/sed.c     |  23 ++++++++++-
 sed/sed.h     |  20 +++++++++
 6 files changed, 278 insertions(+), 1 deletion(-)

diff --git a/configure.ac b/configure.ac
index 220f406..06ed96c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -118,6 +118,26 @@ AC_CHECK_FUNCS_ONCE(isatty isascii memcpy strchr strtoul 
readlink
 
 AM_CONDITIONAL([TEST_SYMLINKS],
           [test "$ac_cv_func_readlink" = yes])
+dnl PCRE
+dnl Do you want to look for pcre support?
+AC_ARG_WITH(pcre2,
+AS_HELP_STRING([--with-pcre2],[enable the search for the pcre2 library (may 
create run-time library dependencies)]))
+
+AC_ARG_VAR(PCRE_CONFIG, [pathname of pcre2-config if it is not in PATH])
+if test x"$with_pcre2" = xyes; then
+  AC_CHECK_PROG([PCRE_CONFIG], pcre2-config, pcre2-config)
+  if test "x$PCRE_CONFIG" = x; then
+    with_pcre2=no
+    AC_MSG_WARN([pcre2-config not found: pcre2 is disabled.])
+    AC_MSG_NOTICE(
+      [Set PCRE_CONFIG to pathname of pcre2-config if it is not in PATH.])
+  fi
+fi
+if test x"$with_pcre2" = xyes; then
+  LIBS="`pcre2-config --libs8` $LIBS"
+  CFLAGS="`pcre2-config --cflags` $CFLAGS"
+  AC_DEFINE([WITH_PCRE2], [1], [Define to 1 if PCRE2 enabled.])
+fi
 
 AC_ARG_ENABLE(i18n,
 [  --disable-i18n          disable internationalization (default=enabled)], ,
diff --git a/sed/compile.c b/sed/compile.c
index 46b0bed..11a0857 100644
--- a/sed/compile.c
+++ b/sed/compile.c
@@ -525,6 +525,50 @@ mark_subst_opts (struct subst *cmd)
         flags |= REG_NEWLINE;
         break;
 
+#ifdef WITH_PCRE2
+      case 's':        /* PCRE2 extension */
+        if (extended_regexp_flags != REG_PCRE2)
+          bad_prog ("unknown option to 's'");
+        flags |= REG_PCRE2_DOTALL;
+        break;
+
+      case 'x':        /* PCRE2 extension */
+        if (extended_regexp_flags != REG_PCRE2)
+          bad_prog ("unknown option to 's'");
+        flags |= REG_PCRE2_EXTENDED;
+        break;
+
+      case 'X':        /* PCRE2 extension */
+        if (extended_regexp_flags != REG_PCRE2)
+          bad_prog ("unknown option to 's'");
+        flags |= REG_PCRE2_EXTENDED_MORE;
+        break;
+
+      case 'l':        /* PCRE2 extension */
+        if (extended_regexp_flags != REG_PCRE2)
+          bad_prog ("unknown option to 's'");
+        flags |= REG_PCRE2_LITERAL;
+        break;
+
+      case 'u':        /* PCRE2 extension */
+        if (extended_regexp_flags != REG_PCRE2)
+          bad_prog ("unknown option to 's'");
+        flags |= REG_PCRE2_MATCH_INVALID_UTF;
+        break;
+
+      case 'U':        /* PCRE2 extension */
+        if (extended_regexp_flags != REG_PCRE2)
+          bad_prog ("unknown option to 's'");
+        flags |= REG_PCRE2_UNGREEDY;
+        break;
+
+      case 'J':        /* PCRE2 extension */
+        if (extended_regexp_flags != REG_PCRE2)
+          bad_prog ("unknown option to 's'");
+        flags |= REG_PCRE2_DUPNAMES;
+        break;
+#endif
+
       case 'e':
         if (posixicity == POSIXLY_BASIC)
           bad_prog ("unknown option to 's'");
@@ -827,8 +871,14 @@ compile_address (struct addr *addr, int ch)
       for (;;)
         {
           ch = in_nonblank ();
+#ifdef WITH_PCRE2
+          if (extended_regexp_flags != REG_PCRE2 &&
+              posixicity == POSIXLY_BASIC)
+            goto posix_address_modifier;
+#else
           if (posixicity == POSIXLY_BASIC)
             goto posix_address_modifier;
+#endif
           switch (ch)
             {
             case 'I':  /* GNU extension */
@@ -838,7 +888,49 @@ compile_address (struct addr *addr, int ch)
             case 'M':  /* GNU extension */
               flags |= REG_NEWLINE;
               break;
+#ifdef WITH_PCRE2
+            case 's':  /* PCRE2 extension */
+              if (extended_regexp_flags != REG_PCRE2)
+                goto posix_address_modifier;
+              flags |= REG_PCRE2_DOTALL;
+              break;
 
+            case 'x':  /* PCRE2 extension */
+              if (extended_regexp_flags != REG_PCRE2)
+                goto posix_address_modifier;
+              flags |= REG_PCRE2_EXTENDED;
+              break;
+
+            case 'X':  /* PCRE2 extension */
+              if (extended_regexp_flags != REG_PCRE2)
+                goto posix_address_modifier;
+              flags |= REG_PCRE2_EXTENDED_MORE;
+              break;
+
+            case 'l':  /* PCRE2 extension */
+              if (extended_regexp_flags != REG_PCRE2)
+                goto posix_address_modifier;
+              flags |= REG_PCRE2_LITERAL;
+              break;
+
+            case 'u':  /* PCRE2 extension */
+              if (extended_regexp_flags != REG_PCRE2)
+                goto posix_address_modifier;
+              flags |= REG_PCRE2_MATCH_INVALID_UTF;
+              break;
+
+            case 'U':  /* PCRE2 extension */
+              if (extended_regexp_flags != REG_PCRE2)
+                goto posix_address_modifier;
+              flags |= REG_PCRE2_UNGREEDY;
+              break;
+
+            case 'J':  /* PCRE2 extension */
+              if (extended_regexp_flags != REG_PCRE2)
+                goto posix_address_modifier;
+              flags |= REG_PCRE2_DUPNAMES;
+              break;
+#endif
             default:
             posix_address_modifier:
               savchar (ch);
@@ -1153,8 +1245,20 @@ compile_program (struct vector *vector)
               bad_prog ("unterminated 's' command");
 
             cur_cmd->x.cmd_subst = OB_MALLOC (&obs, 1, struct subst);
+#ifdef WITH_PCRE2
+            if (extended_regexp_flags == REG_PCRE2)
+              {
+                cur_cmd->x.cmd_subst->rplcmnt = (PCRE2_SPTR)xstrdup 
(get_buffer (b2));
+                cur_cmd->x.cmd_subst->rlength = size_buffer (b2);
+              }
+            else
+              {
+#endif
             setup_replacement (cur_cmd->x.cmd_subst,
                                get_buffer (b2), size_buffer (b2));
+#ifdef WITH_PCRE2
+              }
+#endif
             free_buffer (b2);
 
             flags = mark_subst_opts (cur_cmd->x.cmd_subst);
@@ -1609,6 +1713,13 @@ finish_program (struct vector *program)
         {
         case 's':
           free (sc->x.cmd_subst->replacement_buffer);
+#ifdef WITH_PCRE2
+          if (extended_regexp_flags == REG_PCRE2)
+            {
+              if (sc->x.cmd_subst->rplcmnt)
+                free ((void *)sc->x.cmd_subst->rplcmnt);
+            }
+#endif
           if (sc->x.cmd_subst->regx)
             release_regex (sc->x.cmd_subst->regx);
           break;
diff --git a/sed/execute.c b/sed/execute.c
index adc7575..dd06962 100644
--- a/sed/execute.c
+++ b/sed/execute.c
@@ -127,6 +127,11 @@ static struct line buffer;
 static struct append_queue *append_head = NULL;
 static struct append_queue *append_tail = NULL;
 
+#ifdef WITH_PCRE2
+static PCRE2_SIZE outallocd = 0;
+static PCRE2_UCHAR *outputbuffer = NULL;
+#endif
+
 /* Prepare to increase LB's length by LEN, making some attempt at
    keeping realloc() calls under control by padding for future growth.  */
 static void
@@ -1022,6 +1027,53 @@ do_subst (struct subst *sub)
 
   line_reset (&s_accum, &line);
 
+#ifdef WITH_PCRE2
+  if (extended_regexp_flags == REG_PCRE2)
+    {
+      int ret;
+      PCRE2_SIZE outlength = outallocd - 1;
+
+      if (outallocd == 0 || 2 * line.length > outlength)
+        {
+          outallocd = 2 * line.length + 1;
+          if (outallocd < 128 * 1024)
+            outallocd = 128 * 1024;
+          outputbuffer = xrealloc (outputbuffer, outallocd);
+          outlength = outallocd - 1;
+        }
+
+      ret = pcre2_substitute (sub->regx->rex, (PCRE2_SPTR8)line.active,
+                             line.length, start,
+                             PCRE2_SUBSTITUTE_OVERFLOW_LENGTH |
+                                (sub->global ? PCRE2_SUBSTITUTE_GLOBAL : 0),
+                             NULL, NULL, sub->rplcmnt, sub->rlength,
+                             outputbuffer, &outlength);
+      if (ret == PCRE2_ERROR_NOMEMORY)
+        {
+          outallocd = outlength + 1;
+          outputbuffer = xrealloc (outputbuffer, outallocd);
+          ret = pcre2_substitute (sub->regx->rex, (PCRE2_SPTR8)line.active,
+                                 line.length, start,
+                                 (sub->global ? PCRE2_SUBSTITUTE_GLOBAL : 0),
+                                 NULL, NULL, sub->rplcmnt, sub->rlength,
+                                 outputbuffer, &outlength);
+        }
+      if (ret < 0)
+        {
+          if (ret == PCRE2_ERROR_NOMATCH)
+            return;
+          panic (_("error in PCRE2 substitute"));
+        }
+      str_append (&s_accum, (const char *)outputbuffer, outlength);
+      s_accum.chomped = line.chomped;
+
+      /* Exchange line and s_accum.  This can be much cheaper
+         than copying s_accum.active into line.text (for huge lines). */
+      line_exchange (&line, &s_accum, false);
+      replaced = true;
+      goto post_subst;
+    }
+#endif
   /* The first part of the loop optimizes s/xxx// when xxx is at the
      start, and s/xxx$// */
   if (!match_regex (sub->regx, line.active, line.length, start,
@@ -1709,6 +1761,10 @@ process_files (struct vector *the_program, char **argv)
   free (hold.text);
   free (line.text);
   free (s_accum.text);
+#ifdef WITH_PCRE2
+  if (extended_regexp_flags == REG_PCRE2 && outputbuffer)
+    free (outputbuffer);
+#endif
 #endif /* lint */
 
   if (input.bad_count)
diff --git a/sed/regexp.c b/sed/regexp.c
index f05c5ad..212ac0d 100644
--- a/sed/regexp.c
+++ b/sed/regexp.c
@@ -132,6 +132,10 @@ compile_regex (struct buffer *b, int flags, int needed_sub)
 {
   struct regex *new_regex;
   idx_t re_len;
+#ifdef WITH_PCRE2
+  int errornumber;
+  PCRE2_SIZE erroroffset;
+#endif
 
   /* // matches the last RE */
   if (size_buffer (b) == 0)
@@ -146,6 +150,35 @@ compile_regex (struct buffer *b, int flags, int needed_sub)
   new_regex->flags = flags;
   memcpy (new_regex->re, get_buffer (b), re_len);
 
+#ifdef WITH_PCRE2
+  if (extended_regexp_flags == REG_PCRE2)
+    {
+      new_regex->cctx = NULL;
+      if (buffer_delimiter == 0)
+        {
+          new_regex->cctx = pcre2_compile_context_create (NULL);
+          pcre2_set_newline (new_regex->cctx, PCRE2_NEWLINE_NUL);
+        }
+      uint32_t options = (flags & REG_ICASE ? PCRE2_CASELESS : 0) |
+                         (flags & REG_NEWLINE ? PCRE2_MULTILINE : 0) |
+                         (flags & REG_PCRE2_DOTALL ? PCRE2_DOTALL : 0) |
+                         (flags & REG_PCRE2_EXTENDED ? PCRE2_EXTENDED : 0) |
+                         (flags & REG_PCRE2_EXTENDED_MORE ? 
PCRE2_EXTENDED_MORE : 0) |
+                         (flags & REG_PCRE2_LITERAL ? PCRE2_LITERAL : 0) |
+                         (flags & REG_PCRE2_MATCH_INVALID_UTF ? 
PCRE2_MATCH_INVALID_UTF : 0) |
+                         (flags & REG_PCRE2_UNGREEDY ? PCRE2_UNGREEDY : 0) |
+                         (flags & REG_PCRE2_DUPNAMES ? PCRE2_DUPNAMES : 0);
+      new_regex->rex = pcre2_compile ((PCRE2_SPTR8)new_regex->re, re_len,
+                             options, &errornumber, &erroroffset,
+                             new_regex->cctx);
+      if (new_regex->rex == NULL)
+        {
+          bad_prog ("cannot compile PCRE2 regexp");
+          return NULL;
+        }
+      return new_regex;
+    }
+#endif
   /* GNU regex does not process \t & co. */
   new_regex->sz = normalize_text (new_regex->re, re_len, TEXT_REGEX);
 
@@ -171,6 +204,15 @@ match_regex (struct regex *regex, char *buf, idx_t buflen,
   else
     regex_last = regex;
 
+#ifdef WITH_PCRE2
+  if (extended_regexp_flags == REG_PCRE2)
+    {
+      pcre2_match_data *match_data = pcre2_match_data_create (2, NULL);
+      ret = pcre2_match (regex->rex, (PCRE2_SPTR)buf, buflen, 
buf_start_offset, 0, match_data, NULL);
+      pcre2_match_data_free (match_data);
+      return (ret > -1);
+    }
+#endif
   regoff_t buflen_regoff;
   if (ckd_add (&buflen_regoff, buflen, 0))
     panic (_("regex input buffer length overflow"));
@@ -354,6 +396,13 @@ release_regex (struct regex *regex)
       regex->dfa = NULL;
     }
   regfree (&regex->pattern);
+#ifdef WITH_PCRE2
+  if (extended_regexp_flags == REG_PCRE2)
+    {
+      pcre2_compile_context_free (regex->cctx);
+      pcre2_code_free (regex->rex);
+    }
+#endif
   free (regex);
 }
 #endif /* lint */
diff --git a/sed/sed.c b/sed/sed.c
index 18c70a3..22314c8 100644
--- a/sed/sed.c
+++ b/sed/sed.c
@@ -162,6 +162,10 @@ Usage: %s [OPTION]... {script-only-if-no-other-script} 
[input-file]...\n\
   fprintf (out, _("  -E, -r, --regexp-extended\n\
                  use extended regular expressions in the script\n\
                  (for portability use POSIX -E).\n"));
+#ifdef WITH_PCRE2
+  fprintf (out, _("  -R, --regexp-pcre2\n\
+                 use PCRE2 regular expressions in the script\n"));
+#endif
   fprintf (out, _("  -s, --separate\n\
                  consider files as separate rather than as a single,\n\
                  continuous long stream.\n"));
@@ -180,6 +184,16 @@ non-option argument is taken as the sed script to 
interpret.  All\n\
 remaining arguments are names of input files; if no input files are\n\
 specified, then the standard input is read.\n\
 \n"));
+#ifdef WITH_PCRE2
+  fprintf (out, _("Note for PCRE2 (-R) regexp: the following flags supported \
+for address\n\
+match and substitution: s - PCRE2_DOTALL (single line), x - PCRE2_EXTENDED\n\
+X - PCRE2_EXTENDED_MORE, l - PCRE2_LITERAL, u - PCRE2_MATCH_INVALID_UTF,\n\
+U - PCRE2_UNGREEDY, J - PCRE2_DUPNAMES, the nth replacement and nth\n\
+replacement with the g (global) modifier not works for PCRE2 substitutions.\n\
+The -z option (the NL char is NUL) is implemented.\n\
+\n"));
+#endif
   contact (status);
 
   ck_fclose (NULL);
@@ -189,7 +203,7 @@ specified, then the standard input is read.\n\
 int
 main (int argc, char **argv)
 {
-#define SHORTOPTS "bsnrzuEe:f:l:i::V:"
+#define SHORTOPTS "bsnrRzuEe:f:l:i::V:"
 
   enum { SANDBOX_OPTION = CHAR_MAX+1,
          DEBUG_OPTION
@@ -198,6 +212,9 @@ main (int argc, char **argv)
   static const struct option longopts[] = {
     {"binary", 0, NULL, 'b'},
     {"regexp-extended", 0, NULL, 'r'},
+#ifdef WITH_PCRE2
+    {"regexp-pcre2", 0, NULL, 'R'},
+#endif
     {"debug", 0, NULL, DEBUG_OPTION},
     {"expression", 1, NULL, 'e'},
     {"file", 1, NULL, 'f'},
@@ -322,6 +339,10 @@ main (int argc, char **argv)
           extended_regexp_flags = REG_EXTENDED;
           break;
 
+        case 'R':
+          extended_regexp_flags = REG_PCRE2;
+          break;
+
         case 's':
           separate_files = true;
           break;
diff --git a/sed/sed.h b/sed/sed.h
index d57cc9a..b23a072 100644
--- a/sed/sed.h
+++ b/sed/sed.h
@@ -21,6 +21,18 @@
 #include "regex.h"
 #include <stdio.h>
 #include "unlocked-io.h"
+#ifdef WITH_PCRE2
+#define PCRE2_CODE_UNIT_WIDTH 8
+#include <pcre2.h>
+#define REG_PCRE2 2
+#define REG_PCRE2_DOTALL            0x00100u
+#define REG_PCRE2_EXTENDED          0x00200u
+#define REG_PCRE2_EXTENDED_MORE     0x00400u
+#define REG_PCRE2_LITERAL           0x00800u
+#define REG_PCRE2_MATCH_INVALID_UTF 0x01000u
+#define REG_PCRE2_UNGREEDY          0x02000u
+#define REG_PCRE2_DUPNAMES          0x04000u
+#endif
 
 #include "utils.h"
 
@@ -55,6 +67,10 @@ struct regex {
   bool begline;
   bool endline;
   char re[1];
+#ifdef WITH_PCRE2
+  pcre2_code *rex;
+  pcre2_compile_context *cctx;
+#endif
 };
 
 struct readcmd {
@@ -124,6 +140,10 @@ struct replacement {
 struct subst {
   struct regex *regx;
   struct replacement *replacement;
+#ifdef WITH_PCRE2
+  PCRE2_SPTR rplcmnt;
+  PCRE2_SIZE rlength;
+#endif
   intmax_t numb;       /* if >0, only substitute for match number "numb" */
   struct output *outf; /* 'w' option given */
   unsigned global : 1; /* 'g' option given */
-- 
2.39.5




reply via email to

[Prev in Thread] Current Thread [Next in Thread]