[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH] new feature: add PCRE2 regexps with libpcre2 (https://pcre2project.github.io/pcre2)
From: |
Gergely Szasz |
Subject: |
[PATCH] new feature: add PCRE2 regexps with libpcre2 (https://pcre2project.github.io/pcre2) |
Date: |
Thu, 20 Feb 2025 22:35:50 +0000 |
Hi,
Here is a patch that lets sed use libpcre2 for PCRE2 regular expressions.
For example, the following command prints only the lines that contain a "q"
(matched case-insensitively) that is not followed by a "u":
echo -ne 'Qingdao\nQuezon City\nQueens\nQuito\nQom\nQiqihar\nQinhuangdao\nQuetta\nQuebec\nQuilmes\nQionghai\nQazvin\nQarchak\nQina\nQuthbullapur\nQarshi\nQui Nhon\nQuelimane\nQo`qon\nQuanzhou\nQianjiang\nQapqal\n' | sed/sed -nRe '/q(?!u)/Ip'
Best regards,
Gergely
------------------------------------------------------------------------------
* configure.ac: add --with-pcre2 option
* sed/sed.h: conditionally include pcre2.h and define PCRE2 related macros
extend the regex structure with PCRE2 things
extend the replacement structure with PCRE2 things
* sed/sed.c (usage): add some text about PCRE2 to help
add a new command line switch -R to select PCRE2 style regexps
* sed/compile.c (mark_subst_opts): add new s/// flags for PCRE2: s x X l u U J,
corresponding to the (?.) options; X corresponds to (?xx)
(compile_address): add same flags for address regexps //
(compile_program): prepare PCRE2 replacements
(finish_program): free PCRE2 replacements
* sed/execute.c (do_subst): add PCRE2 substitution
the output buffer handling of pcre2_substitute() is far from perfect,
so we allocate a fairly large (128k) output buffer the first time to avoid
a second scan of the line
(process_files): free PCRE2 substitute output buffer
* sed/regexp.c (compile_regex): compiling PCRE2 regexp
(match_regex): PCRE2 match
(release_regex): free PCRE2 compiled program and context
Replacing only the Nth match of a regexp in a line is not implemented for PCRE2.
---
configure.ac | 20 +++++++++
sed/compile.c | 111 ++++++++++++++++++++++++++++++++++++++++++++++++++
sed/execute.c | 56 +++++++++++++++++++++++++
sed/regexp.c | 49 ++++++++++++++++++++++
sed/sed.c | 23 ++++++++++-
sed/sed.h | 20 +++++++++
6 files changed, 278 insertions(+), 1 deletion(-)
diff --git a/configure.ac b/configure.ac
index 220f406..06ed96c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -118,6 +118,26 @@ AC_CHECK_FUNCS_ONCE(isatty isascii memcpy strchr strtoul
readlink
AM_CONDITIONAL([TEST_SYMLINKS],
[test "$ac_cv_func_readlink" = yes])
+dnl PCRE
+dnl Do you want to look for pcre support?
+AC_ARG_WITH(pcre2,
+AS_HELP_STRING([--with-pcre2],[enable the search for the pcre2 library (may
create run-time library dependencies)]))
+
+AC_ARG_VAR(PCRE_CONFIG, [pathname of pcre2-config if it is not in PATH])
+if test x"$with_pcre2" = xyes; then
+ AC_CHECK_PROG([PCRE_CONFIG], pcre2-config, pcre2-config)
+ if test "x$PCRE_CONFIG" = x; then
+ with_pcre2=no
+ AC_MSG_WARN([pcre2-config not found: pcre2 is disabled.])
+ AC_MSG_NOTICE(
+ [Set PCRE_CONFIG to pathname of pcre2-config if it is not in PATH.])
+ fi
+fi
+if test x"$with_pcre2" = xyes; then
+ LIBS="`pcre2-config --libs8` $LIBS"
+ CFLAGS="`pcre2-config --cflags` $CFLAGS"
+ AC_DEFINE([WITH_PCRE2], [1], [Define to 1 if PCRE2 enabled.])
+fi
AC_ARG_ENABLE(i18n,
[ --disable-i18n disable internationalization (default=enabled)], ,
diff --git a/sed/compile.c b/sed/compile.c
index 46b0bed..11a0857 100644
--- a/sed/compile.c
+++ b/sed/compile.c
@@ -525,6 +525,50 @@ mark_subst_opts (struct subst *cmd)
flags |= REG_NEWLINE;
break;
+#ifdef WITH_PCRE2
+ case 's': /* PCRE2 extension */
+ if (extended_regexp_flags != REG_PCRE2)
+ bad_prog ("unknown option to 's'");
+ flags |= REG_PCRE2_DOTALL;
+ break;
+
+ case 'x': /* PCRE2 extension */
+ if (extended_regexp_flags != REG_PCRE2)
+ bad_prog ("unknown option to 's'");
+ flags |= REG_PCRE2_EXTENDED;
+ break;
+
+ case 'X': /* PCRE2 extension */
+ if (extended_regexp_flags != REG_PCRE2)
+ bad_prog ("unknown option to 's'");
+ flags |= REG_PCRE2_EXTENDED_MORE;
+ break;
+
+ case 'l': /* PCRE2 extension */
+ if (extended_regexp_flags != REG_PCRE2)
+ bad_prog ("unknown option to 's'");
+ flags |= REG_PCRE2_LITERAL;
+ break;
+
+ case 'u': /* PCRE2 extension */
+ if (extended_regexp_flags != REG_PCRE2)
+ bad_prog ("unknown option to 's'");
+ flags |= REG_PCRE2_MATCH_INVALID_UTF;
+ break;
+
+ case 'U': /* PCRE2 extension */
+ if (extended_regexp_flags != REG_PCRE2)
+ bad_prog ("unknown option to 's'");
+ flags |= REG_PCRE2_UNGREEDY;
+ break;
+
+ case 'J': /* PCRE2 extension */
+ if (extended_regexp_flags != REG_PCRE2)
+ bad_prog ("unknown option to 's'");
+ flags |= REG_PCRE2_DUPNAMES;
+ break;
+#endif
+
case 'e':
if (posixicity == POSIXLY_BASIC)
bad_prog ("unknown option to 's'");
@@ -827,8 +871,14 @@ compile_address (struct addr *addr, int ch)
for (;;)
{
ch = in_nonblank ();
+#ifdef WITH_PCRE2
+ if (extended_regexp_flags != REG_PCRE2 &&
+ posixicity == POSIXLY_BASIC)
+ goto posix_address_modifier;
+#else
if (posixicity == POSIXLY_BASIC)
goto posix_address_modifier;
+#endif
switch (ch)
{
case 'I': /* GNU extension */
@@ -838,7 +888,49 @@ compile_address (struct addr *addr, int ch)
case 'M': /* GNU extension */
flags |= REG_NEWLINE;
break;
+#ifdef WITH_PCRE2
+ case 's': /* PCRE2 extension */
+ if (extended_regexp_flags != REG_PCRE2)
+ goto posix_address_modifier;
+ flags |= REG_PCRE2_DOTALL;
+ break;
+ case 'x': /* PCRE2 extension */
+ if (extended_regexp_flags != REG_PCRE2)
+ goto posix_address_modifier;
+ flags |= REG_PCRE2_EXTENDED;
+ break;
+
+ case 'X': /* PCRE2 extension */
+ if (extended_regexp_flags != REG_PCRE2)
+ goto posix_address_modifier;
+ flags |= REG_PCRE2_EXTENDED_MORE;
+ break;
+
+ case 'l': /* PCRE2 extension */
+ if (extended_regexp_flags != REG_PCRE2)
+ goto posix_address_modifier;
+ flags |= REG_PCRE2_LITERAL;
+ break;
+
+ case 'u': /* PCRE2 extension */
+ if (extended_regexp_flags != REG_PCRE2)
+ goto posix_address_modifier;
+ flags |= REG_PCRE2_MATCH_INVALID_UTF;
+ break;
+
+ case 'U': /* PCRE2 extension */
+ if (extended_regexp_flags != REG_PCRE2)
+ goto posix_address_modifier;
+ flags |= REG_PCRE2_UNGREEDY;
+ break;
+
+ case 'J': /* PCRE2 extension */
+ if (extended_regexp_flags != REG_PCRE2)
+ goto posix_address_modifier;
+ flags |= REG_PCRE2_DUPNAMES;
+ break;
+#endif
default:
posix_address_modifier:
savchar (ch);
@@ -1153,8 +1245,20 @@ compile_program (struct vector *vector)
bad_prog ("unterminated 's' command");
cur_cmd->x.cmd_subst = OB_MALLOC (&obs, 1, struct subst);
+#ifdef WITH_PCRE2
+ if (extended_regexp_flags == REG_PCRE2)
+ {
+ cur_cmd->x.cmd_subst->rplcmnt = (PCRE2_SPTR)xstrdup
(get_buffer (b2));
+ cur_cmd->x.cmd_subst->rlength = size_buffer (b2);
+ }
+ else
+ {
+#endif
setup_replacement (cur_cmd->x.cmd_subst,
get_buffer (b2), size_buffer (b2));
+#ifdef WITH_PCRE2
+ }
+#endif
free_buffer (b2);
flags = mark_subst_opts (cur_cmd->x.cmd_subst);
@@ -1609,6 +1713,13 @@ finish_program (struct vector *program)
{
case 's':
free (sc->x.cmd_subst->replacement_buffer);
+#ifdef WITH_PCRE2
+ if (extended_regexp_flags == REG_PCRE2)
+ {
+ if (sc->x.cmd_subst->rplcmnt)
+ free ((void *)sc->x.cmd_subst->rplcmnt);
+ }
+#endif
if (sc->x.cmd_subst->regx)
release_regex (sc->x.cmd_subst->regx);
break;
diff --git a/sed/execute.c b/sed/execute.c
index adc7575..dd06962 100644
--- a/sed/execute.c
+++ b/sed/execute.c
@@ -127,6 +127,11 @@ static struct line buffer;
static struct append_queue *append_head = NULL;
static struct append_queue *append_tail = NULL;
+#ifdef WITH_PCRE2
+static PCRE2_SIZE outallocd = 0;
+static PCRE2_UCHAR *outputbuffer = NULL;
+#endif
+
/* Prepare to increase LB's length by LEN, making some attempt at
keeping realloc() calls under control by padding for future growth. */
static void
@@ -1022,6 +1027,53 @@ do_subst (struct subst *sub)
line_reset (&s_accum, &line);
+#ifdef WITH_PCRE2
+ if (extended_regexp_flags == REG_PCRE2)
+ {
+ int ret;
+ PCRE2_SIZE outlength = outallocd - 1;
+
+ if (outallocd == 0 || 2 * line.length > outlength)
+ {
+ outallocd = 2 * line.length + 1;
+ if (outallocd < 128 * 1024)
+ outallocd = 128 * 1024;
+ outputbuffer = xrealloc (outputbuffer, outallocd);
+ outlength = outallocd - 1;
+ }
+
+ ret = pcre2_substitute (sub->regx->rex, (PCRE2_SPTR8)line.active,
+ line.length, start,
+ PCRE2_SUBSTITUTE_OVERFLOW_LENGTH |
+ (sub->global ? PCRE2_SUBSTITUTE_GLOBAL : 0),
+ NULL, NULL, sub->rplcmnt, sub->rlength,
+ outputbuffer, &outlength);
+ if (ret == PCRE2_ERROR_NOMEMORY)
+ {
+ outallocd = outlength + 1;
+ outputbuffer = xrealloc (outputbuffer, outallocd);
+ ret = pcre2_substitute (sub->regx->rex, (PCRE2_SPTR8)line.active,
+ line.length, start,
+ (sub->global ? PCRE2_SUBSTITUTE_GLOBAL : 0),
+ NULL, NULL, sub->rplcmnt, sub->rlength,
+ outputbuffer, &outlength);
+ }
+ if (ret < 0)
+ {
+ if (ret == PCRE2_ERROR_NOMATCH)
+ return;
+ panic (_("error in PCRE2 substitute"));
+ }
+ str_append (&s_accum, (const char *)outputbuffer, outlength);
+ s_accum.chomped = line.chomped;
+
+ /* Exchange line and s_accum. This can be much cheaper
+ than copying s_accum.active into line.text (for huge lines). */
+ line_exchange (&line, &s_accum, false);
+ replaced = true;
+ goto post_subst;
+ }
+#endif
/* The first part of the loop optimizes s/xxx// when xxx is at the
start, and s/xxx$// */
if (!match_regex (sub->regx, line.active, line.length, start,
@@ -1709,6 +1761,10 @@ process_files (struct vector *the_program, char **argv)
free (hold.text);
free (line.text);
free (s_accum.text);
+#ifdef WITH_PCRE2
+ if (extended_regexp_flags == REG_PCRE2 && outputbuffer)
+ free (outputbuffer);
+#endif
#endif /* lint */
if (input.bad_count)
diff --git a/sed/regexp.c b/sed/regexp.c
index f05c5ad..212ac0d 100644
--- a/sed/regexp.c
+++ b/sed/regexp.c
@@ -132,6 +132,10 @@ compile_regex (struct buffer *b, int flags, int needed_sub)
{
struct regex *new_regex;
idx_t re_len;
+#ifdef WITH_PCRE2
+ int errornumber;
+ PCRE2_SIZE erroroffset;
+#endif
/* // matches the last RE */
if (size_buffer (b) == 0)
@@ -146,6 +150,35 @@ compile_regex (struct buffer *b, int flags, int needed_sub)
new_regex->flags = flags;
memcpy (new_regex->re, get_buffer (b), re_len);
+#ifdef WITH_PCRE2
+ if (extended_regexp_flags == REG_PCRE2)
+ {
+ new_regex->cctx = NULL;
+ if (buffer_delimiter == 0)
+ {
+ new_regex->cctx = pcre2_compile_context_create (NULL);
+ pcre2_set_newline (new_regex->cctx, PCRE2_NEWLINE_NUL);
+ }
+ uint32_t options = (flags & REG_ICASE ? PCRE2_CASELESS : 0) |
+ (flags & REG_NEWLINE ? PCRE2_MULTILINE : 0) |
+ (flags & REG_PCRE2_DOTALL ? PCRE2_DOTALL : 0) |
+ (flags & REG_PCRE2_EXTENDED ? PCRE2_EXTENDED : 0) |
+ (flags & REG_PCRE2_EXTENDED_MORE ?
PCRE2_EXTENDED_MORE : 0) |
+ (flags & REG_PCRE2_LITERAL ? PCRE2_LITERAL : 0) |
+ (flags & REG_PCRE2_MATCH_INVALID_UTF ?
PCRE2_MATCH_INVALID_UTF : 0) |
+ (flags & REG_PCRE2_UNGREEDY ? PCRE2_UNGREEDY : 0) |
+ (flags & REG_PCRE2_DUPNAMES ? PCRE2_DUPNAMES : 0);
+ new_regex->rex = pcre2_compile ((PCRE2_SPTR8)new_regex->re, re_len,
+ options, &errornumber, &erroroffset,
+ new_regex->cctx);
+ if (new_regex->rex == NULL)
+ {
+ bad_prog ("cannot compile PCRE2 regexp");
+ return NULL;
+ }
+ return new_regex;
+ }
+#endif
/* GNU regex does not process \t & co. */
new_regex->sz = normalize_text (new_regex->re, re_len, TEXT_REGEX);
@@ -171,6 +204,15 @@ match_regex (struct regex *regex, char *buf, idx_t buflen,
else
regex_last = regex;
+#ifdef WITH_PCRE2
+ if (extended_regexp_flags == REG_PCRE2)
+ {
+ pcre2_match_data *match_data = pcre2_match_data_create (2, NULL);
+ ret = pcre2_match (regex->rex, (PCRE2_SPTR)buf, buflen,
buf_start_offset, 0, match_data, NULL);
+ pcre2_match_data_free (match_data);
+ return (ret > -1);
+ }
+#endif
regoff_t buflen_regoff;
if (ckd_add (&buflen_regoff, buflen, 0))
panic (_("regex input buffer length overflow"));
@@ -354,6 +396,13 @@ release_regex (struct regex *regex)
regex->dfa = NULL;
}
regfree (&regex->pattern);
+#ifdef WITH_PCRE2
+ if (extended_regexp_flags == REG_PCRE2)
+ {
+ pcre2_compile_context_free (regex->cctx);
+ pcre2_code_free (regex->rex);
+ }
+#endif
free (regex);
}
#endif /* lint */
diff --git a/sed/sed.c b/sed/sed.c
index 18c70a3..22314c8 100644
--- a/sed/sed.c
+++ b/sed/sed.c
@@ -162,6 +162,10 @@ Usage: %s [OPTION]... {script-only-if-no-other-script}
[input-file]...\n\
fprintf (out, _(" -E, -r, --regexp-extended\n\
use extended regular expressions in the script\n\
(for portability use POSIX -E).\n"));
+#ifdef WITH_PCRE2
+ fprintf (out, _(" -R, --regexp-pcre2\n\
+ use PCRE2 regular expressions in the script\n"));
+#endif
fprintf (out, _(" -s, --separate\n\
consider files as separate rather than as a single,\n\
continuous long stream.\n"));
@@ -180,6 +184,16 @@ non-option argument is taken as the sed script to
interpret. All\n\
remaining arguments are names of input files; if no input files are\n\
specified, then the standard input is read.\n\
\n"));
+#ifdef WITH_PCRE2
+ fprintf (out, _("Note for PCRE2 (-R) regexp: the following flags supported \
+for address\n\
+match and substitution: s - PCRE2_DOTALL (single line), x - PCRE2_EXTENDED\n\
+X - PCRE2_EXTENDED_MORE, l - PCRE2_LITERAL, u - PCRE2_MATCH_INVALID_UTF,\n\
+U - PCRE2_UNGREEDY, J - PCRE2_DUPNAMES, the nth replacement and nth\n\
+replacement with the g (global) modifier not works for PCRE2 substitutions.\n\
+The -z option (the NL char is NUL) is implemented.\n\
+\n"));
+#endif
contact (status);
ck_fclose (NULL);
@@ -189,7 +203,7 @@ specified, then the standard input is read.\n\
int
main (int argc, char **argv)
{
-#define SHORTOPTS "bsnrzuEe:f:l:i::V:"
+#define SHORTOPTS "bsnrRzuEe:f:l:i::V:"
enum { SANDBOX_OPTION = CHAR_MAX+1,
DEBUG_OPTION
@@ -198,6 +212,9 @@ main (int argc, char **argv)
static const struct option longopts[] = {
{"binary", 0, NULL, 'b'},
{"regexp-extended", 0, NULL, 'r'},
+#ifdef WITH_PCRE2
+ {"regexp-pcre2", 0, NULL, 'R'},
+#endif
{"debug", 0, NULL, DEBUG_OPTION},
{"expression", 1, NULL, 'e'},
{"file", 1, NULL, 'f'},
@@ -322,6 +339,10 @@ main (int argc, char **argv)
extended_regexp_flags = REG_EXTENDED;
break;
+ case 'R':
+ extended_regexp_flags = REG_PCRE2;
+ break;
+
case 's':
separate_files = true;
break;
diff --git a/sed/sed.h b/sed/sed.h
index d57cc9a..b23a072 100644
--- a/sed/sed.h
+++ b/sed/sed.h
@@ -21,6 +21,18 @@
#include "regex.h"
#include <stdio.h>
#include "unlocked-io.h"
+#ifdef WITH_PCRE2
+#define PCRE2_CODE_UNIT_WIDTH 8
+#include <pcre2.h>
+#define REG_PCRE2 2
+#define REG_PCRE2_DOTALL 0x00100u
+#define REG_PCRE2_EXTENDED 0x00200u
+#define REG_PCRE2_EXTENDED_MORE 0x00400u
+#define REG_PCRE2_LITERAL 0x00800u
+#define REG_PCRE2_MATCH_INVALID_UTF 0x01000u
+#define REG_PCRE2_UNGREEDY 0x02000u
+#define REG_PCRE2_DUPNAMES 0x04000u
+#endif
#include "utils.h"
@@ -55,6 +67,10 @@ struct regex {
bool begline;
bool endline;
char re[1];
+#ifdef WITH_PCRE2
+ pcre2_code *rex;
+ pcre2_compile_context *cctx;
+#endif
};
struct readcmd {
@@ -124,6 +140,10 @@ struct replacement {
struct subst {
struct regex *regx;
struct replacement *replacement;
+#ifdef WITH_PCRE2
+ PCRE2_SPTR rplcmnt;
+ PCRE2_SIZE rlength;
+#endif
intmax_t numb; /* if >0, only substitute for match number "numb" */
struct output *outf; /* 'w' option given */
unsigned global : 1; /* 'g' option given */
--
2.39.5
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [PATCH] new feature: add PCRE2 regexps with libpcre2 (https://pcre2project.github.io/pcre2),
Gergely Szasz <=