>From 707f4c3588bf265d4145d1c3fceb1d3d6806c6c6 Mon Sep 17 00:00:00 2001 From: Assaf Gordon Date: Thu, 6 Jan 2022 15:36:43 -0700 Subject: [PATCH 8/9] cut: implement -F --- src/cut.c | 124 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 117 insertions(+), 7 deletions(-) diff --git a/src/cut.c b/src/cut.c index 4e86953d3..7da0c131f 100644 --- a/src/cut.c +++ b/src/cut.c @@ -32,10 +32,12 @@ #include #include "system.h" +#include "die.h" #include "error.h" #include "fadvise.h" #include "getndelim2.h" #include "hash.h" +#include "regex.h" #include "set-fields.h" @@ -128,6 +130,14 @@ static bool allow_duplicates; (which is always added by set_fields() */ static struct field_range_pair *last_frp; +/* With "-F", the input delimiter (-d) can be a regex string, not + just a single character. Keep the string here. */ +static char* delim_str; + +/* With "-F", this is the compiled regex */ +static bool delim_use_regex; +static struct re_pattern_buffer delim_regex; + /* For long options that have no equivalent short option, use a non-character as a pseudo short option, starting with CHAR_MAX + 1. 
 */ enum @@ -140,6 +150,7 @@ static struct option const longopts[] = {"bytes", required_argument, NULL, 'b'}, {"characters", required_argument, NULL, 'c'}, {"fields", required_argument, NULL, 'f'}, + {"regex-fields", required_argument, NULL, 'F'}, {"delimiter", required_argument, NULL, 'd'}, {"allow-duplicates", required_argument, NULL, 'D'}, {"only-delimited", no_argument, NULL, 's'}, @@ -183,6 +194,10 @@ Print selected parts of lines from each FILE to standard output.\n\ that contains no delimiter character, unless\n\ the -s option is specified\n\ -n (ignored)\n\ +"), stdout); + fputs (_("\ + -F, --regex-fields=LIST select only these fields; treat -d DELIM as a\n\ + regular expression delimiter\n\ "), stdout); fputs (_("\ --complement complement the set of selected bytes, characters\n\ @@ -200,7 +215,7 @@ Print selected parts of lines from each FILE to standard output.\n\ fputs (VERSION_OPTION_DESCRIPTION, stdout); fputs (_("\ \n\ -Use one, and only one of -b, -c or -f. Each LIST is made up of one\n\ +Use one, and only one of -b, -c, -f or -F. Each LIST is made up of one\n\ range, or many ranges separated by commas. Selected input is written\n\ in the same order that it is read, and is written exactly once.\n\ "), stdout); @@ -451,11 +466,56 @@ cut_adv_fields (char* linebuf, size_t len) /* Split into fields */ char *p = linebuf; + char *endp ; size_t l = len; idx_t fld = 0 ; while (true) { - char *endp = memchr (p, delim, l); + if (delim_use_regex) + { + #if 0 + fprintf(stderr,"Running regex exec, beg = '%c'\n", *p); + #endif + + struct re_registers regs; + memset (&regs, 0, sizeof regs); + regoff_t i = re_search (&delim_regex, p, l, 0, l, &regs); + + if (i == -2) + FATAL_ERROR (_("regex search failed")); + + #if 0 + fprintf(stderr,"re_search returned %ld, num-reg = %zu\n", i, regs.num_regs); + for (int j=0;j= 0) + { + /* The matched regex register is the location of the + delimiting string. 
Add NUL at the start (to + terminate the preceding field) and set ENDP to the + end of it (one octet before the next field) */ + const regoff_t s = regs.start[0]; + const regoff_t e = regs.end[0]; + *(p+s) = '\0'; + endp = p+e-1; + } + else + { + endp = 0; + } + free (regs.start); + free (regs.end); + } + else + { + endp = memchr (p, delim, l); + } /* NUL-terminate the field if not the last */ if (endp) @@ -674,7 +734,7 @@ main (int argc, char **argv) delim = '\0'; have_read_stdin = false; - while ((optc = getopt_long (argc, argv, "b:c:d:Df:nO:sz", longopts, NULL)) != -1) + while ((optc = getopt_long (argc, argv, "b:c:d:Df:F:nO:sz", longopts, NULL)) != -1) { switch (optc) { @@ -695,6 +755,16 @@ main (int argc, char **argv) spec_list_string = optarg; break; + case 'F': + /* Build the field list. */ + if (operating_mode != undefined_mode) + FATAL_ERROR (_("only one type of list may be specified")); + operating_mode = field_mode; + adv_mode = true; + spec_list_string = optarg; + delim_use_regex = true; + break; + case 'D': adv_mode = true; allow_duplicates = true; @@ -702,10 +772,7 @@ main (int argc, char **argv) case 'd': /* New delimiter. */ - /* Interpret -d '' to mean 'use the NUL byte as the delimiter.' */ - if (optarg[0] != '\0' && optarg[1] != '\0') - FATAL_ERROR (_("the delimiter must be a single character")); - delim = optarg[0]; + delim_str = xstrdup (optarg); delim_specified = true; break; @@ -742,6 +809,49 @@ main (int argc, char **argv) } } + if (operating_mode == field_mode && delim_use_regex && !delim_specified) + { + /* Default delimiter for -F (regex delimiter) is whitespace */ + delim_str = xstrdup("[ \t]+"); + delim_specified = true; + } + + /* '-d DELIM' validation */ + if (delim_specified) + { + if (operating_mode == field_mode && delim_use_regex) + { + /* in -F/--regex-fields mode, DELIM can be a non-empty string and + a valid regex. 
*/ + if (strlen (delim_str)==0) + FATAL_ERROR (_("delimiter string must not be empty with -F")); + + /* FIXME: What are the correct flags compared to busybox/toybox? */ + re_set_syntax (RE_SYNTAX_EGREP | RE_HAT_LISTS_NOT_NEWLINE | RE_ICASE); + memset (&delim_regex, 0, sizeof delim_regex); + const char *s = re_compile_pattern (delim_str, strlen (delim_str), &delim_regex); + if (s) + die (EXIT_FAILURE, 0, _("regex error: %s"), s); + + /* Default output delimiter is one space */ + if (!output_delimiter_specified) + { + output_delimiter_specified = true; + output_delimiter_string = xstrdup (" "); + output_delimiter_length = 1 ; + } + + } + else + { + /* Interpret -d '' to mean 'use the NUL byte as the delimiter.' */ + if (delim_str[0] != '\0' && delim_str[1] != '\0') + FATAL_ERROR (_("the delimiter must be a single character")); + delim = delim_str[0]; + } + } + + if (operating_mode == undefined_mode) FATAL_ERROR (_("you must specify a list of bytes, characters, or fields")); -- 2.20.1