From 074614c0764c278e8abd9d41af4ce626fefd6cfc Mon Sep 17 00:00:00 2001
From: Assaf Gordon
Date: Wed, 6 Feb 2013 16:40:00 -0500
Subject: [PATCH] csplit: split files by field-change

src/csplit.c: create a new output file whenever field content changes.
---
Review notes (ignored by "git am"):
  * Context-line whitespace was reconstructed from a whitespace-collapsed
    copy of this patch; apply with "git am --ignore-whitespace" or
    "patch -l".
  * --delimiter is registered in longopts but is still not parsed in
    main; see the NOTE(review) comment next to 'delimiter'.

 src/csplit.c |  258 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 251 insertions(+), 7 deletions(-)

diff --git a/src/csplit.c b/src/csplit.c
index 22f3ad4..ec725d2 100644
--- a/src/csplit.c
+++ b/src/csplit.c
@@ -44,6 +44,14 @@
 /* The default prefix for output file names. */
 #define DEFAULT_PREFIX "xx"
 
+/* How a control record decides where to split.  */
+enum csplit_type
+  {
+    CSPLIT_LINE,
+    CSPLIT_REGEXPR,
+    CSPLIT_FIELD_CHANGE
+  };
+
 /* A compiled pattern arg. */
 struct control
 {
@@ -53,8 +61,9 @@ struct control
   int argnum; /* ARGV index. */
   bool repeat_forever; /* True if '*' used as a repeat count. */
   bool ignore; /* If true, produce no output (for regexp). */
-  bool regexpr; /* True if regular expression was used. */
+  enum csplit_type type; /* Split type: line/regex/field. */
   struct re_pattern_buffer re_compiled; /* Compiled regular expression. */
+  uintmax_t field; /* Field to monitor for change. */
 };
 
 /* Initial size of data area in buffers. */
@@ -176,6 +185,24 @@ static size_t control_used;
 /* The set of signals that are caught. */
 static sigset_t caught_signals;
 
+/* If delimiter has this value, blanks separate fields.  */
+enum { DELIMITER_DEFAULT = CHAR_MAX + 1 };
+
+/* The delimiter to use for field extraction.
+   NOTE(review): this patch adds "--delimiter" to longopts, but none of
+   its hunks adds 'd' to the getopt option string or a "case 'd':"
+   handler, so nothing in the patch ever assigns this variable.  */
+static int delimiter = DELIMITER_DEFAULT;
+
+/* The content of the field from the last line, to be compared with the
+   current line.  */
+static struct cstring last_field;
+
+/* True once last_field holds the field value of the current piece.
+   A separate flag is needed because an empty field (length 0) is a
+   legitimate value.  */
+static bool last_field_valid;
+
 static struct option const longopts[] =
 {
   {"digits", required_argument, NULL, 'n'},
@@ -185,6 +212,7 @@ static struct option const longopts[] =
   {"elide-empty-files", no_argument, NULL, 'z'},
   {"prefix", required_argument, NULL, 'f'},
   {"suffix-format", required_argument, NULL, 'b'},
+  {"delimiter", required_argument, NULL, 'd'},
   {GETOPT_HELP_OPTION_DECL},
   {GETOPT_VERSION_OPTION_DECL},
   {NULL, 0, NULL, 0}
@@ -867,6 +895,181 @@ process_regexp (struct control *p, uintmax_t repetition)
     current_line = break_line;
 }
 
+/* Skip FIELDS fields in the first LEN bytes of BUF.
+   Return a pointer to the delimiter that ends the last skipped field,
+   or NULL if the end of the buffer is reached first.
+
+   With an explicit delimiter, a leading delimiter counts as an empty
+   first field.  A run of delimiters (or of blanks, in the default
+   mode) is skipped as a single separator.
+
+   NOTE: BUF is *not* expected to be a NUL-terminated string;
+   its end is determined solely by LEN.  */
+static inline char *
+__attribute ((pure))
+skip_fields (char *buf, size_t len, uintmax_t fields)
+{
+  char *ptr = buf;
+
+  if (delimiter != DELIMITER_DEFAULT)
+    {
+      /* Never examine *PTR when no bytes remain.  */
+      if (len && fields && *ptr == delimiter)
+        fields--;
+      for (; len && fields; fields--)
+        {
+          while (len && *ptr == delimiter)
+            {
+              ++ptr;
+              --len;
+            }
+          while (len && *ptr != delimiter)
+            {
+              ++ptr;
+              --len;
+            }
+        }
+    }
+  else
+    for (; len && fields; fields--)
+      {
+        /* <ctype.h> functions need an unsigned char value.  */
+        while (len && isblank ((unsigned char) *ptr))
+          {
+            ++ptr;
+            --len;
+          }
+        while (len && !isblank ((unsigned char) *ptr))
+          {
+            ++ptr;
+            --len;
+          }
+      }
+
+  return len ? ptr : NULL;
+}
+
+/* Remember the LEN bytes at STR as the field value of the current piece.  */
+static void
+set_last_field (const char *str, size_t len)
+{
+  if (len)
+    {
+      last_field.str = xrealloc (last_field.str, len);
+      memcpy (last_field.str, str, len);
+    }
+  last_field.len = len;
+  last_field_valid = true;
+}
+
+/* Forget the remembered field value; its buffer is kept for reuse.  */
+static void
+reset_last_field (void)
+{
+  last_field.len = 0;
+  last_field_valid = false;
+}
+
+/* Release the storage used for the remembered field value.  */
+static void
+free_last_field (void)
+{
+  reset_last_field ();
+  free (last_field.str);
+  last_field.str = NULL;
+}
+
+/* Return true if the LEN bytes at STR equal the remembered field value.  */
+static bool
+same_as_last_field (const char *str, size_t len)
+{
+  return (last_field.len == len
+          && (len == 0 || memcmp (last_field.str, str, len) == 0));
+}
+
+/* Copy input lines to a new output file for as long as field P->field
+   keeps the value it has on the first of those lines.  The first line
+   whose field differs is left in the input for whatever runs next.  */
+static void
+process_field_change (struct control *p)
+{
+  create_output_file ();
+
+  reset_last_field ();
+
+  while (true)
+    {
+      char *field_start;
+      char *field_end;
+      size_t field_len;
+      size_t line_len;
+      size_t eol_len;           /* Length from field_start to EOL.  */
+      struct cstring *line = find_line (++current_line);
+
+      if (line == NULL)
+        {
+          /* No more input lines.  */
+          if (p->repeat_forever)
+            {
+              dump_rest_of_file ();
+              close_output_file ();
+              exit (EXIT_SUCCESS);
+            }
+          else
+            {
+              error (0, 0, _("not enough input lines for pattern %d "
+                             "field @%" PRIuMAX), p->argnum, p->field);
+              cleanup_fatal ();
+            }
+        }
+
+      /* Ignore the trailing newline, if any.  */
+      line_len = line->len;
+      if (line_len && line->str[line_len - 1] == '\n')
+        --line_len;
+
+      /* Find the beginning of the field.  */
+      if (p->field > 1)
+        {
+          field_start = skip_fields (line->str, line_len, p->field - 1);
+          if (field_start == NULL)
+            {
+              error (0, 0, _("not enough input fields on line %" PRIuMAX
+                             " (looking for field @%" PRIuMAX ")"),
+                     (uintmax_t) current_line, p->field);
+              cleanup_fatal ();
+            }
+          ++field_start;        /* Skip the delimiter.  */
+        }
+      else
+        field_start = line->str;
+
+      /* Find the end of the field.  */
+      eol_len = line_len - (field_start - line->str);
+      field_end = skip_fields (field_start, eol_len, 1);
+      if (field_end == NULL)
+        field_len = eol_len;
+      else
+        field_len = field_end - field_start;
+
+      if (last_field_valid && !same_as_last_field (field_start, field_len))
+        {
+          /* Field changed: stop, but use the same line next time.  */
+          --current_line;
+          break;
+        }
+
+      /* First line of this piece: remember its field value.  */
+      if (!last_field_valid)
+        set_last_field (field_start, field_len);
+
+      line = remove_line ();
+      save_line_to_file (line);
+    }
+
+  close_output_file ();
+}
+
 /* Split the input file according to the control records we have built. */
 
 static void
@@ -877,17 +1080,25 @@ split_file (void)
   for (i = 0; i < control_used; i++)
     {
       uintmax_t j;
-      if (controls[i].regexpr)
+      switch (controls[i].type)
         {
+        case CSPLIT_REGEXPR:
           for (j = 0; (controls[i].repeat_forever
                        || j <= controls[i].repeat); j++)
             process_regexp (&controls[i], j);
-        }
-      else
-        {
+          break;
+
+        case CSPLIT_LINE:
           for (j = 0; (controls[i].repeat_forever
                        || j <= controls[i].repeat); j++)
             process_line_count (&controls[i], j);
+          break;
+
+        case CSPLIT_FIELD_CHANGE:
+          for (j = 0; (controls[i].repeat_forever
+                       || j <= controls[i].repeat); j++)
+            process_field_change (&controls[i]);
+          break;
         }
     }
 
@@ -1039,7 +1250,7 @@ new_control_record (void)
   if (control_used == control_allocated)
     controls = X2NREALLOC (controls, &control_allocated);
   p = &controls[control_used++];
-  p->regexpr = false;
+  p->type = CSPLIT_LINE;
   p->repeat = 0;
   p->repeat_forever = false;
   p->lines_required = 0;
@@ -1116,7 +1327,7 @@ extract_regexp (int argnum, bool ignore, char const *str)
   p->argnum = argnum;
   p->ignore = ignore;
 
-  p->regexpr = true;
+  p->type = CSPLIT_REGEXPR;
   p->re_compiled.buffer = NULL;
   p->re_compiled.allocated = 0;
   p->re_compiled.fastmap = xmalloc (UCHAR_MAX + 1);
@@ -1136,6 +1347,30 @@ extract_regexp (int argnum, bool ignore, char const *str)
   return p;
 }
 
+/* Parse the field number that follows the leading '@' in STR and
+   return a new control record for that field-change pattern.
+   ARGNUM is the ARGV index of STR.  */
+static struct control *
+extract_field_pattern (int argnum, char const *str)
+{
+  struct control *p;
+  uintmax_t val = 0;
+
+  p = new_control_record ();
+  p->argnum = argnum;
+  p->type = CSPLIT_FIELD_CHANGE;
+
+  if (xstrtoumax (str + 1, NULL, 10, &val, "") != LONGINT_OK)
+    error (EXIT_FAILURE, 0, _("%s: invalid field number"), str + 1);
+  if (val == 0)
+    error (EXIT_FAILURE, 0,
+           _("%s: field number must be greater than zero"),
+           str + 1);
+  p->field = val;
+
+  return p;
+}
+
 /* Extract the break patterns from args START through ARGC - 1 of ARGV.
    After each pattern, check if the next argument is a repeat count. */
 
@@ -1154,6 +1389,11 @@ parse_patterns (int argc, int start, char **argv)
           p = extract_regexp (i, *argv[i] == '%', argv[i]);
         }
       else
+      if (*argv[i] == '@')
+        {
+          p = extract_field_pattern (i, argv[i]);
+        }
+      else
         {
           p = new_control_record ();
           p->argnum = i;
@@ -1432,6 +1672,8 @@ main (int argc, char **argv)
 
   split_file ();
 
+  free_last_field ();
+
   if (close (STDIN_FILENO) != 0)
     {
       error (0, errno, _("read error"));
@@ -1461,6 +1703,7 @@ and output byte counts of each piece to standard output.\n\
 
       fputs (_("\
   -b, --suffix-format=FORMAT  use sprintf FORMAT instead of %02d\n\
+  -d, --delimiter=X          use X instead of whitespace for field delimiter\n\
   -f, --prefix=PREFIX        use PREFIX instead of 'xx'\n\
   -k, --keep-files           do not remove output files on errors\n\
 "), stdout);
@@ -1480,6 +1723,7 @@ Read standard input if FILE is -. Each PATTERN may be:\n\
   INTEGER            copy up to but not including specified line number\n\
   /REGEXP/[OFFSET]   copy up to but not including a matching line\n\
   %REGEXP%[OFFSET]   skip to, but not including a matching line\n\
+  @N                 start a new file every time field N changes\n\
   {INTEGER}          repeat the previous pattern specified number of times\n\
   {*}                repeat the previous pattern as many times as possible\n\
 \n\
-- 
1.7.7.4