>From 08ee89a89d6912c5872a1785b9079d943ad71623 Mon Sep 17 00:00:00 2001
From: Assaf Gordon
Date: Thu, 7 Feb 2013 11:46:22 -0500
Subject: [PATCH] uniq: support uniq-by-field

src/uniq.c: add --field and --check-fields=N support
---
 src/uniq.c |   61 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 60 insertions(+), 1 deletions(-)

diff --git a/src/uniq.c b/src/uniq.c
index 5efdad7..b7c3dc8 100644
--- a/src/uniq.c
+++ b/src/uniq.c
@@ -63,6 +63,9 @@ static size_t skip_chars;
 /* Number of chars to compare. */
 static size_t check_chars;
 
+/* Number of fields to compare; 0 means do not limit by field count.  */
+static size_t check_fields;
+
 enum countmode
 {
   count_occurrences,		/* -c Print count before output lines. */
@@ -108,6 +111,13 @@ static enum delimit_method const delimit_method_map[] =
 /* Select whether/how to delimit groups of duplicate lines.  */
 static enum delimit_method delimit_groups;
 
+/* For long options that have no equivalent short option, use a
+   non-character as a pseudo short option, starting with CHAR_MAX + 1.  */
+enum
+{
+  UNIQ_FIELD = CHAR_MAX + 1,
+};
+
 static struct option const longopts[] =
 {
   {"count", no_argument, NULL, 'c'},
@@ -118,6 +128,8 @@ static struct option const longopts[] =
   {"skip-fields", required_argument, NULL, 'f'},
   {"skip-chars", required_argument, NULL, 's'},
   {"check-chars", required_argument, NULL, 'w'},
+  {"check-fields", required_argument, NULL, 'y'},
+  {"field", required_argument, NULL, UNIQ_FIELD},
   {"zero-terminated", no_argument, NULL, 'z'},
   {GETOPT_HELP_OPTION_DECL},
   {GETOPT_VERSION_OPTION_DECL},
@@ -153,6 +165,8 @@ With no options, matching lines are merged to the first occurrence.\n\
                         delimit-method={none(default),prepend,separate}\n\
                         Delimiting is done with blank lines\n\
   -f, --skip-fields=N   avoid comparing the first N fields\n\
+      --field=N         check only field N.\n\
+                        equivalent to '-f (N-1) -y 1'\n\
   -i, --ignore-case     ignore differences in case when comparing\n\
   -s, --skip-chars=N    avoid comparing the first N characters\n\
   -u, --unique          only print unique lines\n\
@@ -160,6 +174,7 @@ With no options, matching lines are merged to the first occurrence.\n\
 "), stdout);
       fputs (_("\
   -w, --check-chars=N   compare no more than N characters in lines\n\
+  -y, --check-fields=N  compare no more than N fields in lines\n\
 "), stdout);
       fputs (HELP_OPTION_DESCRIPTION, stdout);
       fputs (VERSION_OPTION_DESCRIPTION, stdout);
@@ -225,6 +240,27 @@ find_field (struct linebuffer const *line)
   return line->buffer + i;
 }
 
+/* Return the number of leading bytes of STR (which holds MAXLEN bytes)
+   spanned by its first 'check_fields' fields, where a field is a run of
+   blanks followed by a run of non-blanks.  Return MAXLEN if STR holds
+   fewer fields than that.  */
+static size_t _GL_ATTRIBUTE_PURE
+check_fields_length (char const *str, size_t maxlen)
+{
+  size_t count;
+  size_t i = 0;
+
+  for (count = 0; count < check_fields && i < maxlen; count++)
+    {
+      while (i < maxlen && isblank (to_uchar (str[i])))
+        i++;
+      while (i < maxlen && !isblank (to_uchar (str[i])))
+        i++;
+    }
+
+  return i;
+}
+
 /* Return false if two strings OLD and NEW match, true if not.
    OLD and NEW point not to the beginnings of the lines
    but rather to the beginnings of the fields to compare.
@@ -312,6 +348,8 @@ check_file (const char *infile, const char *outfile, char delimiter)
             break;
           thisfield = find_field (thisline);
           thislen = thisline->length - 1 - (thisfield - thisline->buffer);
+          if (check_fields)
+            thislen = check_fields_length (thisfield, thislen);
           if (prevline->length == 0
               || different (thisfield, prevfield, thislen, prevlen))
             {
@@ -335,6 +373,8 @@ check_file (const char *infile, const char *outfile, char delimiter)
         goto closefiles;
       prevfield = find_field (prevline);
       prevlen = prevline->length - 1 - (prevfield - prevline->buffer);
+      if (check_fields)
+        prevlen = check_fields_length (prevfield, prevlen);
 
       while (!feof (stdin))
         {
@@ -349,6 +389,8 @@ check_file (const char *infile, const char *outfile, char delimiter)
             }
           thisfield = find_field (thisline);
           thislen = thisline->length - 1 - (thisfield - thisline->buffer);
+          if (check_fields)
+            thislen = check_fields_length (thisfield, thislen);
           match = !different (thisfield, prevfield, thislen, prevlen);
           match_count += match;
 
@@ -429,6 +471,7 @@ main (int argc, char **argv)
   skip_chars = 0;
   skip_fields = 0;
   check_chars = SIZE_MAX;
+  check_fields = 0;
   output_unique = output_first_repeated = true;
   output_later_repeated = false;
   countmode = count_none;
@@ -443,7 +486,7 @@ main (int argc, char **argv)
       if (optc == -1
           || (posixly_correct && nfiles != 0)
           || ((optc = getopt_long (argc, argv,
-                                   "-0123456789Dcdf:is:uw:z", longopts, NULL))
+                                   "-0123456789Dcdf:is:uw:y:z", longopts, NULL))
               == -1))
         {
           if (argc <= optind)
@@ -539,6 +582,22 @@ main (int argc, char **argv)
                                   N_("invalid number of bytes to compare"));
           break;
 
+        case 'y':
+          check_fields = size_opt (optarg,
+                                   N_("invalid number of fields to compare"));
+          if (check_fields == 0)
+            error (EXIT_FAILURE, 0, _("invalid number of fields to compare"));
+          break;
+
+        case UNIQ_FIELD:
+          skip_fields = size_opt (optarg, N_("invalid field number"));
+          if (skip_fields == 0)
+            error (EXIT_FAILURE, 0, _("invalid field number"));
+          /* Users give a 1-based field number; skip the fields before it.  */
+          skip_fields--;
+          check_fields = 1;
+          break;
+
         case 'z':
           delimiter = '\0';
           break;
-- 
1.7.7.4