[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
RFE: uniq --sequential
From: |
Daiki Ueno |
Subject: |
RFE: uniq --sequential |
Date: |
Thu, 11 Jun 2015 06:04:56 +0900 |
User-agent: |
Gnus/5.13 (Gnus v5.13) Emacs/24.4 (gnu/linux) |
Hello,
I occasionally have to deal with sequences of numbers which are largely
contiguous, but contain gaps (e.g., Unicode code points).
To detect gaps, I usually write a shell-script loop, which is not
trivial. So, I thought that it would be handy if this were supported by
coreutils, like this:
$ { seq 1 10; seq 12 22; seq 26 34; } | uniq --sequential
1
12
26
or, a more practical use-case:
$ wc -l UnicodeData.txt
27268 UnicodeData.txt
$ cut -f1 -d';' UnicodeData.txt | sed 's/^/0x/' | uniq --sequential | wc -l
612
where contiguous numbers are treated as duplicates. I'm attaching a
patch which implements this.
Comments appreciated.
Regards,
--
Daiki Ueno
>From 0378c2e3e35fddee69a6e40d2b5fda4c27765d9d Mon Sep 17 00:00:00 2001
From: Daiki Ueno <address@hidden>
Date: Wed, 10 Jun 2015 11:11:23 +0900
Subject: [PATCH] uniq: add the --sequential option
* src/uniq.c (seq_interval): New global variable.
(longopts): Register the --sequential option.
(usage): Summarize the new option.
(different): Check number input based on the --sequential option.
(check_file): Adjust the loop for the --sequential option.
(main): Handle the new --sequential option.
* tests/misc/uniq.pl (add_z_variants): Add tests for the new
--sequential option.
---
src/uniq.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++-----
tests/misc/uniq.pl | 9 ++++++++
2 files changed, 67 insertions(+), 5 deletions(-)
diff --git a/src/uniq.c b/src/uniq.c
index e0cfe4d..b3d5619 100644
--- a/src/uniq.c
+++ b/src/uniq.c
@@ -138,6 +138,8 @@ static enum grouping_method const grouping_method_map[] =
static enum grouping_method grouping = GM_NONE;
+static size_t seq_interval;
+
enum
{
GROUP_OPTION = CHAR_MAX + 1
@@ -155,6 +157,7 @@ static struct option const longopts[] =
{"skip-chars", required_argument, NULL, 's'},
{"check-chars", required_argument, NULL, 'w'},
{"zero-terminated", no_argument, NULL, 'z'},
+ {"sequential", optional_argument, NULL, 'S'},
{GETOPT_HELP_OPTION_DECL},
{GETOPT_VERSION_OPTION_DECL},
{NULL, 0, NULL, 0}
@@ -207,6 +210,10 @@ With no options, matching lines are merged to the first
occurrence.\n\
fputs (_("\
-w, --check-chars=N compare no more than N characters in lines\n\
"), stdout);
+ fputs (_("\
+ -S, --sequential[=INTERVAL] treat lines as numbers, and remove adjacent\n\
+ numbers as duplicate lines\n\
+"), stdout);
fputs (HELP_OPTION_DESCRIPTION, stdout);
fputs (VERSION_OPTION_DESCRIPTION, stdout);
fputs (_("\
@@ -284,7 +291,32 @@ different (char *old, char *new, size_t oldlen, size_t
newlen)
if (check_chars < newlen)
newlen = check_chars;
- if (ignore_case)
+ if (seq_interval > 0)
+ {
+ unsigned long int oldval, newval;
+ int oldchar, newchar;
+ bool result = true;
+
+ /* Temporarily NUL terminate OLD and NEW for xstrtoul. Those
+ should have enough room here. */
+ oldchar = old[oldlen];
+ newchar = new[newlen];
+ old[oldlen] = '\0';
+ new[newlen] = '\0';
+ if (xstrtoul (old, NULL, 0, &oldval, "") == LONGINT_OK
+ && xstrtoul (new, NULL, 0, &newval, "") == LONGINT_OK)
+ {
+ /* FIXME: This relies on the fact that OLD points to a field
+ on the current line and NEW points to a field on the
+ previous line. */
+ result = newval + seq_interval != oldval;
+ }
+ /* Restore the original terminators. */
+ old[oldlen] = oldchar;
+ new[newlen] = newchar;
+ return result;
+ }
+ else if (ignore_case)
{
/* FIXME: This should invoke strcoll somehow. */
return oldlen != newlen || memcasecmp (old, new, oldlen);
@@ -385,10 +417,13 @@ check_file (const char *infile, const char *outfile, char
delimiter)
fwrite (thisline->buffer, sizeof (char),
thisline->length, stdout);
+ first_group_printed = true;
+ }
+ if (new_group || grouping != GM_NONE || seq_interval > 0)
+ {
SWAP_LINES (prevline, thisline);
prevfield = thisfield;
prevlen = thislen;
- first_group_printed = true;
}
}
if ((grouping == GM_BOTH || grouping == GM_APPEND) &&
first_group_printed)
@@ -448,11 +483,14 @@ check_file (const char *infile, const char *outfile, char
delimiter)
if (!match || output_later_repeated)
{
writeline (prevline, match, match_count);
+ if (!match)
+ match_count = 0;
+ }
+ if (!match || output_later_repeated || seq_interval > 0)
+ {
SWAP_LINES (prevline, thisline);
prevfield = thisfield;
prevlen = thislen;
- if (!match)
- match_count = 0;
}
}
@@ -514,7 +552,7 @@ main (int argc, char **argv)
if (optc == -1
|| (posixly_correct && nfiles != 0)
|| ((optc = getopt_long (argc, argv,
- "-0123456789Dcdf:is:uw:z", longopts, NULL))
+ "-0123456789DScdf:is:uw:z", longopts, NULL))
== -1))
{
if (argc <= optind)
@@ -613,6 +651,21 @@ main (int argc, char **argv)
N_("invalid number of bytes to skip"));
break;
+ case 'S':
+ if (optarg == NULL)
+ seq_interval = 1;
+ else
+ {
+ seq_interval = size_opt (optarg,
+ N_("invalid interval"));
+ if (seq_interval == 0)
+ {
+ error (0, 0, _("invalid interval %s"), quote (optarg));
+ usage (EXIT_FAILURE);
+ }
+ }
+ break;
+
case 'u':
output_first_repeated = false;
output_option_used = true;
diff --git a/tests/misc/uniq.pl b/tests/misc/uniq.pl
index 5eae701..3cea277 100755
--- a/tests/misc/uniq.pl
+++ b/tests/misc/uniq.pl
@@ -227,6 +227,15 @@ my @Tests =
" - 'separate'\n" .
" - 'both'\n" .
"Try '$prog --help' for more information.\n"}],
+ # Check sequential option
+ ['146', '--sequential', {IN=>"1\n2\n6\n7\n8\n11\n12\n"}, {OUT=>"1\n6\n11\n"}],
+ ['147', '--sequential --group', {IN=>"1\n2\n6\n7\n8\n11\n12\n"},
+ {OUT=>"1\n2\n\n6\n7\n8\n\n11\n12\n"}],
+ ['148', '--sequential --count', {IN=>"1\n2\n6\n7\n8\n11\n12\n"},
+ {OUT=>" 2 2\n 3 8\n 2 12\n"}],
+ ['149', '--sequential=0', {IN=>""}, {OUT=>""}, {EXIT=>1},
+ {ERR=>"$prog: invalid interval '0'\n" .
+ "Try '$prog --help' for more information.\n"}]
);
# Locale related tests
--
2.1.4
- RFE: uniq --sequential,
Daiki Ueno <=