[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[coreutils] [PATCH] sort: support all combinations of -d, -f, -i, -R, a
From: |
Paul Eggert |
Subject: |
[coreutils] [PATCH] sort: support all combinations of -d, -f, -i, -R, and -V |
Date: |
Fri, 06 Aug 2010 21:32:52 -0700 |
User-agent: |
Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.1.11) Gecko/20100713 Thunderbird/3.0.6 |
* NEWS: Document this.
* src/sort.c (getmonth): Omit LEN arg, as MONTH is now null-terminated.
(compare_random): Don't null-terminate keys, as caller now does that.
(compare_version): Remove.
(debug_key): Null-terminate string for getmonth.
(keycompare): Support combining -R with any of -d, -f, -i, -V.
Also, support combining -V with any of -d, -i.
(check_ordering_compatibility): Allow newly-supported combinations.
* tests/misc/sort (02q, 02r, 02s): New tests, for new combinations.
(incompat2): Now test -nR, since -fR are now compatible.
---
NEWS | 2 +
src/sort.c | 224 ++++++++++++++++++++++++------------------------------
tests/misc/sort | 8 ++-
3 files changed, 108 insertions(+), 126 deletions(-)
diff --git a/NEWS b/NEWS
index 4662173..ae7b839 100644
--- a/NEWS
+++ b/NEWS
@@ -22,6 +22,8 @@ GNU coreutils NEWS -*-
outline -*-
sort now accepts the --debug option, to highlight the part of the
line significant in the sort, and warn about questionable options.
+ sort now supports -d, -f, -i, -R, and -V in any combination.
+
** Changes in behavior
du now uses less than half as much memory when operating on trees
diff --git a/src/sort.c b/src/sort.c
index a5e5c07..dcfd24f 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -1925,24 +1925,17 @@ general_numcompare (char const *sa, char const *sb)
: memcmp (&a, &b, sizeof a));
}
-/* Return an integer in 1..12 of the month name MONTH with length LEN.
+/* Return an integer in 1..12 of the month name MONTH.
Return 0 if the name in S is not recognized. */
static int
-getmonth (char const *month, size_t len, char **ea)
+getmonth (char const *month, char **ea)
{
size_t lo = 0;
size_t hi = MONTHS_PER_YEAR;
- char const *monthlim = month + len;
- while (true)
- {
- if (month == monthlim)
- return 0;
- if (!blanks[to_uchar (*month)])
- break;
- ++month;
- }
+ while (blanks[to_uchar (*month)])
+ month++;
do
{
@@ -1958,7 +1951,7 @@ getmonth (char const *month, size_t len, char **ea)
*ea = (char *) m;
return monthtab[ix].val;
}
- if (m == monthlim || fold_toupper[to_uchar (*m)] < to_uchar (*n))
+ if (fold_toupper[to_uchar (*m)] < to_uchar (*n))
{
hi = ix;
break;
@@ -2015,7 +2008,8 @@ xstrxfrm (char *restrict dest, char const *restrict src,
size_t destsize)
}
/* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
- using one or more random hash functions. */
+ using one or more random hash functions. TEXTA[LENA] and
+ TEXTB[LENB] must be zero. */
static int
compare_random (char *restrict texta, size_t lena,
@@ -2036,9 +2030,8 @@ compare_random (char *restrict texta, size_t lena,
if (hard_LC_COLLATE)
{
- /* Null-terminate the keys so that strxfrm knows where to stop. */
- char *lima = texta + lena; char enda = *lima; *lima = '\0';
- char *limb = textb + lenb; char endb = *limb; *limb = '\0';
+ char const *lima = texta + lena;
+ char const *limb = textb + lenb;
while (true)
{
@@ -2106,9 +2099,6 @@ compare_random (char *restrict texta, size_t lena,
xfrm_diff = (sizea > sizeb) - (sizea < sizeb);
}
}
-
- *lima = enda;
- *limb = endb;
}
/* Compute and compare the checksums. */
@@ -2135,32 +2125,6 @@ compare_random (char *restrict texta, size_t lena,
return diff;
}
-/* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
- using filevercmp. See lib/filevercmp.h for function description. */
-
-static int
-compare_version (char *restrict texta, size_t lena,
- char *restrict textb, size_t lenb)
-{
- int diff;
-
- /* It is necessary to save the character after the end of the field.
- "filevercmp" works with NUL terminated strings. Our blocks of
- text are not necessarily terminated with a NUL byte. */
- char sv_a = texta[lena];
- char sv_b = textb[lenb];
-
- texta[lena] = '\0';
- textb[lenb] = '\0';
-
- diff = filevercmp (texta, textb);
-
- texta[lena] = sv_a;
- textb[lenb] = sv_b;
-
- return diff;
-}
-
/* Return the printable width of the block of memory starting at
TEXT and ending just before LIM, counting each tab as one byte.
FIXME: Should we generally be counting non printable chars? */
@@ -2221,17 +2185,19 @@ debug_key (struct line const *line, struct keyfield
const *key)
lim = limfield (line, key);
if (key->skipsblanks || key->month || key_numeric (key))
- while (beg < lim && blanks[to_uchar (*beg)])
- beg++;
-
- if (key_numeric (key))
{
- char *numlim;
char saved = *lim; *lim = '\0';
- if (key->general_numeric)
- strtold (beg, &numlim);
- else
+ while (blanks[to_uchar (*beg)])
+ beg++;
+
+ char *tighter_lim = beg;
+
+ if (key->month)
+ getmonth (beg, &tighter_lim);
+ else if (key->general_numeric)
+ strtold (beg, &tighter_lim);
+ else if (key->numeric || key->human_numeric)
{
char *p = beg + (beg < lim && *beg == '-');
bool found_digit = false;
@@ -2248,19 +2214,14 @@ debug_key (struct line const *line, struct keyfield
const *key)
while (ISDIGIT (ch = *p++))
found_digit = true;
- numlim = (found_digit
- ? p - ! (key->human_numeric && unit_order[ch])
- : beg);
+ if (found_digit)
+ tighter_lim = p - ! (key->human_numeric && unit_order[ch]);
}
+ else
+ tighter_lim = lim;
*lim = saved;
- lim = numlim;
- }
- else if (key->month)
- {
- char *monthlim = beg;
- getmonth (beg, lim - beg, &monthlim);
- lim = monthlim;
+ lim = tighter_lim;
}
}
@@ -2465,73 +2426,89 @@ keycompare (struct line const *a, struct line const *b)
size_t lena = lima - texta;
size_t lenb = limb - textb;
- /* Actually compare the fields. */
-
- if (key->random)
- diff = compare_random (texta, lena, textb, lenb);
- else if (key_numeric (key))
+ if (hard_LC_COLLATE || key_numeric (key)
+ || key->month || key->random || key->version)
{
- char savea = *lima, saveb = *limb;
+ char *ta;
+ char *tb;
+ size_t tlena;
+ size_t tlenb;
+
+ char enda IF_LINT (= 0);
+ char endb IF_LINT (= 0);
+ void *allocated IF_LINT (= NULL);
+ char stackbuf[4000];
- *lima = *limb = '\0';
- diff = (key->numeric ? numcompare (texta, textb)
- : key->general_numeric ? general_numcompare (texta, textb)
- : human_numcompare (texta, textb));
- *lima = savea, *limb = saveb;
- }
- else if (key->version)
- diff = compare_version (texta, lena, textb, lenb);
- else if (key->month)
- diff = getmonth (texta, lena, NULL) - getmonth (textb, lenb, NULL);
- /* Sorting like this may become slow, so in a simple locale the user
- can select a faster sort that is similar to ascii sort. */
- else if (hard_LC_COLLATE)
- {
- /* FIXME: for debug, account for skipped chars, while handling mb
chars.
- Generally perhaps xmemfrm could be used to determine chars that
are
- excluded from the collating order? */
if (ignore || translate)
{
- char buf[4000];
- size_t size = lena + 1 + lenb + 1;
- char *copy_a = (size <= sizeof buf ? buf : xmalloc (size));
- char *copy_b = copy_a + lena + 1;
- size_t new_len_a, new_len_b, i;
+ /* Compute with copies of the keys, which are the result of
+ translating or ignoring characters, and which need their
+ own storage. */
- /* Ignore and/or translate chars before comparing. */
- for (new_len_a = new_len_b = i = 0; i < MAX (lena, lenb); i++)
- {
- if (i < lena)
- {
- copy_a[new_len_a] = (translate
- ? translate[to_uchar (texta[i])]
- : texta[i]);
- if (!ignore || !ignore[to_uchar (texta[i])])
- ++new_len_a;
- }
- if (i < lenb)
- {
- copy_b[new_len_b] = (translate
- ? translate[to_uchar (textb[i])]
- : textb [i]);
- if (!ignore || !ignore[to_uchar (textb[i])])
- ++new_len_b;
- }
- }
+ size_t i;
- copy_a[new_len_a++] = '\0';
- copy_b[new_len_b++] = '\0';
- diff = xmemcoll0 (copy_a, new_len_a, copy_b, new_len_b);
+ /* Allocate space for copies. */
+ size_t size = lena + 1 + lenb + 1;
+ if (size <= sizeof stackbuf)
+ ta = stackbuf, allocated = NULL;
+ else
+ ta = allocated = xmalloc (size);
+ tb = ta + lena + 1;
+
+ /* Put into each copy a version of the key in which the
+ requested characters are ignored or translated. */
+ for (tlena = i = 0; i < lena; i++)
+ if (! (ignore && ignore[to_uchar (texta[i])]))
+ ta[tlena++] = (translate
+ ? translate[to_uchar (texta[i])]
+ : texta[i]);
+ ta[tlena] = '\0';
+
+ for (tlenb = i = 0; i < lenb; i++)
+ if (! (ignore && ignore[to_uchar (textb[i])]))
+ tb[tlenb++] = (translate
+ ? translate[to_uchar (textb[i])]
+ : textb[i]);
+ tb[tlenb] = '\0';
+ }
+ else
+ {
+ /* Use the keys in-place, temporarily null-terminated. */
+ ta = texta; tlena = lena; enda = ta[tlena]; ta[tlena] = '\0';
+ tb = textb; tlenb = lenb; endb = tb[tlenb]; tb[tlenb] = '\0';
+ }
- if (sizeof buf < size)
- free (copy_a);
+ if (key->numeric)
+ diff = numcompare (ta, tb);
+ else if (key->general_numeric)
+ diff = general_numcompare (ta, tb);
+ else if (key->human_numeric)
+ diff = human_numcompare (ta, tb);
+ else if (key->month)
+ diff = getmonth (ta, NULL) - getmonth (tb, NULL);
+ else if (key->random)
+ diff = compare_random (ta, tlena, tb, tlenb);
+ else if (key->version)
+ diff = filevercmp (ta, tb);
+ else
+ {
+ /* Locale-dependent string sorting. This is slower than
+ C-locale sorting, which is implemented below. */
+ if (tlena == 0)
+ diff = - NONZERO (tlenb);
+ else if (tlenb == 0)
+ diff = 1;
+ else
+ diff = xmemcoll0 (ta, tlena + 1, tb, tlenb + 1);
}
- else if (lena == 0)
- diff = - NONZERO (lenb);
- else if (lenb == 0)
- goto greater;
+
+ if (ignore || translate)
+ free (allocated);
else
- diff = xmemcoll (texta, lena, textb, lenb);
+ {
+ ta[tlena] = enda;
+ tb[tlenb] = endb;
+ }
}
else if (ignore)
{
@@ -3849,9 +3826,8 @@ check_ordering_compatibility (void)
struct keyfield *key;
for (key = keylist; key; key = key->next)
- if ((1 < (key->random + key->numeric + key->general_numeric + key->month
- + key->version + !!key->ignore + key->human_numeric))
- || (key->random && key->translate))
+ if (1 < (key->numeric + key->general_numeric + key->human_numeric
+ + key->month + (key->version | key->random | !!key->ignore)))
{
/* The following is too big, but guaranteed to be "big enough". */
char opts[sizeof short_options];
diff --git a/tests/misc/sort b/tests/misc/sort
index 202951c..fedbf27 100755
--- a/tests/misc/sort
+++ b/tests/misc/sort
@@ -92,6 +92,10 @@ my @Tests =
{ERR=>"$prog: -:3: disorder: B\n"}, $normalize_filename],
["02p", '-cu', {IN=>"B\nA\nB\n"}, {OUT=>''}, {EXIT=>1},
{ERR=>"$prog: -:2: disorder: A\n"}, $normalize_filename],
+["02q", '-c -k 1,1fR', {IN=>"ABC\nABc\nAbC\nAbc\naBC\naBc\nabC\nabc\n"}],
+["02r", '-c -k 1,1fV', {IN=>"ABC\nABc\nAbC\nAbc\naBC\naBc\nabC\nabc\n"}],
+["02s", '-c -k 1,1dfR',
+ {IN=>".ABC\n.ABc.\nA.bC\nA.bc.\naB.C\naB.c.\nabC.\nabc..\n"}],
#
["03a", '-k1', {IN=>"B\nA\n"}, {OUT=>"A\nB\n"}],
["03b", '-k1,1', {IN=>"B\nA\n"}, {OUT=>"A\nB\n"}],
@@ -335,8 +339,8 @@ my @Tests =
# Specifying incompatible options should evoke a failure.
["incompat1", '-in', {EXIT=>2},
{ERR=>"$prog: options `-in' are incompatible\n"}],
-["incompat2", '-fR', {EXIT=>2},
- {ERR=>"$prog: options `-fR' are incompatible\n"}],
+["incompat2", '-nR', {EXIT=>2},
+ {ERR=>"$prog: options `-nR' are incompatible\n"}],
["incompat3", '-dfgiMnR', {EXIT=>2},
{ERR=>"$prog: options `-dfgMnR' are incompatible\n"}],
["incompat4", qw(-c -o /dev/null), {EXIT=>2},
--
1.7.2
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [coreutils] [PATCH] sort: support all combinations of -d, -f, -i, -R, and -V,
Paul Eggert <=