--- coreutils-6.10-sort.c 2009-06-04 17:27:35.000000000 +0200 +++ coreutils-6.10-sort-natural.c 2009-06-04 17:27:44.000000000 +0200 @@ -170,6 +170,7 @@ bool random; /* Sort by random hash of key. */ bool general_numeric; /* Flag for general, numeric comparison. Handle numbers in exponential notation. */ + bool natural; /* Flag for natural order comparison. */ bool month; /* Flag for comparison by month name. */ bool reverse; /* Reverse the sense of comparison. */ struct keyfield *next; /* Next keyfield to try. */ @@ -327,6 +328,7 @@ -i, --ignore-nonprinting consider only printable characters\n\ -M, --month-sort compare (unknown) < `JAN' < ... < `DEC'\n\ -n, --numeric-sort compare according to string numerical value\n\ + -N, --natural-order-sort sort strings containing numbers in natural order\n\ -R, --random-sort sort by random hash of keys\n\ --random-source=FILE get random bytes from FILE (default /dev/urandom)\n\ -r, --reverse reverse the result of comparisons\n\ @@ -394,7 +396,7 @@ RANDOM_SOURCE_OPTION }; -static char const short_options[] = "-bcCdfgik:mMno:rRsS:t:T:uy:z"; +static char const short_options[] = "-bcCdfgik:mMnNo:rRsS:t:T:uy:z"; static struct option const long_options[] = { @@ -409,6 +411,7 @@ {"merge", no_argument, NULL, 'm'}, {"month-sort", no_argument, NULL, 'M'}, {"numeric-sort", no_argument, NULL, 'n'}, + {"natural-order-sort", no_argument, NULL, 'N'}, {"random-sort", no_argument, NULL, 'R'}, {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION}, {"output", required_argument, NULL, 'o'}, @@ -1514,6 +1517,139 @@ return strnumcmp (a, b, decimal_point, thousands_sep); } +/* Perform 'natural order' comparisons of strings ("A1" < "A2" < "A10") + Adoption for "sort" from strnatcmp.c by Martin Sarfy + Original author (C) 2000, 2004 by Martin Pool */ + +static inline int +nat_isdigit(char a) +{ + return isdigit((unsigned char) a); +} + + +static inline int +nat_isspace(char a) +{ + return isspace((unsigned char) a); +} + + +static inline char +nat_toupper(char a) +{ + return toupper((unsigned char) a); +} + +static int +nat_compare_right(char const *a, char const *b) +{ + int bias = 0; + + /* The longest run of digits wins. That aside, the greatest + value wins, but we can't know that it will until we've scanned + both numbers to know that they have the same magnitude, so we + remember it in BIAS. */ + for (;; a++, b++) { + if (!nat_isdigit(*a) && !nat_isdigit(*b)) + return bias; + else if (!nat_isdigit(*a)) + return -1; + else if (!nat_isdigit(*b)) + return +1; + else if (*a < *b) { + if (!bias) + bias = -1; + } else if (*a > *b) { + if (!bias) + bias = +1; + } else if (!*a && !*b) + return bias; + } + + return 0; +} + + +static int +nat_compare_left(char const *a, char const *b) +{ + /* Compare two left-aligned numbers: the first to have a + different value wins. */ + for (;; a++, b++) { + if (!nat_isdigit(*a) && !nat_isdigit(*b)) + return 0; + else if (!nat_isdigit(*a)) + return -1; + else if (!nat_isdigit(*b)) + return +1; + else if (*a < *b) + return -1; + else if (*a > *b) + return +1; + } + + return 0; +} + + +static int strnatcmp0(char const *a, char const *b, int fold_case) +{ + int ai, bi; + char ca, cb; + int fractional, result; + + /* assert(a && b); */ + ai = bi = 0; + while (1) { + ca = a[ai]; cb = b[bi]; + + /* skip over leading spaces or zeros */ + while (nat_isspace(ca)) + ca = a[++ai]; + + while (nat_isspace(cb)) + cb = b[++bi]; + + /* process run of digits */ + if (nat_isdigit(ca) && nat_isdigit(cb)) { + fractional = (ca == '0' || cb == '0'); + + if (fractional) { + if ((result = nat_compare_left(a+ai, b+bi)) != 0) + return result; + } else { + if ((result = nat_compare_right(a+ai, b+bi)) != 0) + return result; + } + } + + if (!ca && !cb) { + /* The strings compare the same. Perhaps the caller + will want to call strcmp to break the tie. */ + return 0; + } + + if (fold_case) { + ca = nat_toupper(ca); + cb = nat_toupper(cb); + } + + if (ca < cb) + return -1; + else if (ca > cb) + return +1; + + ++ai; ++bi; + } +} + +static int +natural_compare (const char *sa, const char *sb) +{ + return strnatcmp0(sa,sb,0); +} + static int general_numcompare (const char *sa, const char *sb) { @@ -1702,6 +1838,34 @@ return diff; } +static void +translate_chars(const char* texta, size_t lena, const char* textb, size_t lenb, + char *copy_a, size_t *p_new_len_a, char* copy_b, size_t *p_new_len_b, + bool const *ignore, char const *translate) +{ + int i; + /* Ignore and/or translate chars before comparing. */ + for (*p_new_len_a = *p_new_len_b = i = 0; i < MAX (lena, lenb); i++) + { + if (i < lena) + { + copy_a[*p_new_len_a] = (translate + ? translate[to_uchar (texta[i])] + : texta[i]); + if (!ignore || !ignore[to_uchar (texta[i])]) + ++(*p_new_len_a); + } + if (i < lenb) + { + copy_b[*p_new_len_b] = (translate + ? translate[to_uchar (textb[i])] + : textb [i]); + if (!ignore || !ignore[to_uchar (textb[i])]) + ++(*p_new_len_b); + } + } +} + /* Compare two lines A and B trying every key in sequence until there are no more keys or a difference is found. */ @@ -1741,6 +1905,22 @@ (texta, textb)); *lima = savea, *limb = saveb; } + else if (key->natural) + { + char buf[4000]; + size_t size = lena + 1 + lenb + 1; + char *copy_a = (size <= sizeof buf ? buf : xmalloc (size)); + char *copy_b = copy_a + lena + 1; + size_t new_len_a, new_len_b; + + translate_chars(texta, lena, textb, lenb, copy_a, &new_len_a, + copy_b, &new_len_b, ignore, translate); + + diff = natural_compare (copy_a, copy_b ); + + if (sizeof buf < size) + free (copy_a); + } else if (key->month) diff = getmonth (texta, lena) - getmonth (textb, lenb); /* Sorting like this may become slow, so in a simple locale the user @@ -1753,28 +1933,10 @@ size_t size = lena + 1 + lenb + 1; char *copy_a = (size <= sizeof buf ? buf : xmalloc (size)); char *copy_b = copy_a + lena + 1; - size_t new_len_a, new_len_b, i; + size_t new_len_a, new_len_b; - /* Ignore and/or translate chars before comparing. */ - for (new_len_a = new_len_b = i = 0; i < MAX (lena, lenb); i++) - { - if (i < lena) - { - copy_a[new_len_a] = (translate - ? translate[to_uchar (texta[i])] - : texta[i]); - if (!ignore || !ignore[to_uchar (texta[i])]) - ++new_len_a; - } - if (i < lenb) - { - copy_b[new_len_b] = (translate - ? translate[to_uchar (textb[i])] - : textb [i]); - if (!ignore || !ignore[to_uchar (textb[i])]) - ++new_len_b; - } - } + translate_chars(texta, lena, textb, lenb, copy_a, &new_len_a, + copy_b, &new_len_b, ignore, translate); diff = xmemcoll (copy_a, new_len_a, copy_b, new_len_b); @@ -2587,7 +2749,7 @@ for (key = keylist; key; key = key->next) if ((1 < (key->random + key->numeric + key->general_numeric + key->month - + !!key->ignore)) + + key->natural + !!key->ignore)) || (key->random && key->translate)) { char opts[7]; @@ -2604,6 +2766,8 @@ *p++ = 'M'; if (key->numeric) *p++ = 'n'; + if (key->natural) + *p++ = 'N'; if (key->random) *p++ = 'R'; *p = '\0'; @@ -2696,6 +2860,9 @@ case 'M': key->month = true; break; + case 'N': + key->natural = true; + break; case 'n': key->numeric = true; break; @@ -2830,7 +2997,7 @@ gkey.sword = gkey.eword = SIZE_MAX; gkey.ignore = NULL; gkey.translate = NULL; - gkey.numeric = gkey.general_numeric = gkey.random = false; + gkey.numeric = gkey.general_numeric = gkey.natural = gkey.random = false; gkey.month = gkey.reverse = false; gkey.skipsblanks = gkey.skipeblanks = false; @@ -2909,6 +3076,7 @@ case 'i': case 'M': case 'n': + case 'N': case 'r': case 'R': { @@ -3087,7 +3255,7 @@ if (! (key->ignore || key->translate || (key->skipsblanks | key->reverse | key->skipeblanks | key->month | key->numeric - | key->general_numeric + | key->general_numeric | key->natural | key->random))) { key->ignore = gkey.ignore; @@ -3097,6 +3265,7 @@ key->month = gkey.month; key->numeric = gkey.numeric; key->general_numeric = gkey.general_numeric; + key->natural = gkey.natural; key->random = gkey.random; key->reverse = gkey.reverse; } @@ -3106,7 +3275,7 @@ if (!keylist && (gkey.ignore || gkey.translate || (gkey.skipsblanks | gkey.skipeblanks | gkey.month - | gkey.numeric | gkey.general_numeric + | gkey.numeric | gkey.general_numeric | gkey.natural | gkey.random))) { insertkey (&gkey);