From 9f48fb992a3d7e96610c4ce8be969cff2d61a01b Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Sat, 12 Feb 2022 16:27:05 -0800 Subject: [PATCH] filevercmp: fix several unexpected results Problems reported by Michael Debertol in . While looking into this, I spotted some more areas where the code and documentation did not agree, or where the documentation was unclear. The biggest change needed by coreutils is a new function filenvercmp that can compare byte strings containing NUL. * lib/filevercmp.c: Do not include sys/types.h, stdlib.h, string.h. Include idx.h, verify.h. (match_suffix): Remove, replacing all uses with calls to ... (file_prefixlen): ... this new function. Simplify it by avoiding the need for a confusing READ_ALPHA state variable. Change its API to something more useful, with a *LEN arg. it with a new *LEN arg. (file_prefixlen, verrevcmp): Prefer idx_t to size_t where either will do. (order): Change args to S, POS, LEN instead of just S[POS]. This lets us handle NUL bytes correctly. Callers changed. Verify that ints are sufficiently wide for its API. (verrevcmp): Don't assume that S1[S1_LEN] is a non-digit, and likewise for S2[S2_LEN]. The byte might not be accessible if filenvercmp is being called. (filevercmp): Reimplement by calling filenvercmp. (filenvercmp): New function, rewritten without the assumption that the inputs are null-terminated. Remove "easy comparison to see if strings are identical", as the use of it later (a) was undocumented, and (b) caused sort -V to be unstable. When both strings start with ".", do not skip past the "."s before looking for suffixes, as this disagreed with the documentation. * lib/filevercmp.h: Fix comments, which had many mistakes. (filenvercmp): New decl. * modules/filevercmp (Depends-on): Add idx, verify. Remove string. * tests/test-filevercmp.c: Include string.h. (examples): Reorder examples ".0" and ".9" that matched the code but not the documentation. The code has been fixed to match the documentation. Add some examples involving \1 so that they can be tried with both \1 and \0. Add some other examples taken from the bug report. (equals): New set of test cases. (sign, test_filevercmp): New functions. (main): Remove test case where the fixed filevercmp disagrees with strverscmp. Use test_filevercmp instead of filevercmp, so that we also test filenvercmp. Test the newly-introduced EQUALS cases. --- ChangeLog | 46 ++++++++++ lib/filevercmp.c | 187 +++++++++++++++++++++------------------- lib/filevercmp.h | 66 ++++++++++---- modules/filevercmp | 3 +- tests/test-filevercmp.c | 94 +++++++++++++++++++- 5 files changed, 284 insertions(+), 112 deletions(-) diff --git a/ChangeLog b/ChangeLog index 62162cbfce..4bf0cec7f0 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,49 @@ +2022-02-12 Paul Eggert + + filevercmp: fix several unexpected results + Problems reported by Michael Debertol in . + While looking into this, I spotted some more areas where the + code and documentation did not agree, or where the documentation + was unclear. The biggest change needed by coreutils is a new + function filenvercmp that can compare byte strings containing NUL. + * lib/filevercmp.c: Do not include sys/types.h, stdlib.h, string.h. + Include idx.h, verify.h. + (match_suffix): Remove, replacing all uses with calls to ... + (file_prefixlen): ... this new function. Simplify it by + avoiding the need for a confusing READ_ALPHA state variable. + Change its API to something more useful, with a *LEN arg. + it with a new *LEN arg. + (file_prefixlen, verrevcmp): + Prefer idx_t to size_t where either will do. + (order): Change args to S, POS, LEN instead of just S[POS]. + This lets us handle NUL bytes correctly. Callers changed. + Verify that ints are sufficiently wide for its API. + (verrevcmp): Don't assume that S1[S1_LEN] is a non-digit, + and likewise for S2[S2_LEN]. The byte might not be accessible + if filenvercmp is being called. + (filevercmp): Reimplement by calling filenvercmp. + (filenvercmp): New function, rewritten without the assumption + that the inputs are null-terminated. + Remove "easy comparison to see if strings are identical", as the + use of it later (a) was undocumented, and (b) caused sort -V to be + unstable. When both strings start with ".", do not skip past + the "."s before looking for suffixes, as this disagreed + with the documentation. + * lib/filevercmp.h: Fix comments, which had many mistakes. + (filenvercmp): New decl. + * modules/filevercmp (Depends-on): Add idx, verify. Remove string. + * tests/test-filevercmp.c: Include string.h. + (examples): Reorder examples ".0" and ".9" that matched the code + but not the documentation. The code has been fixed to match the + documentation. Add some examples involving \1 so that they + can be tried with both \1 and \0. Add some other examples + taken from the bug report. + (equals): New set of test cases. + (sign, test_filevercmp): New functions. + (main): Remove test case where the fixed filevercmp disagrees with + strverscmp. Use test_filevercmp instead of filevercmp, so that + we also test filenvercmp. Test the newly-introduced EQUALS cases. + 2022-02-09 Bruno Haible string: Fix "mismatched allocation function" warnings regarding strndup. diff --git a/lib/filevercmp.c b/lib/filevercmp.c index b7e3ac6167..d546e79054 100644 --- a/lib/filevercmp.c +++ b/lib/filevercmp.c @@ -1,4 +1,5 @@ -/* +/* Compare file names containing version numbers. + Copyright (C) 1995 Ian Jackson Copyright (C) 2001 Anthony Towns Copyright (C) 2008-2022 Free Software Foundation, Inc. @@ -19,60 +20,65 @@ #include #include "filevercmp.h" -#include -#include #include -#include #include #include - -/* Match a file suffix defined by this regular expression: - /(\.[A-Za-z~][A-Za-z0-9~]*)*$/ - Scan the string *STR and return a pointer to the matching suffix, or - NULL if not found. Upon return, *STR points to terminating NUL. */ -static const char * -match_suffix (const char **str) +#include +#include + +/* Return the length of a prefix of S that corresponds to the suffix + defined by this extended regular expression in the C locale: + (\.[A-Za-z~][A-Za-z0-9~]*)*$ + If *LEN is -1, S is a string; set *LEN to S's length. + Otherwise, *LEN should be nonnegative, S is a char array, + and *LEN does not change. */ +static idx_t +file_prefixlen (char const *s, ptrdiff_t *len) { - const char *match = NULL; - bool read_alpha = false; - while (**str) + size_t n = *len; /* SIZE_MAX if N == -1. */ + + for (idx_t i = 0; ; i++) { - if (read_alpha) - { - read_alpha = false; - if (!c_isalpha (**str) && '~' != **str) - match = NULL; - } - else if ('.' == **str) + idx_t prefixlen = i; + while (i + 1 < n && s[i] == '.' && (c_isalpha (s[i + 1]) + || s[i + 1] == '~')) + for (i += 2; i < n && (c_isalnum (s[i]) || s[i] == '~'); i++) + continue; + + if (*len < 0 ? !s[i] : i == n) { - read_alpha = true; - if (!match) - match = *str; + *len = i; + return prefixlen; } - else if (!c_isalnum (**str) && '~' != **str) - match = NULL; - (*str)++; } - return match; } -/* verrevcmp helper function */ +/* Return a version sort comparison value for S's byte at position POS. + S has length LEN. If POS == LEN, sort before all non-'~' bytes. */ + static int -order (unsigned char c) +order (char const *s, idx_t pos, idx_t len) { + if (pos == len) + return -1; + + unsigned char c = s[pos]; if (c_isdigit (c)) return 0; else if (c_isalpha (c)) return c; else if (c == '~') - return -1; + return -2; else - return (int) c + UCHAR_MAX + 1; + { + verify (UCHAR_MAX <= (INT_MAX - 1 - 2) / 2); + return c + UCHAR_MAX + 1; + } } /* slightly modified verrevcmp function from dpkg - S1, S2 - compared string - S1_LEN, S2_LEN - length of strings to be scanned + S1, S2 - compared char array + S1_LEN, S2_LEN - length of arrays to be scanned This implements the algorithm for comparison of version strings specified by Debian and now widely adopted. The detailed @@ -81,37 +87,38 @@ order (unsigned char c) implements that from s5.6.12 of Debian Policy v3.8.0.1 https://www.debian.org/doc/debian-policy/ch-controlfields.html#s-f-Version */ static int _GL_ATTRIBUTE_PURE -verrevcmp (const char *s1, size_t s1_len, const char *s2, size_t s2_len) +verrevcmp (const char *s1, idx_t s1_len, const char *s2, idx_t s2_len) { - size_t s1_pos = 0; - size_t s2_pos = 0; + idx_t s1_pos = 0; + idx_t s2_pos = 0; while (s1_pos < s1_len || s2_pos < s2_len) { int first_diff = 0; while ((s1_pos < s1_len && !c_isdigit (s1[s1_pos])) || (s2_pos < s2_len && !c_isdigit (s2[s2_pos]))) { - int s1_c = (s1_pos == s1_len) ? 0 : order (s1[s1_pos]); - int s2_c = (s2_pos == s2_len) ? 0 : order (s2[s2_pos]); + int s1_c = order (s1, s1_pos, s1_len); + int s2_c = order (s2, s2_pos, s2_len); if (s1_c != s2_c) return s1_c - s2_c; s1_pos++; s2_pos++; } - while (s1[s1_pos] == '0') + while (s1_pos < s1_len && s1[s1_pos] == '0') s1_pos++; - while (s2[s2_pos] == '0') + while (s2_pos < s2_len && s2[s2_pos] == '0') s2_pos++; - while (c_isdigit (s1[s1_pos]) && c_isdigit (s2[s2_pos])) + while (s1_pos < s1_len && s2_pos < s2_len + && c_isdigit (s1[s1_pos]) && c_isdigit (s2[s2_pos])) { if (!first_diff) first_diff = s1[s1_pos] - s2[s2_pos]; s1_pos++; s2_pos++; } - if (c_isdigit (s1[s1_pos])) + if (s1_pos < s1_len && c_isdigit (s1[s1_pos])) return 1; - if (c_isdigit (s2[s2_pos])) + if (s2_pos < s2_len && c_isdigit (s2[s2_pos])) return -1; if (first_diff) return first_diff; @@ -124,58 +131,56 @@ verrevcmp (const char *s1, size_t s1_len, const char *s2, size_t s2_len) int filevercmp (const char *s1, const char *s2) { - const char *s1_pos; - const char *s2_pos; - const char *s1_suffix, *s2_suffix; - size_t s1_len, s2_len; - int result; - - /* easy comparison to see if strings are identical */ - int simple_cmp = strcmp (s1, s2); - if (simple_cmp == 0) - return 0; + return filenvercmp (s1, -1, s2, -1); +} - /* special handle for "", "." and ".." */ - if (!*s1) - return -1; - if (!*s2) - return 1; - if (0 == strcmp (".", s1)) - return -1; - if (0 == strcmp (".", s2)) - return 1; - if (0 == strcmp ("..", s1)) - return -1; - if (0 == strcmp ("..", s2)) +/* Compare versions A (of length ALEN) and B (of length BLEN). + See filevercmp.h for function description. */ +int +filenvercmp (char const *a, ptrdiff_t alen, char const *b, ptrdiff_t blen) +{ + /* Special case for empty versions. */ + bool aempty = alen < 0 ? !a[0] : !alen; + bool bempty = blen < 0 ? !b[0] : !blen; + if (aempty) + return -!bempty; + if (bempty) return 1; - /* special handle for other hidden files */ - if (*s1 == '.' && *s2 != '.') - return -1; - if (*s1 != '.' && *s2 == '.') - return 1; - if (*s1 == '.' && *s2 == '.') + /* Special cases for leading ".": "." sorts first, then "..", then + other names with leading ".", then other names. */ + if (a[0] == '.') { - s1++; - s2++; - } + if (b[0] != '.') + return -1; - /* "cut" file suffixes */ - s1_pos = s1; - s2_pos = s2; - s1_suffix = match_suffix (&s1_pos); - s2_suffix = match_suffix (&s2_pos); - s1_len = (s1_suffix ? s1_suffix : s1_pos) - s1; - s2_len = (s2_suffix ? s2_suffix : s2_pos) - s2; - - /* restore file suffixes if strings are identical after "cut" */ - if ((s1_suffix || s2_suffix) && (s1_len == s2_len) - && 0 == strncmp (s1, s2, s1_len)) - { - s1_len = s1_pos - s1; - s2_len = s2_pos - s2; + bool adot = alen < 0 ? !a[1] : alen == 1; + bool bdot = blen < 0 ? !b[1] : blen == 1; + if (adot) + return -!bdot; + if (bdot) + return 1; + + bool adotdot = a[1] == '.' && (alen < 0 ? !a[2] : alen == 2); + bool bdotdot = b[1] == '.' && (blen < 0 ? !b[2] : blen == 2); + if (adotdot) + return -!bdotdot; + if (bdotdot) + return 1; } + else if (b[0] == '.') + return 1; + + /* Cut file suffixes. */ + idx_t aprefixlen = file_prefixlen (a, &alen); + idx_t bprefixlen = file_prefixlen (b, &blen); + + /* If both suffixes are empty, a second pass would return the same thing. */ + bool one_pass_only = aprefixlen == alen && bprefixlen == blen; + + int result = verrevcmp (a, aprefixlen, b, bprefixlen); - result = verrevcmp (s1, s1_len, s2, s2_len); - return result == 0 ? simple_cmp : result; + /* Return the initial result if nonzero, or if no second pass is needed. + Otherwise, restore the suffixes and try again. */ + return result || one_pass_only ? result : verrevcmp (a, alen, b, blen); } diff --git a/lib/filevercmp.h b/lib/filevercmp.h index d62cc4000e..5a33677671 100644 --- a/lib/filevercmp.h +++ b/lib/filevercmp.h @@ -1,4 +1,5 @@ -/* +/* Compare file names containing version numbers. + Copyright (C) 1995 Ian Jackson Copyright (C) 2001 Anthony Towns Copyright (C) 2008-2022 Free Software Foundation, Inc. @@ -19,24 +20,57 @@ #ifndef FILEVERCMP_H #define FILEVERCMP_H -/* Compare version strings: +#include + +/* Compare strings A and B as file names containing version numbers, + and return an integer that is negative, zero, or positive depending + on whether A compares less than, equal to, or greater than B. + + Use the following version sort algorithm: + + 1. Compare the strings' maximal-length non-digit prefixes lexically. + If there is a difference return that difference. + Otherwise discard the prefixes and continue with the next step. + + 2. Compare the strings' maximal-length digit prefixes, using + numeric comparison of the numbers represented by each prefix. + (Treat an empty prefix as zero; this can happen only at string end.) + If there is a difference, return that difference. + Otherwise discard the prefixes and continue with the next step. + + 3. If both strings are empty, return 0. Otherwise continue with step 1. + + In version sort, lexical comparison is left to right, byte by byte, + using the byte's numeric value (0-255), except that: + + 1. ASCII letters sort before other bytes. + 2. A tilde sorts before anything, even an empty string. + + In addition to the version sort rules, the following strings have + special priority and sort before all other strings (listed in order): - This function compares strings S1 and S2: - 1) By PREFIX in the same way as strcmp. - 2) Then by VERSION (most similarly to version compare of Debian's dpkg). - Leading zeros in version numbers are ignored. - 3) If both (PREFIX and VERSION) are equal, strcmp function is used for - comparison. So this function can return 0 if (and only if) strings S1 - and S2 are identical. + 1. The empty string. + 2. ".". + 3. "..". + 4. Strings starting with "." sort before other strings. - It returns number >0 for S1 > S2, 0 for S1 == S2 and number <0 for S1 < S2. + Before comparing two strings where both begin with non-".", + or where both begin with "." but neither is "." or "..", + suffixes matching the C-locale extended regular expression + (\.[A-Za-z~][A-Za-z0-9~]*)*$ are removed and the strings compared + without them, using version sort without special priority; + if they do not compare equal, this comparison result is used and + the suffixes are effectively ignored. Otherwise, the entire + strings are compared using version sort. - This function compares strings, in a way that if VER1 and VER2 are version - numbers and PREFIX and SUFFIX (SUFFIX defined as (\.[A-Za-z~][A-Za-z0-9~]*)*) - are strings then VER1 < VER2 implies filevercmp (PREFIX VER1 SUFFIX, - PREFIX VER2 SUFFIX) < 0. + This function is intended to be a replacement for strverscmp. */ +int filevercmp (char const *a, char const *b) _GL_ATTRIBUTE_PURE; - This function is intended to be a replacement for strverscmp. */ -int filevercmp (const char *s1, const char *s2) _GL_ATTRIBUTE_PURE; +/* Like filevercmp, except compare the byte arrays A (of length ALEN) + and B (of length BLEN) so that A and B can contain '\0', which + sorts just before '\1'. But if ALEN is -1 treat A as a string + terminated by '\0', and similarly for BLEN. */ +int filenvercmp (char const *a, ptrdiff_t alen, char const *b, ptrdiff_t blen) + _GL_ATTRIBUTE_PURE; #endif /* FILEVERCMP_H */ diff --git a/modules/filevercmp b/modules/filevercmp index 373dfd00ea..4786ce11b6 100644 --- a/modules/filevercmp +++ b/modules/filevercmp @@ -7,8 +7,9 @@ lib/filevercmp.c Depends-on: c-ctype +idx stdbool -string +verify configure.ac: diff --git a/tests/test-filevercmp.c b/tests/test-filevercmp.c index 79ad4aed8f..b2a7e90f3f 100644 --- a/tests/test-filevercmp.c +++ b/tests/test-filevercmp.c @@ -19,6 +19,7 @@ #include "filevercmp.h" #include +#include #include "macros.h" @@ -28,8 +29,6 @@ static const char *const examples[] = "", ".", "..", - ".0", - ".9", ".A", ".Z", ".a~", @@ -40,7 +39,14 @@ static const char *const examples[] = ".zz~", ".zz", ".zz.~1~", + ".0", + ".9", ".zz.0", + ".\1", + ".\1.txt", + ".\1x", + ".\1x\1", + ".\1.0", "0", "9", "A", @@ -51,6 +57,10 @@ static const char *const examples[] = "a.b", "a.bc~", "a.bc", + "a+", + "a.", + "a..a", + "a.+", "b~", "b", "gcc-c++-10.fc9.tar.gz", @@ -80,10 +90,79 @@ static const char *const examples[] = "zz", "zz.~1~", "zz.0", + "zz.0.txt", + "\1", + "\1.txt", + "\1x", + "\1x\1", + "\1.0", + "#\1.b#", "#.b#", NULL }; +/* Sets of examples that should all sort equally. Each set is + terminated by NULL. */ +static char const *const equals[] = + { + "a", + "a0", + "a0000", + NULL, + "a\1c-27.txt", + "a\1c-027.txt", + "a\1c-00000000000000000000000000000000000000000000000000000027.txt", + NULL, + ".a\1c-27.txt", + ".a\1c-027.txt", + ".a\1c-00000000000000000000000000000000000000000000000000000027.txt", + NULL, + "a\1c-", + "a\1c-0", + "a\1c-00", + NULL, + ".a\1c-", + ".a\1c-0", + ".a\1c-00", + NULL, + "a\1c-0.txt", + "a\1c-00.txt", + NULL, + ".a\1c-1\1.txt", + ".a\1c-001\1.txt", + NULL, + }; + +static int +sign (int i) +{ + return i < 0 ? -1 : 0 < i; +} + +/* Return filevercmp (A, B), checking that a similar result is gotten + after replacing all '\1's with '\0's and calling filenvercmp with + the embedded '\0's. */ +static int +test_filevercmp (char const *a, char const *b) +{ + int result = filevercmp (a, b); + + char buffer[1000]; + + ptrdiff_t alen = strlen (a), blen = strlen (b); + ASSERT (alen + blen <= sizeof buffer); + memcpy (buffer, a, alen); + memcpy (buffer + alen, b, blen); + for (ptrdiff_t i = 0; i < alen + blen; i++) + if (buffer[i] == '\1') + buffer[i] = '\0'; + + int nresult = filenvercmp (buffer, alen, buffer + alen, blen); + ASSERT (sign (nresult) == sign (result)); + + return result; +} + int main (void) { @@ -94,7 +173,6 @@ main (void) ASSERT (filevercmp ("a", "a") == 0); ASSERT (filevercmp ("a", "b") < 0); ASSERT (filevercmp ("b", "a") > 0); - ASSERT (filevercmp ("a0", "a") > 0); ASSERT (filevercmp ("00", "01") < 0); ASSERT (filevercmp ("01", "010") < 0); ASSERT (filevercmp ("9", "10") < 0); @@ -106,7 +184,7 @@ main (void) const char *const *j; for (j = examples; *j; j++) { - int result = filevercmp (*i, *j); + int result = test_filevercmp (*i, *j); if (result < 0) ASSERT (i < j); else if (0 < result) @@ -116,5 +194,13 @@ main (void) } } + for (i = equals; i < equals + sizeof equals / sizeof *equals; i++) + for (; *i; i++) + for (char const *const *j = i; *j; j++) + { + ASSERT (test_filevercmp (*i, *j) == 0); + ASSERT (test_filevercmp (*j, *i) == 0); + } + return 0; } -- 2.32.0