>From 9e89c5f3e9c5652df2cfbf46b3e1f9608cb0092f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1draig=20Brady?= Date: Sun, 10 Oct 2021 18:35:59 +0100 Subject: [PATCH] sort: --debug: add warnings about sign, radix, and grouping chars MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New warnings are added related to the handling of thousands grouping characters, and decimal points. Examples now diagnosed are: $ printf '0,9\n1,a\n' | sort -nk1 --debug -t, -s sort: key 1 is numeric and spans multiple fields sort: field separator ‘,’ is treated as a group separator in numbers 1,a _ 0,9 ___ $ printf '1,a\n0,9\n' | LC_ALL=fr_FR.utf8 sort -gk1 --debug -t, -s sort: key 1 is numeric and spans multiple fields sort: field separator ‘,’ is treated as a decimal point in numbers 0,9 ___ 1,a __ $ printf '1.0\n0.9\n' | LC_ALL=fr_FR.utf8 sort -s -k1,1g --debug sort: note numbers use ‘,’ as a decimal point in this locale 0.9 _ 1.0 _ $ LC_ALL=fr_FR.utf8 sort -n --debug /dev/null sort: text ordering performed using ‘fr_FR.utf8’ sorting rules sort: note numbers use ‘,’ as a decimal point in this locale sort: the multi-byte number group separator in this locale \ is not supported $ sort --debug -t- -k1n /dev/null sort: key 1 is numeric and spans multiple fields sort: field separator ‘-’ is treated as a minus sign in numbers sort: note numbers use ‘.’ as a decimal point in this locale $ sort --debug -t+ -k1g /dev/null sort: key 1 is numeric and spans multiple fields sort: field separator ‘+’ is treated as a plus sign in numbers sort: note numbers use ‘.’ as a decimal point in this locale * src/sort.c (key_warnings): Add the warnings above. * tests/misc/sort-debug-warn.sh: Add test cases. * NEWS: Mention the improvement. Addresses https://bugs.gnu.org/51011 --- NEWS | 7 ++- src/sort.c | 90 ++++++++++++++++++++++++++++++++++- tests/misc/sort-debug-warn.sh | 51 +++++++++++++++++++- 3 files changed, 144 insertions(+), 4 deletions(-) diff --git a/NEWS b/NEWS index 086da03ae..1097f0c32 100644 --- a/NEWS +++ b/NEWS @@ -11,10 +11,15 @@ GNU coreutils NEWS -*- outline -*- ** Changes in behavior timeout --foreground --kill-after=... will now exit with status 137 - if the kill signal was sent, which is consistent with the behaviour + if the kill signal was sent, which is consistent with the behavior when the --foreground option is not specified. This allows users to distinguish if the command was more forcefully terminated. +** Improvements + + sort --debug now diagnoses issues with --field-separator characters + that conflict with characters possibly used in numbers. + * Noteworthy changes in release 9.0 (2021-09-24) [stable] diff --git a/src/sort.c b/src/sort.c index d32dcfb02..c75281661 100644 --- a/src/sort.c +++ b/src/sort.c @@ -156,6 +156,8 @@ static char decimal_point; /* Thousands separator; if outside char range, there is no separator. */ static int thousands_sep; +/* We currently ignore multi-byte grouping chars. */ +static bool thousands_sep_ignored; /* Nonzero if the corresponding locales are hard. */ static bool hard_LC_COLLATE; @@ -2425,9 +2427,21 @@ key_warnings (struct keyfield const *gkey, bool gkey_only) struct keyfield const *key; struct keyfield ugkey = *gkey; unsigned long keynum = 1; + bool basic_numeric_field = false; + bool general_numeric_field = false; + bool basic_numeric_field_span = false; + bool general_numeric_field_span = false; for (key = keylist; key; key = key->next, keynum++) { + if (key_numeric (key)) + { + if (key->general_numeric) + general_numeric_field = true; + else + basic_numeric_field = true; + } + if (key->traditional_used) { size_t sword = key->sword; @@ -2482,8 +2496,14 @@ key_warnings (struct keyfield const *gkey, bool gkey_only) if (!sword) sword++; if (!eword || sword < eword) - error (0, 0, _("key %lu is numeric and spans multiple fields"), - keynum); + { + error (0, 0, _("key %lu is numeric and spans multiple fields"), + keynum); + if (key->general_numeric) + general_numeric_field_span = true; + else + basic_numeric_field_span = true; + } } /* Flag global options not copied or specified in any key. */ @@ -2502,6 +2522,70 @@ key_warnings (struct keyfield const *gkey, bool gkey_only) ugkey.reverse &= !key->reverse; } + /* Explicitly warn if field delimiters in this locale + don't constrain numbers. */ + bool number_locale_warned = false; + if (basic_numeric_field_span) + { + if (tab == TAB_DEFAULT + ? thousands_sep != NON_CHAR && (isblank (to_uchar (thousands_sep))) + : tab == thousands_sep) + { + error (0, 0, + _("field separator %s is treated as a " + "group separator in numbers"), + quote (((char []) {thousands_sep, 0}))); + number_locale_warned = true; + } + } + if (basic_numeric_field_span || general_numeric_field_span) + { + if (tab == TAB_DEFAULT + ? thousands_sep != NON_CHAR && (isblank (to_uchar (decimal_point))) + : tab == decimal_point) + { + error (0, 0, + _("field separator %s is treated as a " + "decimal point in numbers"), + quote (((char []) {decimal_point, 0}))); + number_locale_warned = true; + } + else if (tab == '-') + { + error (0, 0, + _("field separator %s is treated as a " + "minus sign in numbers"), + quote (((char []) {tab, 0}))); + } + else if (general_numeric_field_span && tab == '+') + { + error (0, 0, + _("field separator %s is treated as a " + "plus sign in numbers"), + quote (((char []) {tab, 0}))); + } + } + + /* Explicitly indicate the decimal point used in this locale, + as it suggests that robust scripts need to consider + setting the locale when comparing numbers. */ + if ((basic_numeric_field || general_numeric_field) && ! number_locale_warned) + { + error (0, 0, + _("%snumbers use %s as a decimal point in this locale"), + tab == decimal_point ? "" : _("note "), + quote (((char []) {decimal_point, 0}))); + + } + + if (basic_numeric_field) + { + if (thousands_sep_ignored) + error (0, 0, + _("the multi-byte number group separator " + "in this locale is not supported")); + } + /* Warn about ignored global options flagged above. This clears all flags if UGKEY is the only one in the list. */ if (!default_key_compare (&ugkey) @@ -4238,6 +4322,8 @@ main (int argc, char **argv) /* FIXME: add support for multibyte thousands separators. */ thousands_sep = locale->thousands_sep[0]; + if (thousands_sep && locale->thousands_sep[1]) + thousands_sep_ignored = true; if (! thousands_sep || locale->thousands_sep[1]) thousands_sep = NON_CHAR; } diff --git a/tests/misc/sort-debug-warn.sh b/tests/misc/sort-debug-warn.sh index bd1c4d8a1..982b47c15 100755 --- a/tests/misc/sort-debug-warn.sh +++ b/tests/misc/sort-debug-warn.sh @@ -26,30 +26,39 @@ sort: key 1 has zero width and will be ignored 2 sort: text ordering performed using simple byte comparison sort: key 1 has zero width and will be ignored +sort: note numbers use '.' as a decimal point in this locale 3 sort: text ordering performed using simple byte comparison sort: key 1 is numeric and spans multiple fields +sort: note numbers use '.' as a decimal point in this locale 4 sort: text ordering performed using simple byte comparison +sort: note numbers use '.' as a decimal point in this locale sort: options '-bghMRrV' are ignored 5 sort: text ordering performed using simple byte comparison +sort: note numbers use '.' as a decimal point in this locale sort: options '-bghMRV' are ignored sort: option '-r' only applies to last-resort comparison 6 sort: text ordering performed using simple byte comparison +sort: note numbers use '.' as a decimal point in this locale sort: option '-r' only applies to last-resort comparison 7 sort: text ordering performed using simple byte comparison sort: leading blanks are significant in key 2; consider also specifying 'b' +sort: note numbers use '.' as a decimal point in this locale sort: options '-bg' are ignored 8 sort: text ordering performed using simple byte comparison +sort: note numbers use '.' as a decimal point in this locale 9 sort: text ordering performed using simple byte comparison +sort: note numbers use '.' as a decimal point in this locale sort: option '-b' is ignored 10 sort: text ordering performed using simple byte comparison +sort: note numbers use '.' as a decimal point in this locale 11 sort: text ordering performed using simple byte comparison sort: leading blanks are significant in key 1; consider also specifying 'b' @@ -106,7 +115,6 @@ echo 16 >> out sort -rM --debug /dev/null 2>>out #no warning echo 17 >> out sort -rM -k1,1 --debug /dev/null 2>>out #no warning - compare exp out || fail=1 @@ -130,6 +138,7 @@ sort: text ordering performed using simple byte comparison sort: key 1 is numeric and spans multiple fields sort: obsolescent key '+2 -1' used; consider '-k 3,1' instead sort: key 2 has zero width and will be ignored +sort: note numbers use '.' as a decimal point in this locale sort: option '-b' is ignored sort: option '-r' only applies to last-resort comparison EOF @@ -138,4 +147,44 @@ sort --debug -rb -k2n +2.2 -1b /dev/null 2>out compare exp out || fail=1 + +# check sign, decimal point, and grouping character warnings +cat <<\EOF > exp +sort: text ordering performed using simple byte comparison +sort: key 1 is numeric and spans multiple fields +sort: field separator ',' is treated as a group separator in numbers +EOF + +if test $(printf '0,9\n0,8\n' | sort -ns | tail -n1) = '0,9'; then + # thousands_sep == , + sort -nk1 -t, --debug /dev/null 2>out + compare exp out || fail=1 +fi + +cat <<\EOF > exp +sort: text ordering performed using simple byte comparison +sort: key 1 is numeric and spans multiple fields +sort: field separator '.' is treated as a decimal point in numbers +EOF +sort -nk1 -t. --debug /dev/null 2>out +compare exp out || fail=1 + +cat <<\EOF > exp +sort: text ordering performed using simple byte comparison +sort: key 1 is numeric and spans multiple fields +sort: field separator '-' is treated as a minus sign in numbers +sort: note numbers use '.' as a decimal point in this locale +EOF +sort -nk1 -t- --debug /dev/null 2>out +compare exp out || fail=1 + +cat <<\EOF > exp +sort: text ordering performed using simple byte comparison +sort: key 1 is numeric and spans multiple fields +sort: field separator '+' is treated as a plus sign in numbers +sort: note numbers use '.' as a decimal point in this locale +EOF +sort -gk1 -t+ --debug /dev/null 2>out +compare exp out || fail=1 + Exit $fail -- 2.26.2