>From 439a0f7fe0b89ebc371a80b30b07e3fd8b0c1b4e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C3=A1draig=20Brady?= <address@hidden>
Date: Sun, 23 Feb 2020 13:20:08 +0000
Subject: [PATCH] uniq: avoid strcoll() to improve performance and consistency

strcoll() is only significant to uniq(1) if it returns 0,
and it generally only does so with buggy locales or mismatched
locales and data.  Some systems may have strcoll()
return 0 for equivalent normalized unicode forms,
but for consistency across platforms strcoll() is avoided.
The various cases are defined in the new test.
This is consistent with newer POSIX standards as discussed at:
https://www.austingroupbugs.net/view.php?id=963

* src/uniq.c: s/xstrcoll/memcmp/.
* tests/local.mk: Reference the new test.
* tests/misc/uniq-collate.sh: Add a new test.
* NEWS: Mention the change in behavior.
Fixes https://bugs.gnu.org/38627
---
 NEWS                       |  3 ++
 src/uniq.c                 | 13 +-------
 tests/local.mk             |  1 +
 tests/misc/uniq-collate.sh | 64 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 69 insertions(+), 12 deletions(-)
 create mode 100755 tests/misc/uniq-collate.sh

diff --git a/NEWS b/NEWS
index 8a349634e..6afb9cb6d 100644
--- a/NEWS
+++ b/NEWS
@@ -65,6 +65,9 @@ GNU coreutils NEWS                                    -*- outline -*-
   [The old behavior was introduced in sh-utils 2.0.15 ca. 1999, predating
   coreutils package.]
 
+  uniq no longer uses strcoll() to determine string equivalence,
+  and so will operate more efficiently and consistently.
+
 ** New Features
 
   ls now supports the --time=birth option to display and sort by
diff --git a/src/uniq.c b/src/uniq.c
index 0fcf50a16..6608b6c62 100644
--- a/src/uniq.c
+++ b/src/uniq.c
@@ -30,7 +30,6 @@
 #include "hard-locale.h"
 #include "posixver.h"
 #include "stdio--.h"
-#include "xmemcoll.h"
 #include "xstrtol.h"
 #include "memcasecmp.h"
 #include "quote.h"
@@ -52,9 +51,6 @@
     }						\
   while (0)
 
-/* True if the LC_COLLATE locale is hard.  */
-static bool hard_LC_COLLATE;
-
 /* Number of fields to skip on each line when doing comparisons. */
 static size_t skip_fields;
 
@@ -220,7 +216,6 @@ characters.  Fields are skipped before chars.\n\
 \n\
 Note: 'uniq' does not detect repeated lines unless they are adjacent.\n\
 You may want to sort the input first, or use 'sort -u' without 'uniq'.\n\
-Also, comparisons honor the rules specified by 'LC_COLLATE'.\n\
 "), stdout);
       emit_ancillary_info (PROGRAM_NAME);
     }
@@ -293,12 +288,7 @@ different (char *old, char *new, size_t oldlen, size_t newlen)
     newlen = check_chars;
 
   if (ignore_case)
-    {
-      /* FIXME: This should invoke strcoll somehow.  */
-      return oldlen != newlen || memcasecmp (old, new, oldlen);
-    }
-  else if (hard_LC_COLLATE)
-    return xmemcoll (old, oldlen, new, newlen) != 0;
+    return oldlen != newlen || memcasecmp (old, new, oldlen);
   else
     return oldlen != newlen || memcmp (old, new, oldlen);
 }
@@ -501,7 +491,6 @@ main (int argc, char **argv)
   setlocale (LC_ALL, "");
   bindtextdomain (PACKAGE, LOCALEDIR);
   textdomain (PACKAGE);
-  hard_LC_COLLATE = hard_locale (LC_COLLATE);
 
   atexit (close_stdout);
 
diff --git a/tests/local.mk b/tests/local.mk
index bbcb9d413..0aabdaacc 100644
--- a/tests/local.mk
+++ b/tests/local.mk
@@ -438,6 +438,7 @@ all_tests =					\
   tests/misc/unexpand.pl			\
   tests/misc/uniq.pl				\
   tests/misc/uniq-perf.sh			\
+  tests/misc/uniq-collate.sh			\
   tests/misc/xattr.sh				\
   tests/misc/yes.sh				\
   tests/tail-2/wait.sh				\
diff --git a/tests/misc/uniq-collate.sh b/tests/misc/uniq-collate.sh
new file mode 100755
index 000000000..6767848a4
--- /dev/null
+++ b/tests/misc/uniq-collate.sh
@@ -0,0 +1,64 @@
+#!/bin/sh
+# before coreutils-8.32, uniq would not distinguish
+# items which compared equal with strcoll()
+# So ensure we avoid strcoll() for the following cases.
+
+# Copyright (C) 2020 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ uniq printf
+
+gen_input()
+{
+  env LC_ALL=$LOCALE_FR_UTF8 printf \
+    "$@" > in || framework_failure_
+}
+
+# strcoll() used to return 0 comparing the following strings
+# which was fixed somewhere between glibc-2.22 and glibc-2.30
+gen_input '%s\n' 'ⁿᵘˡˡ' 'ܥܝܪܐܩ' > in || framework_failure_
+test $(LC_ALL=$LOCALE_FR_UTF8 uniq < in | wc -l) = 2 || fail=1
+
+# normalization in strcoll is inconsistent across platforms.
+# glibc based systems at least do _not_ normalize in strcoll,
+# while cygwin systems for example may do so.
+# á composed, decomposed
+gen_input '\u00E1\na\u0301\n' > in || framework_failure_
+test $(LC_ALL=$LOCALE_FR_UTF8 uniq < in | wc -l) = 2 || fail=1
+# Similarly with hangul
+gen_input '\uAC01\n\u1100\u1161\u11A8' > in || framework_failure_
+test $(LC_ALL=ko_KR.utf8 uniq < in | wc -l) = 2 || fail=1
+
+# Note if running in the wrong locale, strcoll may
+# indicate the strings match when they don't
+# I.e., cjk and hangul will now work even if
+# uniq is running in the wong locale
+# hangul (ko_KR.utf8)
+gen_input '\uAC00\n\uAC01\n' > in || framework_failure_
+test $(LC_ALL=en_US.utf8 uniq < in | wc -l) = 2 || fail=1
+# CJK (zh_CN.utf8)
+gen_input '\u3400\n\u3401\n' > in || framework_failure_
+test $(LC_ALL=en_US.utf8 uniq < in | wc -l) = 2 || fail=1
+
+# Note strcoll() ignores certain characters,
+# but not if the strings are otherwise equal.
+# I.e., the following on glibc-2.30 at least
+# does not print a single item as expected,
+# but testing here for illustration
+gen_input ',a\n.a\n' > in || framework_failure_
+test $(LC_ALL=$LOCALE_FR_UTF8 uniq < in | wc -l) = 2 || fail=1
+
+Exit $fail
-- 
2.24.1