From 11b01fc21f1dff2685477c03596a0a4009aec7da Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Mon, 30 Oct 2023 00:32:51 -0700 Subject: [PATCH 09/11] join,uniq: support multi-byte separators MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * NEWS: Mention this. * bootstrap.conf (gnulib_modules): Remove cu-ctype, as this module is now more trouble than it’s worth. All uses removed. Add skipchars. * gl/lib/cu-ctype.c, gl/lib/cu-ctype.h, gl/modules/cu-ctype: Remove. * gl/lib/skipchars.c, gl/lib/skipchars.h, gl/modules/skipchars: * tests/misc/join-utf8.sh: New files. * src/join.c: Include skipchars.h and mcel.h instead of cu-ctype.h. (tab): Now mcel_t, not int. All uses changed. (output_separator, output_seplen): New static vars. (eq_tab, newline_or_blank, comma_or_blank): New functions. (xfields, prfields, prjoin, add_field_list, main): Support multi-byte characters. * src/numfmt.c: Include ctype.h, skipchars.h. Do not include cu-ctype.h. (newline_or_blank): New function. (next_field): Support multi-byte characters. * src/sort.c: Include ctype.h instead of cu-ctype.h. (inittables): Open-code field_sep since it no longer exists. ‘sort’ is not multi-byte safe yet, but when it is this code will need revamping anyway. * src/uniq.c: Include mcel.h and skipchars.h instead of cu-ctype.h. (newline_or_blank): New function. (find_field): Support multi-byte characters. 
* tests/local.mk (all_tests): Add tests/misc/join-utf8.sh --- NEWS | 5 ++ bootstrap.conf | 2 +- gl/lib/cu-ctype.c | 3 - gl/lib/cu-ctype.h | 35 ------------ gl/lib/skipchars.c | 3 + gl/lib/skipchars.h | 56 +++++++++++++++++++ gl/modules/cu-ctype | 24 -------- gl/modules/skipchars | 24 ++++++++ src/join.c | 119 +++++++++++++++++++++++++--------------- src/numfmt.c | 16 ++++-- src/sort.c | 6 +- src/uniq.c | 27 +++++---- tests/local.mk | 1 + tests/misc/join-utf8.sh | 51 +++++++++++++++++ 14 files changed, 244 insertions(+), 128 deletions(-) delete mode 100644 gl/lib/cu-ctype.c delete mode 100644 gl/lib/cu-ctype.h create mode 100644 gl/lib/skipchars.c create mode 100644 gl/lib/skipchars.h delete mode 100644 gl/modules/cu-ctype create mode 100644 gl/modules/skipchars create mode 100755 tests/misc/join-utf8.sh diff --git a/NEWS b/NEWS index 3021211dc..b1088f683 100644 --- a/NEWS +++ b/NEWS @@ -8,6 +8,11 @@ GNU coreutils NEWS -*- outline -*- to preserve ownership" when copying to GNU/Linux CIFS file systems. They do this by working around some Linux CIFS bugs. + join and uniq now support multi-byte characters better. + For example, 'join -tX' now works even if X is a multi-byte character, + and both programs now treat multi-byte characters like U+3000 + IDEOGRAPHIC SPACE as blanks if the current locale treats them so. + numfmt options like --suffix no longer have an arbitrary 127-byte limit. 
[bug introduced with numfmt in coreutils-8.21] diff --git a/bootstrap.conf b/bootstrap.conf index 4724544d7..97645d6f0 100644 --- a/bootstrap.conf +++ b/bootstrap.conf @@ -70,7 +70,6 @@ gnulib_modules=" crypto/sha256 crypto/sha512 crypto/sm3 - cu-ctype cycle-check d-ino d-type @@ -241,6 +240,7 @@ gnulib_modules=" settime sig2str sigaction + skipchars smack ssize_t stat-macros diff --git a/gl/lib/cu-ctype.c b/gl/lib/cu-ctype.c deleted file mode 100644 index 9f753de2e..000000000 --- a/gl/lib/cu-ctype.c +++ /dev/null @@ -1,3 +0,0 @@ -#include -#define CU_CTYPE_INLINE _GL_EXTERN_INLINE -#include diff --git a/gl/lib/cu-ctype.h b/gl/lib/cu-ctype.h deleted file mode 100644 index 82f1d73f2..000000000 --- a/gl/lib/cu-ctype.h +++ /dev/null @@ -1,35 +0,0 @@ -/* Character type definitions for coreutils - - Copyright 2023 Free Software Foundation, Inc. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . */ - -#include - -#ifndef _GL_INLINE_HEADER_BEGIN -# error "Please include config.h first." -#endif -_GL_INLINE_HEADER_BEGIN -#ifndef CU_CTYPE_INLINE -# define CU_CTYPE_INLINE _GL_INLINE -#endif - -/* '\n' is considered a field separator with --zero-terminated. 
*/ -CU_CTYPE_INLINE bool -field_sep (unsigned char ch) -{ - return isblank (ch) || ch == '\n'; -} - -_GL_INLINE_HEADER_END diff --git a/gl/lib/skipchars.c b/gl/lib/skipchars.c new file mode 100644 index 000000000..827c89d45 --- /dev/null +++ b/gl/lib/skipchars.c @@ -0,0 +1,3 @@ +#include +#define SKIPCHARS_INLINE _GL_EXTERN_INLINE +#include diff --git a/gl/lib/skipchars.h b/gl/lib/skipchars.h new file mode 100644 index 000000000..baa9eaba6 --- /dev/null +++ b/gl/lib/skipchars.h @@ -0,0 +1,56 @@ +/* Skipping sequences of characters satisfying a predicate + + Copyright 2023 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#include "mcel.h" + +_GL_INLINE_HEADER_BEGIN +#ifndef SKIPCHARS_INLINE +# define SKIPCHARS_INLINE _GL_INLINE +#endif + +/* Return the address just past the leading sequence of possibly + multi-byte characters or encoding errors G in STR that satisfy + PREDICATE (G) if OK is true, or that do not satisfy the predicate + call if OK is false. 
*/ + +SKIPCHARS_INLINE char * +skip_str_matching (char const *str, bool (*predicate) (mcel_t), bool ok) +{ + char const *s = str; + for (mcel_t g; *s && predicate (g = mcel_scanz (s)) == ok; + s += g.len) + continue; + return (char *) s; +} + +/* Return the address just past the leading sequence of possibly + multi-byte characters or encoding errors G in BUF (which ends at LIM) + that satisfy PREDICATE (G) if OK is true, or that do not satisfy + the predicate call if OK is false. */ + +SKIPCHARS_INLINE char * +skip_buf_matching (char const *buf, char const *lim, + bool (*predicate) (mcel_t), bool ok) +{ + char const *s = buf; + for (mcel_t g; s < lim && predicate (g = mcel_scan (s, lim)) == ok; + s += g.len) + continue; + return (char *) s; +} + +_GL_INLINE_HEADER_END diff --git a/gl/modules/cu-ctype b/gl/modules/cu-ctype deleted file mode 100644 index bd328b32e..000000000 --- a/gl/modules/cu-ctype +++ /dev/null @@ -1,24 +0,0 @@ -Description: -ctype.h-like definitions for coreutils - -Files: -lib/cu-ctype.c -lib/cu-ctype.h - -Depends-on: -ctype -extern-inline - -configure.ac: - -Makefile.am: -lib_SOURCES += cu-ctype.c - -Include: -"cu-ctype.h" - -License: -GPL - -Maintainer: -all diff --git a/gl/modules/skipchars b/gl/modules/skipchars new file mode 100644 index 000000000..3b25fd6eb --- /dev/null +++ b/gl/modules/skipchars @@ -0,0 +1,24 @@ +Description: +Skip sequences of multi-byte characters or encoding errors + +Files: +lib/skipchars.c +lib/skipchars.h + +Depends-on: +extern-inline +mcel + +configure.ac: + +Makefile.am: +lib_SOURCES += skipchars.c + +Include: +"skipchars.h" + +License: +GPL + +Maintainer: +all diff --git a/src/join.c b/src/join.c index b95cf2b9b..b3ad27465 100644 --- a/src/join.c +++ b/src/join.c @@ -23,12 +23,13 @@ #include "system.h" #include "assure.h" -#include "cu-ctype.h" #include "fadvise.h" #include "hard-locale.h" #include "linebuffer.h" +#include "mcel.h" #include "memcasecmp.h" #include "quote.h" +#include "skipchars.h" #include 
"stdio--.h" #include "xmemcoll.h" #include "xstrtol.h" @@ -135,10 +136,14 @@ static struct outlist outlist_head; /* Last element in 'outlist', where a new element can be added. */ static struct outlist *outlist_end = &outlist_head; -/* Tab character separating fields. If negative, fields are separated - by any nonempty string of blanks, otherwise by exactly one - tab character whose value (when cast to unsigned char) equals TAB. */ -static int tab = -1; +/* Tab character (or encoding error) separating fields. If TAB.len == 0, + fields are separated by any nonempty string of blanks, otherwise by + exactly one tab character (or encoding error) equal to TAB. */ +static mcel_t tab; + +/* The output separator to use, and its length in bytes. */ +static char const *output_separator = " "; +static idx_t output_seplen = 1; /* If nonzero, check that the input is correctly ordered. */ static enum @@ -267,6 +272,18 @@ extract_field (struct line *line, char *field, idx_t len) ++(line->nfields); } +static bool +eq_tab (mcel_t g) +{ + return mcel_cmp (g, tab) == 0; +} + +static bool +newline_or_blank (mcel_t g) +{ + return g.ch == '\n' || c32isblank (g.ch); +} + /* Fill in the 'fields' structure in LINE. */ static void @@ -278,34 +295,29 @@ xfields (struct line *line) if (ptr == lim) return; - if (0 <= tab && tab != '\n') - { - char *sep; - for (; (sep = memchr (ptr, tab, lim - ptr)) != nullptr; ptr = sep + 1) - extract_field (line, ptr, sep - ptr); - } - else if (tab < 0) + if (!tab.len) { - /* Skip leading blanks before the first field. */ - while (field_sep (*ptr)) - if (++ptr == lim) - return; - - do + while (ptr < lim) { - char *sep; - for (sep = ptr + 1; sep != lim && ! 
field_sep (*sep); sep++) - continue; + ptr = skip_buf_matching (ptr, lim, newline_or_blank, true); + if (ptr == lim) + break; + char *sep = skip_buf_matching (ptr, lim, newline_or_blank, false); extract_field (line, ptr, sep - ptr); - if (sep == lim) - return; - for (ptr = sep + 1; ptr != lim && field_sep (*ptr); ptr++) - continue; + ptr = sep; } - while (ptr != lim); } + else + { + if (tab.ch != '\n') + for (char *sep; + ((sep = skip_buf_matching (ptr, lim, eq_tab, false)) + < lim); + ptr = sep + mcel_scan (sep, lim).len) + extract_field (line, ptr, sep - ptr); - extract_field (line, ptr, lim - ptr); + extract_field (line, ptr, lim - ptr); + } } static void @@ -568,16 +580,15 @@ prfields (struct line const *line, idx_t join_field, idx_t autocount) { idx_t i; idx_t nfields = autoformat ? autocount : line->nfields; - char output_separator = tab < 0 ? ' ' : tab; for (i = 0; i < join_field && i < nfields; ++i) { - putchar (output_separator); + fwrite (output_separator, 1, output_seplen, stdout); prfield (i, line); } for (i = join_field + 1; i < nfields; ++i) { - putchar (output_separator); + fwrite (output_separator, 1, output_seplen, stdout); prfield (i, line); } } @@ -588,7 +599,6 @@ static void prjoin (struct line const *line1, struct line const *line2) { const struct outlist *outlist; - char output_separator = tab < 0 ? ' ' : tab; idx_t field; struct line const *line; @@ -622,7 +632,7 @@ prjoin (struct line const *line1, struct line const *line2) o = o->next; if (o == nullptr) break; - putchar (output_separator); + fwrite (output_separator, 1, output_seplen, stdout); } putchar (eolchar); } @@ -886,6 +896,12 @@ decode_field_spec (char const *s, int *file_index, idx_t *field_index) } } +static bool +comma_or_blank (mcel_t g) +{ + return g.ch == ',' || c32isblank (g.ch); +} + /* Add the comma or blank separated field spec(s) in STR to 'outlist'. 
*/ static void @@ -898,14 +914,17 @@ add_field_list (char *str) int file_index; idx_t field_index; char const *spec_item = p; - - p = strpbrk (p, ", \t"); - if (p) - *p++ = '\0'; + p = skip_str_matching (spec_item, comma_or_blank, false); + if (*p) + { + mcel_t g = mcel_scanz (p); + *p = '\0'; + p += g.len; + } decode_field_spec (spec_item, &file_index, &field_index); add_field (file_index, field_index); } - while (p); + while (*p); } /* Set the join field *VAR to VAL, but report an error if *VAR is set @@ -1087,20 +1106,30 @@ main (int argc, char **argv) case 't': { - unsigned char newtab = optarg[0]; - if (! newtab) - newtab = '\n'; /* '' => process the whole line. */ - else if (optarg[1]) + mcel_t newtab; + if (!*optarg) + { + /* '' => process the whole line. */ + newtab = mcel_ch ('\n', 1); + /* output_separator does not matter. */ + } + else if (STREQ (optarg, "\\0")) + { + newtab = mcel_ch ('\0', 1); + output_separator = ""; + } + else { - if (STREQ (optarg, "\\0")) - newtab = '\0'; - else + newtab = mcel_scanz (optarg); + if (optarg[newtab.len]) error (EXIT_FAILURE, 0, _("multi-character tab %s"), quote (optarg)); + output_separator = optarg; } - if (0 <= tab && tab != newtab) + if (tab.len && mcel_cmp (tab, newtab) != 0) error (EXIT_FAILURE, 0, _("incompatible tabs")); tab = newtab; + output_seplen = newtab.len; } break; diff --git a/src/numfmt.c b/src/numfmt.c index 2ce70226c..7b53c87e4 100644 --- a/src/numfmt.c +++ b/src/numfmt.c @@ -15,6 +15,7 @@ along with this program. If not, see . 
*/ #include +#include #include #include #include @@ -24,9 +25,9 @@ #include "argmatch.h" #include "c-ctype.h" -#include "cu-ctype.h" #include "mbswidth.h" #include "quote.h" +#include "skipchars.h" #include "system.h" #include "xstrtol.h" @@ -1314,6 +1315,12 @@ process_suffixed_number (char *text, long double *result, return (e == SSE_OK || e == SSE_OK_PRECISION_LOSS); } +static bool +newline_or_blank (mcel_t g) +{ + return g.ch == '\n' || c32isblank (g.ch); +} + /* Return a pointer to the beginning of the next field in line. The line pointer is moved to the end of the next field. */ static char* @@ -1334,11 +1341,8 @@ next_field (char **line) else { /* keep any space prefix in the returned field */ - while (*field_end && field_sep (*field_end)) - ++field_end; - - while (*field_end && ! field_sep (*field_end)) - ++field_end; + field_end = skip_str_matching (field_end, newline_or_blank, true); + field_end = skip_str_matching (field_end, newline_or_blank, false); } *line = field_end; diff --git a/src/sort.c b/src/sort.c index 6856e6151..829b17f42 100644 --- a/src/sort.c +++ b/src/sort.c @@ -22,6 +22,7 @@ #include +#include #include #include #include @@ -31,7 +32,6 @@ #include "system.h" #include "argmatch.h" #include "assure.h" -#include "cu-ctype.h" #include "fadvise.h" #include "filevercmp.h" #include "flexmember.h" @@ -1293,9 +1293,9 @@ inittables (void) for (i = 0; i < UCHAR_LIM; ++i) { - blanks[i] = field_sep (i); + blanks[i] = i == '\n' || isblank (i); + nondictionary[i] = ! blanks[i] && ! isalnum (i); nonprinting[i] = ! isprint (i); - nondictionary[i] = ! isalnum (i) && ! 
field_sep (i); fold_toupper[i] = toupper (i); } diff --git a/src/uniq.c b/src/uniq.c index 7e177ac5a..7dc0c999a 100644 --- a/src/uniq.c +++ b/src/uniq.c @@ -23,10 +23,11 @@ #include "system.h" #include "argmatch.h" -#include "cu-ctype.h" #include "linebuffer.h" #include "fadvise.h" +#include "mcel.h" #include "posixver.h" +#include "skipchars.h" #include "stdio--.h" #include "xstrtol.h" #include "memcasecmp.h" @@ -248,6 +249,12 @@ size_opt (char const *opt, char const *msgid) return MIN (size, SIZE_MAX); } +static bool +newline_or_blank (mcel_t g) +{ + return g.ch == '\n' || c32isblank (g.ch); +} + /* Given a linebuffer LINE, return a pointer to the beginning of the line's field to be compared. */ @@ -256,21 +263,19 @@ static char * find_field (struct linebuffer const *line) { size_t count; - char const *lp = line->buffer; - size_t size = line->length - 1; - size_t i = 0; + char *lp = line->buffer; + char const *lim = lp + line->length - 1; - for (count = 0; count < skip_fields && i < size; count++) + for (count = 0; count < skip_fields && lp < lim; count++) { - while (i < size && field_sep (lp[i])) - i++; - while (i < size && !field_sep (lp[i])) - i++; + lp = skip_buf_matching (lp, lim, newline_or_blank, true); + lp = skip_buf_matching (lp, lim, newline_or_blank, false); } - i += MIN (skip_chars, size - i); + for (size_t s = skip_chars; lp < lim && s; s--) + lp += mcel_scan (lp, lim).len; - return line->buffer + i; + return lp; } /* Return false if two strings OLD and NEW match, true if not. 
diff --git a/tests/local.mk b/tests/local.mk index 79fea1f6e..a5fb62d96 100644 --- a/tests/local.mk +++ b/tests/local.mk @@ -271,6 +271,7 @@ all_tests = \ tests/misc/mktemp.pl \ tests/misc/arch.sh \ tests/misc/join.pl \ + tests/misc/join-utf8.sh \ tests/pr/pr-tests.pl \ tests/pwd/pwd-option.sh \ tests/chcon/chcon-fail.sh \ diff --git a/tests/misc/join-utf8.sh b/tests/misc/join-utf8.sh new file mode 100755 index 000000000..b70bff7f9 --- /dev/null +++ b/tests/misc/join-utf8.sh @@ -0,0 +1,51 @@ +#!/bin/sh +# Test join in a UTF-8 locale. + +# Copyright 2023 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +. 
"${srcdir=.}/tests/init.sh"; path_prepend_ ./src +print_ver_ join + +test "${LOCALE_FR_UTF8+set}" = set || skip_ "French UTF-8 locale not available" + +LC_ALL=$LOCALE_FR_UTF8 +export LC_ALL + +fail=0 + +vertical_line='|' +multiplication_sign='×' +en_dash='–' +old_Persian_word_divider='𐏐' + +for s in \ + "$vertical_line" \ + "$multiplication_sign" \ + "$en_dash" \ + "$old_Persian_word_divider" +do + printf '0%sA\n1%sa\n2%sb\n4%sc\n' "$s" "$s" "$s" "$s" >a || + framework_failure_ + printf '0%sB\n1%sd\n3%se\n4%sf\n' "$s" "$s" "$s" "$s" >b || + framework_failure_ + join -t"$s" -a1 -a2 -eouch -o0,1.2,2.2 a b >out || fail=1 + printf '0%sA%sB\n1%sa%sd\n2%sb%souch\n3%souch%se\n4%sc%sf\n' \ + "$s" "$s" "$s" "$s" "$s" "$s" "$s" "$s" "$s" "$s" >exp || + framework_failure_ + compare exp out || fail=1 +done + +Exit $fail -- 2.39.2