bug-coreutils
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: misalignment in ls -l in fr_FR locale


From: Pádraig Brady
Subject: Re: misalignment in ls -l in fr_FR locale
Date: Fri, 27 Mar 2009 12:14:11 +0000
User-agent: Thunderbird 2.0.0.6 (X11/20071008)

Sorry about all the iterations.
Hopefully this version is complete.

cheers,
Pádraig.
From 55d1c2e2ea14f0713d7604323b4f43a83237897b Mon Sep 17 00:00:00 2001
From: =?utf-8?q?P=C3=A1draig=20Brady?= <address@hidden>
Date: Tue, 24 Mar 2009 14:29:21 +0000
Subject: [PATCH] ls: Fix alignment when month names have varying widths
MIME-Version: 1.0
Content-Type: text/plain; charset=utf-8
Content-Transfer-Encoding: 8bit

Reported by Samuel Thibault and Stéphane Raimbault, as the glibc fr_FR
locale has recently changed to use the official but variable width
abbreviated month names. Other glibc locales also have variable widths.
http://sourceware.org/ml/libc-locales/2008-q1/msg00035.html
http://sourceware.org/bugzilla/show_bug.cgi?id=9859
* NEWS: Mention the fix
* gl/lib/mbsalign.c: A new module to align and truncate a
string in a specified number of screen cells, while handling
multi-byte characters appropriately.
* gl/lib/mbsalign.h: Ditto
* gl/modules/mbsalign: Ditto
* bootstrap.conf: Reference the new module
* src/ls.c (abmon_init): New function, precompute the abbreviated
months aligned left in a minimum width column <= 5 screen cells.
(align_nstrftime): New function, replace the first %b in the
format specification to strftime with the precomputed month string.
Note that using the cached month strings speeds up `ls -lU` by around 17%,
on glibc-2.7-2 on Linux at least.  If this function instead used heap
storage rather than automatic storage, and snprintf rather than strcpy,
ls would slow down by 2% and 1% respectively
(i.e. a net gain of 14% rather than 17%).
---
 NEWS                |    3 +
 bootstrap.conf      |    1 +
 gl/lib/mbsalign.c   |  188 +++++++++++++++++++++++++++++++++++++++++++++++++++
 gl/lib/mbsalign.h   |   23 ++++++
 gl/modules/mbsalign |   25 +++++++
 src/ls.c            |  104 +++++++++++++++++++++++++++-
 6 files changed, 341 insertions(+), 3 deletions(-)
 create mode 100644 gl/lib/mbsalign.c
 create mode 100644 gl/lib/mbsalign.h
 create mode 100644 gl/modules/mbsalign

diff --git a/NEWS b/NEWS
index 3ef5f94..6538071 100644
--- a/NEWS
+++ b/NEWS
@@ -28,6 +28,9 @@ GNU coreutils NEWS                                    -*- outline -*-
   ls --sort=version (-v) sorted names beginning with "." inconsistently.
   Now, names that start with "." are always listed before those that don't.
 
+  ls now aligns output correctly in the presence of abbreviated month names
+  from the locale database that have differing widths.
+
   pr: fix the bug whereby --indent=N (-o) did not indent header lines
   [bug introduced in coreutils-6.9.90]
 
diff --git a/bootstrap.conf b/bootstrap.conf
index 0747bb8..ae033d9 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -70,6 +70,7 @@ gnulib_modules="
        long-options lstat malloc
        manywarnings
        mbrtowc
+       mbsalign
        mbswidth
        memcasecmp memcmp2 mempcpy
        memrchr mgetgroups
diff --git a/gl/lib/mbsalign.c b/gl/lib/mbsalign.c
new file mode 100644
index 0000000..1079a71
--- /dev/null
+++ b/gl/lib/mbsalign.c
@@ -0,0 +1,188 @@
+/* Align/Truncate a string in a given screen width
+   Copyright (C) 2009 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/* Written by Pádraig Brady.  */
+
+#include <config.h>
+#include "mbsalign.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <wchar.h>
+#include <wctype.h>
+
+/* Overwrite every non printable wide character in the NUL terminated
+   string WCHARS with U+FFFD.  Return true if anything was replaced.  */
+
+static bool
+wc_ensure_printable (wchar_t *wchars)
+{
+  bool changed = false;
+
+  for (wchar_t *p = wchars; *p != L'\0'; p++)
+    {
+      if (iswprint ((wint_t) *p))
+        continue;
+
+      *p = 0xFFFD; /* L'\uFFFD' (replacement char) */
+      changed = true;
+    }
+
+  return changed;
+}
+
+/* Truncate the wide string WC to at most WIDTH screen cells, replacing non
+   printable characters with U+FFFD.  Return the number of cells kept.  */
+
+static size_t
+wc_truncate (wchar_t *wc, size_t width)
+{
+  size_t used = 0;
+
+  for (; *wc != L'\0'; wc++)
+    {
+      int w = wcwidth (*wc);
+
+      if (w == -1) /* non printable */
+        {
+          *wc = 0xFFFD; /* L'\uFFFD' (replacement char) */
+          w = 1; /* the replacement is counted as a single cell */
+        }
+      if (used + w > width)
+        break;
+      used += w;
+    }
+
+  *wc = L'\0'; /* cut here: either the original end or the first misfit */
+  return used;
+}
+
+/* FIXME: move this function to gnulib as it's missing on:
+   OpenBSD 3.8, IRIX 5.3, Solaris 2.5.1, mingw, BeOS.
+   Return the number of screen cells taken by the first N wide characters
+   of S (stopping at a NUL), or -1 if one of them is non printable.  */
+static int
+rpl_wcswidth (const wchar_t *s, size_t n)
+{
+  int total = 0;
+
+  for (size_t i = 0; i < n && s[i] != L'\0'; i++)
+    {
+      int w = wcwidth (s[i]);
+      if (w == -1) /* non printable  */
+        return -1;
+      total += w;
+    }
+  return total;
+}
+
+/* Align SRC in *WIDTH screen cells, handling multi-byte characters, and
+   truncate it to fit if it is too wide.  When centering, the number of
+   trailing spaces may be 1 less than the number of leading spaces.
+   Return the number of bytes written to or required in DEST (without
+   the trailing NUL); a value >= DEST_SIZE means there wasn't enough space.
+   Return a negative value on error.  *WIDTH specifies the width to
+   align/pad/truncate to, and is updated to the width used before padding.  */
+
+int
+mbsalign (const char *src, char *dest, size_t dest_size,
+          size_t *width, mbs_align_t align)
+{
+  int ret = -1;
+  size_t src_size = strlen (src) + 1;
+  char *newstr = NULL;
+  wchar_t *str_wc = NULL;
+  const char *str_to_print = src;
+  size_t n_used = src_size - 1;
+  int n_spaces = 0;  /* int, as required for a printf "%*s" width.  */
+  bool conversion = false;
+  bool wc_enabled = false;
+
+  if (MB_CUR_MAX > 1)
+    {
+      size_t src_chars = mbstowcs (NULL, src, 0);
+      if (src_chars == (size_t) -1)
+        goto mbsalign_cleanup;
+      src_chars += 1; /* make space for NUL */
+      str_wc = malloc (src_chars * sizeof (wchar_t));
+      if (str_wc == NULL)
+        goto mbsalign_cleanup;
+      if (mbstowcs (str_wc, src, src_chars) > 0)
+        {
+          str_wc[src_chars - 1] = L'\0';
+          wc_enabled = true;
+          conversion = wc_ensure_printable (str_wc);
+          n_used = rpl_wcswidth (str_wc, src_chars);
+        }
+    }
+
+  if (conversion || (n_used > *width))
+    {
+      if (wc_enabled)
+        {
+          /* U+FFFD can need more bytes than the char it replaced, so
+             size the copy from the final wide string (0 on failure).  */
+          n_used = wc_truncate (str_wc, *width);
+          src_size = wcstombs (NULL, str_wc, 0) + 1;
+        }
+      newstr = src_size ? malloc (src_size) : NULL;
+      if (newstr == NULL)
+        goto mbsalign_cleanup;
+      str_to_print = newstr;
+      if (wc_enabled)
+        wcstombs (newstr, str_wc, src_size);
+      else
+        {
+          n_used = *width;
+          memcpy (newstr, src, *width);
+          newstr[*width] = '\0';
+        }
+    }
+
+  if (*width > n_used)
+    n_spaces = (int) (*width - n_used);
+  *width = n_used;  /* indicate to caller how many cells needed.  */
+
+  /* FIXME: Should I be padding with "figure space" (\u2007)
+     rather than spaces below? (only if non ascii data present).  */
+  switch (align)
+    {
+    case MBS_ALIGN_CENTER:
+      ret = snprintf (dest, dest_size, "%*s%s%*s",
+                      n_spaces / 2 + n_spaces % 2, "",
+                      str_to_print, n_spaces / 2, "");
+      break;
+    case MBS_ALIGN_LEFT:
+      ret = snprintf (dest, dest_size, "%s%*s", str_to_print, n_spaces, "");
+      break;
+    case MBS_ALIGN_RIGHT:
+      ret = snprintf (dest, dest_size, "%*s%s", n_spaces, "", str_to_print);
+      break;
+    }
+
+mbsalign_cleanup:
+  free (str_wc);
+  free (newstr);
+  return ret;
+}
+
+/*
+ * Local variables:
+ *  indent-tabs-mode: nil
+ * End:
+ */
diff --git a/gl/lib/mbsalign.h b/gl/lib/mbsalign.h
new file mode 100644
index 0000000..69e3966
--- /dev/null
+++ b/gl/lib/mbsalign.h
@@ -0,0 +1,23 @@
+/* Align/Truncate a string in a given screen width
+   Copyright (C) 2009 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#include <stddef.h>
+
+typedef enum { MBS_ALIGN_LEFT, MBS_ALIGN_RIGHT, MBS_ALIGN_CENTER } mbs_align_t;
+
+int
+mbsalign (const char *src, char *dest, size_t dest_size,
+          size_t *width, mbs_align_t align);
diff --git a/gl/modules/mbsalign b/gl/modules/mbsalign
new file mode 100644
index 0000000..f5845dc
--- /dev/null
+++ b/gl/modules/mbsalign
@@ -0,0 +1,25 @@
+Description:
+Align/Truncate a string in a given screen width.
+
+Files:
+lib/mbsalign.c
+lib/mbsalign.h
+
+Depends-on:
+wchar
+wctype
+wcwidth
+
+configure.ac:
+
+Makefile.am:
+lib_SOURCES += mbsalign.c mbsalign.h
+
+Include:
+"mbsalign.h"
+
+License:
+LGPL
+
+Maintainer:
+Pádraig Brady
diff --git a/src/ls.c b/src/ls.c
index d30e5a0..c55f525 100644
--- a/src/ls.c
+++ b/src/ls.c
@@ -67,6 +67,10 @@
 #include <selinux/selinux.h>
 #include <wchar.h>
 
+#if HAVE_LANGINFO_CODESET
+# include <langinfo.h>
+#endif
+
 /* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
    present.  */
 #ifndef SA_NOCLDSTOP
@@ -105,6 +109,7 @@
 #include "strftime.h"
 #include "xstrtol.h"
 #include "areadlink.h"
+#include "mbsalign.h"
 
 #define PROGRAM_NAME (ls_mode == LS_LS ? "ls" \
                      : (ls_mode == LS_MULTI_COL \
@@ -695,6 +700,11 @@ static char const *long_time_format[2] =
        screen columns small, because many people work in windows with
        only 80 columns.  But make this as wide as the other string
        below, for recent files.  */
+    /* TRANSLATORS: ls output needs to be aligned for ease of reading,
+       so be wary of using variable width fields from the locale.
+       Note %b is handled specially by ls and aligned correctly.
+       Note also that specifying a width as in %5b is erroneous as strftime
+       will count bytes rather than characters in multibyte locales.  */
     N_("%b %e  %Y"),
     /* strftime format for recent files (younger than 6 months), in -l
        output.  This should contain the month, day and time (at
@@ -703,6 +713,11 @@ static char const *long_time_format[2] =
        screen columns small, because many people work in windows with
        only 80 columns.  But make this as wide as the other string
        above, for non-recent files.  */
+    /* TRANSLATORS: ls output needs to be aligned for ease of reading,
+       so be wary of using variable width fields from the locale.
+       Note %b is handled specially by ls and aligned correctly.
+       Note also that specifying a width as in %5b is erroneous as strftime
+       will count bytes rather than characters in multibyte locales.  */
     N_("%b %e %H:%M")
   };
 
@@ -978,6 +993,56 @@ dired_dump_obstack (const char *prefix, struct obstack *os)
     }
 }
 
+/* Read the abbreviated month names from the locale, to align them
+   and to determine the max width of the field and to truncate names
+   greater than our max allowed.
+   Note even though this handles multibyte locales correctly
+   it's not restricted to them as single byte locales can have
+   variable width abbreviated months and also precomputing/caching
+   the names was seen to increase the performance of ls significantly.  */
+
+/* max number of display cells to use */
+enum { MAX_MON_WIDTH = 5 };
+/* In the unlikely event that the abmon[] storage is not big enough
+   an error message will be displayed, and we revert to using
+   unmodified abbreviated month names from the locale database.  */
+static char abmon[12][MAX_MON_WIDTH * 2 * MB_LEN_MAX + 1];
+/* minimum width needed to align %b, 0 => don't use precomputed values.  */
+static size_t required_mon_width;
+
+static size_t
+abmon_init (void)
+{
+#ifdef HAVE_NL_LANGINFO
+  required_mon_width = MAX_MON_WIDTH;
+  size_t curr_max_width;
+  /* Realign with a narrower column until the width is the minimum needed.  */
+  do
+    {
+      curr_max_width = required_mon_width;
+      required_mon_width = 0;
+      for (int i = 0; i < 12; i++)
+       {
+         size_t width = curr_max_width;
+         int req = mbsalign (nl_langinfo (ABMON_1 + i),
+                             abmon[i], sizeof (abmon[i]),
+                             &width, MBS_ALIGN_LEFT);
+
+         /* mbsalign reports errors with any negative value, not just -1.  */
+         if (req < 0 || (size_t) req >= sizeof (abmon[i]))
+           {
+             required_mon_width = 0; /* ignore precomputed strings.  */
+             return required_mon_width;
+           }
+         required_mon_width = MAX (required_mon_width, width);
+       }
+    }
+  while (curr_max_width > required_mon_width);
+#endif
+
+  return required_mon_width;
+}
+
 static size_t
 dev_ino_hash (void const *x, size_t table_size)
 {
@@ -1953,6 +2018,10 @@ decode_switches (int argc, char **argv)
                  }
              }
          }
+      /* Note we leave %5b etc. alone so user widths/flags are honoured.  */
+      if (strstr(long_time_format[0],"%b") || strstr(long_time_format[1],"%b"))
+       if (!abmon_init())
+         error (0, 0, _("error initializing month strings"));
     }
 
   return optind;
@@ -3317,6 +3386,35 @@ print_current_files (void)
     }
 }
 
+/* Like nstrftime (SRC, SIZE, FMT, TM, UTC, NS), but first replace
+   the first %b in FMT with the precomputed aligned month name.
+   Note on glibc-2.7 on linux at least this speeds up the whole `ls -lU`
+   process by around 17%, compared to letting strftime() handle the %b.
+   UTC and NS are passed through to nstrftime, whose result is returned.  */
+
+static size_t
+align_nstrftime (char *src, size_t size, char const *fmt, struct tm const *tm,
+                int utc, int ns)
+{
+  const char *nfmt = fmt;
+  /* In the unlikely event that rpl_fmt below is not large enough,
+     the replacement is not done.  A malloc here slows ls down by 2%  */
+  char rpl_fmt[sizeof (abmon[0]) + 100];
+  char const *pb = required_mon_width ? strstr (fmt, "%b") : NULL;
+
+  /* The "+ 2" accounts for the two bytes of "%b" that get dropped.  */
+  if (pb && strlen (fmt) < (sizeof (rpl_fmt) - sizeof (abmon[0]) + 2))
+    {
+      char *pfmt = rpl_fmt;
+      nfmt = rpl_fmt;
+
+      pfmt = mempcpy (pfmt, fmt, pb - fmt);
+      pfmt = stpcpy (pfmt, abmon[tm->tm_mon]);
+      strcpy (pfmt, pb + 2);
+    }
+  return nstrftime (src, size, nfmt, tm, utc, ns);
+}
+
 /* Return the expected number of columns in a long-format time stamp,
    or zero if it cannot be calculated.  */
 
@@ -3341,7 +3439,7 @@ long_time_expected_width (void)
       if (tm)
        {
          size_t len =
-           nstrftime (buf, sizeof buf, long_time_format[0], tm, 0, 0);
+           align_nstrftime (buf, sizeof buf, long_time_format[0], tm, 0, 0);
          if (len != 0)
            width = mbsnwidth (buf, len, 0);
        }
@@ -3616,8 +3714,8 @@ print_long_format (const struct fileinfo *f)
 
       /* We assume here that all time zones are offset from UTC by a
         whole number of seconds.  */
-      s = nstrftime (p, TIME_STAMP_LEN_MAXIMUM + 1, fmt,
-                    when_local, 0, when_timespec.tv_nsec);
+      s = align_nstrftime (p, TIME_STAMP_LEN_MAXIMUM + 1, fmt,
+                          when_local, 0, when_timespec.tv_nsec);
     }
 
   if (s || !*p)
-- 
1.5.3.6


reply via email to

[Prev in Thread] Current Thread [Next in Thread]