bug-coreutils
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

fix du and wc wrt --files0-from=F


From: Jim Meyering
Subject: fix du and wc wrt --files0-from=F
Date: Mon, 01 Dec 2008 20:23:23 +0100

Here are the two patches I expect to push:
[as I write this I realize that while there was already very good
test coverage of most of the modified code, I should write a test that
exercises the bug in question, at least for du: read a large list from
a file and ensure that it no longer fails in a memory-limited environment. ]

>From b1f67649bfa3c4b2c5c3b584799e49284d0ee4c1 Mon Sep 17 00:00:00 2001
From: Jim Meyering <address@hidden>
Date: Mon, 24 Nov 2008 09:55:55 +0100
Subject: [PATCH 1/2] du: read and process --files0-from= input a name at a time,

rather than by reading the entire input into RAM and *then*
processing each file name.
* src/du.c: Include "argv-iter.h", not "readtokens0.h".
(main): Rewrite to use argv-iter.
Call xfts_open on each argument, rather than on the entire
argv list at once.
Call print_size here, not from du_files.
Diagnose read failure.
* NEWS (Bug fixes): Mention it.
* THANKS: update.
Reported by Barry Kelly.  More details in
http://article.gmane.org/gmane.comp.gnu.core-utils.bugs/15159/
---
 NEWS     |    3 +
 THANKS   |    1 +
 src/du.c |  140 ++++++++++++++++++++++++++++++++++----------------------------
 3 files changed, 81 insertions(+), 63 deletions(-)

diff --git a/NEWS b/NEWS
index f0c5550..158b089 100644
--- a/NEWS
+++ b/NEWS
@@ -12,6 +12,9 @@ GNU coreutils NEWS                                    -*- 
outline -*-

   cp uses much less memory in some situations

+  du --files0-from=FILE no longer reads all of FILE into RAM before
+  processing the first file name
+
   seq 9223372036854775807 9223372036854775808 now prints only two numbers
   on systems with extended long double support and good library support.
   Even with this patch, on some systems, it still produces invalid output,
diff --git a/THANKS b/THANKS
index d06f755..f74d4fb 100644
--- a/THANKS
+++ b/THANKS
@@ -58,6 +58,7 @@ Augey Mikus                         address@hidden
 Aurelien Jarno                      address@hidden
 Austin Donnelly                     address@hidden
 Axel Kittenberger                   address@hidden
+Barry Kelly                         http://barrkel.blogspot.com/
 Bauke Jan Douma                     address@hidden
 Ben Elliston                        address@hidden
 Ben Harris                          address@hidden
diff --git a/src/du.c b/src/du.c
index e566978..6e4d28b 100644
--- a/src/du.c
+++ b/src/du.c
@@ -30,6 +30,7 @@
 #include <assert.h>
 #include "system.h"
 #include "argmatch.h"
+#include "argv-iter.h"
 #include "error.h"
 #include "exclude.h"
 #include "fprintftime.h"
@@ -37,7 +38,6 @@
 #include "human.h"
 #include "quote.h"
 #include "quotearg.h"
-#include "readtokens0.h"
 #include "same.h"
 #include "stat-time.h"
 #include "xfts.h"
@@ -649,9 +649,6 @@ du_files (char **files, int bit_flags)
       fts_close (fts);
     }

-  if (print_grand_total)
-    print_size (&tot_dui, _("total"));
-
   return ok;
 }

@@ -660,10 +657,8 @@ main (int argc, char **argv)
 {
   char *cwd_only[2];
   bool max_depth_specified = false;
-  char **files;
   bool ok = true;
   char *files_from = NULL;
-  struct Tokens tok;

   /* Bit flags that control how fts works.  */
   int bit_flags = FTS_TIGHT_CYCLE_CHECK;
@@ -926,6 +921,7 @@ main (int argc, char **argv)
         }
     }

+  struct argv_iterator *ai;
   if (files_from)
     {
       /* When using --files0-from=F, you may not specify any files
@@ -942,78 +938,96 @@ main (int argc, char **argv)
        error (EXIT_FAILURE, errno, _("cannot open %s for reading"),
               quote (files_from));

-      readtokens0_init (&tok);
-
-      if (! readtokens0 (stdin, &tok) || fclose (stdin) != 0)
-       error (EXIT_FAILURE, 0, _("cannot read file names from %s"),
-              quote (files_from));
-
-      files = tok.tok;
+      ai = argv_iter_init_stream (stdin);
     }
   else
     {
-      files = (optind < argc ? argv + optind : cwd_only);
+      char **files = (optind < argc ? argv + optind : cwd_only);
+      ai = argv_iter_init_argv (files);
     }

+  if (!ai)
+    xalloc_die ();
+
   /* Initialize the hash structure for inode numbers.  */
   hash_init ();

-  /* Report and filter out any empty file names before invoking fts.
-     This works around a glitch in fts, which fails immediately
-     (without looking at the other file names) when given an empty
-     file name.  */
-  {
-    size_t i = 0;
-    size_t j;
+  bit_flags |= symlink_deref_bits;
+  static char *temp_argv[] = { NULL, NULL };

-    for (j = 0; ; j++)
-      {
-       if (i != j)
-         files[i] = files[j];
+  while (true)
+    {
+      bool skip_file = false;
+      enum argv_iter_err ai_err;
+      char *file_name = argv_iter (ai, &ai_err);
+      if (ai_err == AI_ERR_EOF)
+       break;
+      if (!file_name)
+       {
+         switch (ai_err)
+           {
+           case AI_ERR_READ:
+             error (0, errno, _("%s: read error"), quote (files_from));
+             skip_file = true;
+             continue;

-       if ( ! files[i])
-         break;
+           case AI_ERR_MEM:
+             xalloc_die ();

-       if (files_from && STREQ (files_from, "-") && STREQ (files[i], "-"))
-         {
-           /* Give a better diagnostic in an unusual case:
-              printf - | du --files0-from=- */
-           error (0, 0, _("when reading file names from stdin, "
-                          "no file name of %s allowed"),
-                  quote ("-"));
-           continue;
-         }
+           default:
+             assert (!"unexpected error code from argv_iter");
+           }
+       }
+      if (files_from && STREQ (files_from, "-") && STREQ (file_name, "-"))
+       {
+         /* Give a better diagnostic in an unusual case:
+            printf - | du --files0-from=- */
+         error (0, 0, _("when reading file names from stdin, "
+                        "no file name of %s allowed"),
+                quote (file_name));
+         skip_file = true;
+       }

-       if (files[i][0])
-         i++;
-       else
-         {
-           /* Diagnose a zero-length file name.  When it's one
-              among many, knowing the record number may help.  */
-           if (files_from)
-             {
-               /* Using the standard `filename:line-number:' prefix here is
-                  not totally appropriate, since NUL is the separator, not NL,
-                  but it might be better than nothing.  */
-               unsigned long int file_number = j + 1;
-               error (0, 0, "%s:%lu: %s", quotearg_colon (files_from),
-                      file_number, _("invalid zero-length file name"));
-             }
-           else
-             error (0, 0, "%s", _("invalid zero-length file name"));
-         }
-      }
+      /* Report and skip any empty file names before invoking fts.
+        This works around a glitch in fts, which fails immediately
+        (without looking at the other file names) when given an empty
+        file name.  */
+      if (!file_name[0])
+       {
+         /* Diagnose a zero-length file name.  When it's one
+            among many, knowing the record number may help.
+            FIXME: currently print the record number only with
+            --files0-from=FILE.  Maybe do it for argv, too?  */
+         if (files_from == NULL)
+           error (0, 0, "%s", _("invalid zero-length file name"));
+         else
+           {
+             /* Using the standard `filename:line-number:' prefix here is
+                not totally appropriate, since NUL is the separator, not NL,
+                but it might be better than nothing.  */
+             unsigned long int file_number = argv_iter_n_args (ai);
+             error (0, 0, "%s:%lu: %s", quotearg_colon (files_from),
+                    file_number, _("invalid zero-length file name"));
+           }
+         skip_file = true;
+       }
+
+      if (skip_file)
+       ok = false;
+      else
+       {
+         temp_argv[0] = file_name;
+         ok &= du_files (temp_argv, bit_flags);
+       }
+    }

-    ok = (i == j);
-  }
+  argv_iter_free (ai);

-  bit_flags |= symlink_deref_bits;
-  ok &= du_files (files, bit_flags);
+  if (files_from && (ferror (stdin) || fclose (stdin) != 0))
+    error (EXIT_FAILURE, 0, _("error reading %s"), quote (files_from));

-  /* This isn't really necessary, but it does ensure we
-     exercise this function.  */
-  if (files_from)
-    readtokens0_free (&tok);
+  if (print_grand_total)
+    print_size (&tot_dui, _("total"));

   hash_free (htab);

--
1.6.0.4.1101.g642f8


>From 622f12ed8fd799ad83546233a60655a6a0d2a4b6 Mon Sep 17 00:00:00 2001
From: Jim Meyering <address@hidden>
Date: Tue, 25 Nov 2008 18:38:26 +0100
Subject: [PATCH 2/2] wc: read and process --files0-from= input a name at a time,

when the file name list is not too large.  Before, wc would always
reading the entire file name list into RAM and *then* process each
file name.
* src/wc.c: Include "argv-iter.h".
(main): Rewrite to use argv-iter when the input file name list
is known to be too large.
* NEWS (Bug fixes): Mention it.
---
 NEWS     |    4 ++
 src/wc.c |  139 ++++++++++++++++++++++++++++++++++++++++++--------------------
 2 files changed, 99 insertions(+), 44 deletions(-)

diff --git a/NEWS b/NEWS
index 158b089..6a85dd7 100644
--- a/NEWS
+++ b/NEWS
@@ -20,6 +20,10 @@ GNU coreutils NEWS                                    -*- 
outline -*-
   Even with this patch, on some systems, it still produces invalid output,
   from 3 to at least 1026 lines long. [bug introduced in coreutils-6.11]

+  wc --files0-from=FILE no longer reads all of FILE into RAM, before
+  processing the first file name, unless the list of names is known
+  to be small enough.
+
 ** Changes in behavior

   cp and mv: the --reply={yes,no,query} option has been removed.
diff --git a/src/wc.c b/src/wc.c
index 8cfd974..25494d6 100644
--- a/src/wc.c
+++ b/src/wc.c
@@ -20,14 +20,17 @@
 #include <config.h>

 #include <stdio.h>
+#include <assert.h>
 #include <getopt.h>
 #include <sys/types.h>
 #include <wchar.h>
 #include <wctype.h>

 #include "system.h"
+#include "argv-iter.h"
 #include "error.h"
 #include "mbchar.h"
+#include "physmem.h"
 #include "quote.h"
 #include "quotearg.h"
 #include "readtokens0.h"
@@ -515,17 +518,19 @@ wc_file (char const *file, struct fstatus *fstatus)
 /* Return the file status for the NFILES files addressed by FILE.
    Optimize the case where only one number is printed, for just one
    file; in that case we can use a print width of 1, so we don't need
-   to stat the file.  */
+   to stat the file.  Handle the case of (nfiles == 0) in the same way;
+   that happens when we don't know how long the list of file names will be.  */

 static struct fstatus *
-get_input_fstatus (int nfiles, char * const *file)
+get_input_fstatus (int nfiles, char *const *file)
 {
-  struct fstatus *fstatus = xnmalloc (nfiles, sizeof *fstatus);
+  struct fstatus *fstatus = xnmalloc (nfiles ? nfiles : 1, sizeof *fstatus);

-  if (nfiles == 1
-      && ((print_lines + print_words + print_chars
-          + print_bytes + print_linelength)
-         == 1))
+  if (nfiles == 0
+      || (nfiles == 1
+         && ((print_lines + print_words + print_chars
+              + print_bytes + print_linelength)
+             == 1)))
     fstatus[0].failed = 1;
   else
     {
@@ -577,7 +582,6 @@ compute_number_width (int nfiles, struct fstatus const 
*fstatus)
 int
 main (int argc, char **argv)
 {
-  int i;
   bool ok;
   int optc;
   int nfiles;
@@ -637,6 +641,8 @@ main (int argc, char **argv)
         | print_linelength))
     print_lines = print_words = print_bytes = true;

+  bool read_tokens = false;
+  struct argv_iterator *ai;
   if (files_from)
     {
       FILE *stream;
@@ -661,69 +667,114 @@ main (int argc, char **argv)
                   quote (files_from));
        }

-      readtokens0_init (&tok);
-
-      if (! readtokens0 (stream, &tok) || fclose (stream) != 0)
-       error (EXIT_FAILURE, 0, _("cannot read file names from %s"),
-              quote (files_from));
-
-      files = tok.tok;
-      nfiles = tok.n_tok;
+      /* Read the file list into RAM if we can detect its size and that
+        size is reasonable.  Otherwise, we'll read a name at a time.  */
+      struct stat st;
+      if (fstat (fileno (stream), &st) == 0
+         && S_ISREG (st.st_mode)
+         && st.st_size <= MIN (10 * 1024 * 1024, physmem_available () / 2))
+       {
+         read_tokens = true;
+         readtokens0_init (&tok);
+         if (! readtokens0 (stream, &tok) || fclose (stream) != 0)
+           error (EXIT_FAILURE, 0, _("cannot read file names from %s"),
+                  quote (files_from));
+         files = tok.tok;
+         nfiles = tok.n_tok;
+         ai = argv_iter_init_argv (files);
+       }
+      else
+       {
+         files = NULL;
+         nfiles = 0;
+         ai = argv_iter_init_stream (stream);
+       }
     }
   else
     {
-      static char *stdin_only[2];
+      static char *stdin_only[] = { NULL };
       files = (optind < argc ? argv + optind : stdin_only);
       nfiles = (optind < argc ? argc - optind : 1);
-      stdin_only[0] = NULL;
+      ai = argv_iter_init_argv (files);
     }

   fstatus = get_input_fstatus (nfiles, files);
   number_width = compute_number_width (nfiles, fstatus);

+  int i;
   ok = true;
-  for (i = 0; i < nfiles; i++)
+  for (i = 0; /* */; i++)
     {
-      if (files[i])
+      bool skip_file = false;
+      enum argv_iter_err ai_err;
+      char *file_name = argv_iter (ai, &ai_err);
+      if (ai_err == AI_ERR_EOF)
+       break;
+      if (!file_name)
        {
-         if (files_from && STREQ (files_from, "-") && STREQ (files[i], "-"))
+         switch (ai_err)
            {
-             ok = false;
-             /* Give a better diagnostic in an unusual case:
-                printf - | wc --files0-from=- */
-             error (0, 0, _("when reading file names from stdin, "
-                            "no file name of %s allowed"),
-                    quote ("-"));
+           case AI_ERR_READ:
+             error (0, errno, _("%s: read error"), quote (files_from));
+             skip_file = true;
              continue;
+           case AI_ERR_MEM:
+             xalloc_die ();
+           default:
+             assert (!"unexpected error code from argv_iter");
            }
+       }
+      if (files_from && STREQ (files_from, "-") && STREQ (file_name, "-"))
+       {
+         /* Give a better diagnostic in an unusual case:
+            printf - | wc --files0-from=- */
+         error (0, 0, _("when reading file names from stdin, "
+                        "no file name of %s allowed"),
+                quote (file_name));
+         skip_file = true;
+       }

+      if (!file_name[0])
+       {
          /* Diagnose a zero-length file name.  When it's one
-            among many, knowing the record number may help.  */
-         if (files[i][0] == '\0')
+            among many, knowing the record number may help.
+            FIXME: currently print the record number only with
+            --files0-from=FILE.  Maybe do it for argv, too?  */
+         if (files_from == NULL)
+           error (0, 0, "%s", _("invalid zero-length file name"));
+         else
            {
-             ok = false;
-             if (files_from)
-               {
-                 /* Using the standard `filename:line-number:' prefix here is
-                    not totally appropriate, since NUL is the separator, not 
NL,
-                    but it might be better than nothing.  */
-                 unsigned long int file_number = i + 1;
-                 error (0, 0, "%s:%lu: %s", quotearg_colon (files_from),
-                        file_number, _("invalid zero-length file name"));
-               }
-             else
-               error (0, 0, "%s", _("invalid zero-length file name"));
-             continue;
+             /* Using the standard `filename:line-number:' prefix here is
+                not totally appropriate, since NUL is the separator, not NL,
+                but it might be better than nothing.  */
+             unsigned long int file_number = argv_iter_n_args (ai);
+             error (0, 0, "%s:%lu: %s", quotearg_colon (files_from),
+                    file_number, _("invalid zero-length file name"));
            }
+         skip_file = true;
        }

-      ok &= wc_file (files[i], &fstatus[i]);
+      if (skip_file)
+       ok = false;
+      else
+       ok &= wc_file (file_name, &fstatus[nfiles ? i : 0]);
     }

-  if (1 < nfiles)
+  /* No arguments on the command line is fine.  That means read from stdin.
+     However, no arguments on the --files0-from input stream is an error
+     means don't read anything.  */
+  if (ok && !files_from && argv_iter_n_args (ai) == 0)
+    ok &= wc_file (NULL, &fstatus[0]);
+
+  if (read_tokens)
+    readtokens0_free (&tok);
+
+  if (1 < argv_iter_n_args (ai))
     write_counts (total_lines, total_words, total_chars, total_bytes,
                  max_line_length, _("total"));

+  argv_iter_free (ai);
+
   free (fstatus);

   if (have_read_stdin && close (STDIN_FILENO) != 0)
--
1.6.0.4.1101.g642f8




reply via email to

[Prev in Thread] Current Thread [Next in Thread]