coreutils
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH 2/2] split: process more efficiently when filters exit early


From: Pádraig Brady
Subject: [PATCH 2/2] split: process more efficiently when filters exit early
Date: Sun, 19 Mar 2017 22:53:59 -0700

* src/split.c (bytes_split): Don't write to an existing filter
if it has exited.  When filters exit early, skip input data if
possible.  Refactor out 2 redundant variables.
* tests/split/filter.sh: Improve test coverage given the
new more efficient processing.  Also use a 10TB file to
expand the file systems tested on.
---
 src/split.c           | 37 ++++++++++++++++++++++---------------
 tests/split/filter.sh | 33 +++++++++++++++++++++++----------
 2 files changed, 45 insertions(+), 25 deletions(-)

diff --git a/src/split.c b/src/split.c
index 85bc052..01f97af 100644
--- a/src/split.c
+++ b/src/split.c
@@ -623,6 +623,7 @@ bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize, size_t initial_read,
 {
   size_t n_read;
   bool new_file_flag = true;
+  bool filter_ok = true;
   uintmax_t to_write = n_bytes;
   uintmax_t opened = 0;
   bool eof;
@@ -637,42 +638,48 @@ bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize, size_t initial_read,
         }
       else
         {
+          if (! filter_ok
+              && lseek (STDIN_FILENO, to_write, SEEK_CUR) != -1)
+            {
+              to_write = n_bytes;
+              new_file_flag = true;
+            }
+
           n_read = safe_read (STDIN_FILENO, buf, bufsize);
           if (n_read == SAFE_READ_ERROR)
             die (EXIT_FAILURE, errno, "%s", quotef (infile));
           eof = n_read == 0;
         }
       char *bp_out = buf;
-      size_t to_read = n_read;
-      while (to_write <= to_read)
+      while (to_write <= n_read)
         {
-          size_t w = to_write;
-          bool cwrite_ok = cwrite (new_file_flag, bp_out, w);
+          if (filter_ok || new_file_flag)
+            filter_ok = cwrite (new_file_flag, bp_out, to_write);
           opened += new_file_flag;
           new_file_flag = !max_files || (opened < max_files);
-          if (!new_file_flag && !cwrite_ok)
+          if (! filter_ok && ! new_file_flag)
             {
-              /* If filter no longer accepting input, stop reading.  */
-              n_read = to_read = 0;
+              /* If filters no longer accepting input, stop reading.  */
+              n_read = 0;
               eof = true;
               break;
             }
-          bp_out += w;
-          to_read -= w;
+          bp_out += to_write;
+          n_read -= to_write;
           to_write = n_bytes;
         }
-      if (to_read != 0)
+      if (n_read != 0)
         {
-          bool cwrite_ok = cwrite (new_file_flag, bp_out, to_read);
+          if (filter_ok || new_file_flag)
+            filter_ok = cwrite (new_file_flag, bp_out, n_read);
           opened += new_file_flag;
-          to_write -= to_read;
           new_file_flag = false;
-          if (!cwrite_ok && opened == max_files)
+          if (! filter_ok && opened == max_files)
             {
-              /* If filter no longer accepting input, stop reading.  */
-              n_read = 0;
+              /* If filters no longer accepting input, stop reading.  */
               break;
             }
+          to_write -= n_read;
         }
     }
   while (! eof);
diff --git a/tests/split/filter.sh b/tests/split/filter.sh
index a85093c..a703b3b 100755
--- a/tests/split/filter.sh
+++ b/tests/split/filter.sh
@@ -18,8 +18,7 @@
 
 . "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
 print_ver_ split
-require_sparse_support_ # for 'truncate --size=$OFF_T_MAX'
-eval $(getlimits) # for OFF_T limits
+require_sparse_support_ # for 'truncate --size=$LARGE'
 xz --version || skip_ "xz (better than gzip/bzip2) required"
 
 for total_n_lines in 5 3000 20000; do
@@ -52,15 +51,29 @@ returns_ 1 split -n 1/2 --filter='true' /dev/null 2>&1 || fail=1
 # where they would result in a non zero exit from split.
 yes | head -n200K | split -b1G --filter='head -c1 >/dev/null' || fail=1
 
-# Do not use a size of OFF_T_MAX, since split.c applies a GNU/Hurd
-# /dev/zero workaround for files of that size.  Use one less:
-N=$(expr $OFF_T_MAX - 1)
-
 # Ensure that "endless" input is ignored when all filters finish
-timeout 10 sh -c 'yes | split --filter="head -c1 >/dev/null" -n r/1' || fail=1
-if truncate -s$N zero.in; then
+  timeout 10 sh -c 'split --filter="head -c1 >/dev/null" -n 1 zero.in' || fail=1
-fi
+for mode in '' 'r/'; do
+  FILE='-'
+  if test "$mode" = ''; then
+    FILE='zero.in'
+    truncate -s10T "$FILE" || continue
+  fi
+  for N in 1 2; do
+    rm -f x??.n || framework_failure_
+    timeout 10 sh -c \
+      "yes | split --filter='head -c1 >\$FILE.n' -n $mode$N $FILE" || fail=1
+    # Also ensure we get appropriate output from each filter
+    seq 1 $N | tr '0-9' 1 > stat.exp
+    stat -c%s x??.n > stat.out || framework_failure_
+    compare stat.exp stat.out || fail=1
+  done
+done
+
+# Ensure that "endless" input _is_ processed for unbounded number of filters
+for buf in 1000 1000000; do
+  returns_ 124 timeout .5 sh -c \
+    "yes | split --filter='head -c1 >/dev/null' -b $buf" || fail=1
+done
 
 # Ensure that "endless" input _is_ processed for unbounded number of filters
 for buf in 1000 1000000; do
-- 
2.9.3




reply via email to

[Prev in Thread] Current Thread [Next in Thread]