>From f3548e6fd20755629c33e7e0d87b1c455677aedd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1draig=20Brady?= Date: Sat, 20 Apr 2013 08:46:43 +0100 Subject: [PATCH] split: change -C to only allocate memory when required * src/split.c (line_bytes_split): Rewrite to only buffer when necessary. I.E. only increase the buffer when we've already lines output in a split and we encounter a line larger than the input buffer size, in which case a hold buffer will be increased in increments of the input buffer size. * tests/split/line-bytes.sh: Add a new test for this function which previously had no test coverage. * tests/local.mk: Reference the new test. * NEWS: Mention the improvement. Fixes http://bugs.gnu.org/13537 --- NEWS | 3 + src/split.c | 137 +++++++++++++++++++++++++++++++-------------- tests/local.mk | 1 + tests/split/line-bytes.sh | 85 ++++++++++++++++++++++++++++ 4 files changed, 184 insertions(+), 42 deletions(-) create mode 100755 tests/split/line-bytes.sh diff --git a/NEWS b/NEWS index fc05b8a..e9a8efa 100644 --- a/NEWS +++ b/NEWS @@ -41,6 +41,9 @@ GNU coreutils NEWS -*- outline -*- Reservoir sampling is used to limit memory usage based on the number of outputs, rather than the number of inputs. + split --line-bytes=SIZE, now only allocates memory as needed rather + than allocating SIZE bytes at program start. + ** Build-related factor now builds on aarch64 based systems [bug introduced in coreutils-8.20] diff --git a/src/split.c b/src/split.c index e5e75f0..e6a7ce7 100644 --- a/src/split.c +++ b/src/split.c @@ -616,62 +616,115 @@ lines_split (uintmax_t n_lines, char *buf, size_t bufsize) } while (n_read == bufsize); } - + /* Split into pieces that are as large as possible while still not more than N_BYTES bytes, and are split on line boundaries except - where lines longer than N_BYTES bytes occur. - FIXME: Allow N_BYTES to be any uintmax_t value, and don't require a - buffer of size N_BYTES, in case N_BYTES is very large. */ + where lines longer than N_BYTES bytes occur. */ static void -line_bytes_split (size_t n_bytes) +line_bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize) { - char *bp; - bool eof = false; - size_t n_buffered = 0; - char *buf = xmalloc (n_bytes); + size_t n_read; + uintmax_t n_out = 0; /* for each split. */ + size_t n_hold = 0; + char *hold = NULL; /* for lines > bufsize. */ + size_t hold_size = 0; + bool split_line = false; /* Whether a \n was output in a split. */ do { - /* Fill up the full buffer size from the input file. */ - - size_t to_read = n_bytes - n_buffered; - size_t n_read = full_read (STDIN_FILENO, buf + n_buffered, to_read); - if (n_read < to_read && errno) + n_read = full_read (STDIN_FILENO, buf, bufsize); + if (n_read < bufsize && errno) error (EXIT_FAILURE, errno, "%s", infile); - - n_buffered += n_read; - if (n_buffered != n_bytes) + size_t n_left = n_read; + char *sob = buf; + while (n_left) { - if (n_buffered == 0) - break; - eof = true; - } + size_t split_rest = 0; + char *eoc = NULL; + char *eol; - /* Find where to end this chunk. */ - bp = buf + n_buffered; - if (n_buffered == n_bytes) - { - while (bp > buf && bp[-1] != '\n') - bp--; - } + /* Determine End Of Chunk and/or End of Line, + which are used below to select what to write or buffer. */ + if (n_bytes - n_out - n_hold <= n_left) + { + /* Have enough for split. */ + split_rest = n_bytes - n_out - n_hold; + eoc = sob + split_rest - 1; + eol = memrchr (sob, '\n', split_rest); + } + else + eol = memrchr (sob, '\n', n_left); + + /* Output hold space if possible. */ + if (n_hold && !(!eol && n_out)) + { + cwrite (n_out == 0, hold, n_hold); + n_out += n_hold; + if (n_hold > bufsize) + hold = xrealloc (hold, bufsize); + n_hold = 0; + hold_size = bufsize; + } + + /* Output to eol if present. */ + if (eol) + { + split_line = true; + size_t n_write = eol - sob + 1; + cwrite (n_out == 0, sob, n_write); + n_out += n_write; + n_left -= n_write; + sob += n_write; + if (eoc) + split_rest -= n_write; + } - /* If chunk has no newlines, use all the chunk. */ - if (bp == buf) - bp = buf + n_buffered; + /* Output to eoc or eob if possible. */ + if (n_left && !split_line) + { + size_t n_write = eoc ? split_rest : n_left; + cwrite (n_out == 0, sob, n_write); + n_out += n_write; + n_left -= n_write; + sob += n_write; + if (eoc) + split_rest -= n_write; + } - /* Output the chars as one output file. */ - cwrite (true, buf, bp - buf); + /* Update hold if needed. */ + if ((eoc && split_rest) || (!eoc && n_left)) + { + size_t n_buf = eoc ? split_rest : n_left; + if (hold_size - n_hold < n_buf) + { + if (hold_size <= SIZE_MAX - bufsize) + hold_size += bufsize; + else + error (EXIT_FAILURE, 0, "%s", _("memory exhausted")); + hold = xrealloc (hold, hold_size); + } + memcpy (hold + n_hold, sob, n_buf); + n_hold += n_buf; + n_left -= n_buf; + sob += n_buf; + } - /* Discard the chars we just output; move rest of chunk - down to be the start of the next chunk. Source and - destination probably overlap. */ - n_buffered -= bp - buf; - if (n_buffered > 0) - memmove (buf, bp, n_buffered); + /* Reset for new split. */ + if (eoc) + { + n_out = 0; + split_line = false; + } + } } - while (!eof); - free (buf); + while (n_read == bufsize); + + /* Handle no eol at end of file. */ + if (n_hold) + cwrite (n_out == 0, hold, n_hold); + + free (hold); } /* -n l/[K/]N: Write lines to files of approximately file size / N. @@ -1408,7 +1461,7 @@ main (int argc, char **argv) break; case type_byteslines: - line_bytes_split (n_units); + line_bytes_split (n_units, buf, in_blk_size); break; case type_chunk_bytes: diff --git a/tests/local.mk b/tests/local.mk index f47da8d..b255464 100644 --- a/tests/local.mk +++ b/tests/local.mk @@ -345,6 +345,7 @@ all_tests = \ tests/split/b-chunk.sh \ tests/split/fail.sh \ tests/split/lines.sh \ + tests/split/line-bytes.sh \ tests/split/l-chunk.sh \ tests/split/r-chunk.sh \ tests/split/numeric.sh \ diff --git a/tests/split/line-bytes.sh b/tests/split/line-bytes.sh new file mode 100755 index 0000000..ed2baac --- /dev/null +++ b/tests/split/line-bytes.sh @@ -0,0 +1,85 @@ +#!/bin/sh +# test -C, --lines-bytes + +# Copyright (C) 2013 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src +print_ver_ split + + +# Ensure memory is not allocated up front +(ulimit -v 20000; split -C 'E' /dev/null || fail=1) + + +# Ensure correct operation with various split and buffer size combinations + +lines=\ +1~2222~3~4 + +printf '%s' "$lines" | tr '~' '\n' > in || framework_failure_ + +cat <<\EOF > splits_exp +1 1 1 1 1 1 1 1 1 1 +2 2 2 1 2 1 +2 3 2 2 1 +2 4 3 1 +2 5 3 +2 5 3 +7 3 +7 3 +9 1 +9 1 +10 +EOF + +seq 0 9 | tr -d '\n' > no_eol_in + +cat <<\EOF > no_eol_splits_exp +1 1 1 1 1 1 1 1 1 1 +2 2 2 2 2 +3 3 3 1 +4 4 2 +5 5 +6 4 +7 3 +8 2 +9 1 +10 +10 +EOF + +for b in $(seq 10); do + : > splits + : > no_eol_splits + for s in $(seq 11); do + rm x?? + split ---io=$b -C$s in || fail=1 + cat x* > out || framework_failure_ + compare in out || fail=1 + stat -c %s x* | paste -s -d ' ' >> splits + + rm x?? + split ---io=$b -C$s no_eol_in || fail=1 + cat x* > out || framework_failure_ + cat xaa + compare no_eol_in out || fail=1 + stat -c %s x* | paste -s -d ' ' >> no_eol_splits + done + compare splits_exp splits || fail=1 + compare no_eol_splits_exp no_eol_splits || fail=1 +done + +Exit $fail -- 1.7.7.6