>From 0e48615b9bee36ea29130ba8396c1646be3f76d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1draig=20Brady?= Date: Sun, 24 Mar 2013 12:27:46 +0000 Subject: [PATCH 1/9] test: adjust new shuf-reservoir.sh test style: Remove redundant line continuation chars style: put 'for ...; do' on same line style: "non interpolated stuff" -> 'non interpolated stuff' style: remove redundant fail=0 (caught by syntax check) efficiency: Don't use cat file | ... accuracy: count lines twice lest we ignore erroneous output Also I marked this as expensive_ rather than very_expensive_ --- tests/misc/shuf-reservoir.sh | 33 ++++++++++++++------------------- 1 files changed, 14 insertions(+), 19 deletions(-) diff --git a/tests/misc/shuf-reservoir.sh b/tests/misc/shuf-reservoir.sh index 60f5a74..78bfd62 100755 --- a/tests/misc/shuf-reservoir.sh +++ b/tests/misc/shuf-reservoir.sh @@ -21,7 +21,7 @@ . "${srcdir=.}/tests/init.sh"; path_prepend_ ./src print_ver_ shuf -very_expensive_ +expensive_ require_valgrind_ getlimits_ @@ -34,41 +34,36 @@ run_shuf_n() # Critical memory-related bugs will cause a segfault here # (with varying numbres of input/output lines) - seq "$INPUT_LINES" | \ - valgrind --leak-check=full --error-exitcode=1 shuf -n "$OUTPUT_LINES" \ - -o "out_${INPUT_LINES}_${OUTPUT_LINES}" || return 1 + seq "$INPUT_LINES" | valgrind --leak-check=full --error-exitcode=1 \ + shuf -n "$OUTPUT_LINES" -o "out_${INPUT_LINES}_${OUTPUT_LINES}" || return 1 EXPECTED_LINES="$OUTPUT_LINES" test "$INPUT_LINES" -lt "$OUTPUT_LINES" && EXPECTED_LINES="$INPUT_LINES" # There is no sure way to verify shuffled output (as it is random). - # Ensure we got enough lines, they are all numeric, non-empty, - # and not duplicated (all would indicate a bug). - LINES=$(cat "out_${INPUT_LINES}_${OUTPUT_LINES}" | \ - grep "^[0-9][0-9]*$" | \ - LC_ALL=C sort -n | \ - uniq | wc -l) || framework_failure_ + # Ensure we have the correct number of all numeric lines non duplicated lines. + GOOD_LINES=$(grep '^[0-9][0-9]*$' "out_${INPUT_LINES}_${OUTPUT_LINES}" | + sort -un | wc -l) || framework_failure_ + LINES=$(wc -l < "out_${INPUT_LINES}_${OUTPUT_LINES}") || framework_failure_ + test "$EXPECTED_LINES" -eq "$GOOD_LINES" || return 1 test "$EXPECTED_LINES" -eq "$LINES" || return 1 return 0 } -fail=0 - # Test multiple combinations of input lines and output lines. # (e.g. small number of input lines and large number of output lines, # and vice-versa. Also, each reservoir allocation uses a 1000-lines batch, # so test 999/1000/1001 and related values). TEST_LINES="0 1 5 999 1000 1001 2999 3000 3001" -for IN_N in $TEST_LINES -do - for OUT_N in $TEST_LINES - do - run_shuf_n "$IN_N" "$OUT_N" || - { fail=1 ; echo "shuf-reservoir-sampling failed with "\ - "IN_N=$IN_N OUT_N=$OUT_N" 1>&2 ; } +for IN_N in $TEST_LINES; do + for OUT_N in $TEST_LINES; do + run_shuf_n "$IN_N" "$OUT_N" || { + fail=1 + echo "shuf-reservoir-sampling failed with IN_N=$IN_N OUT_N=$OUT_N" >&2; + } done done -- 1.7.7.6 >From 7e3503b9b027ca767256a1b4c6ea04cb0f35728d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1draig=20Brady?= Date: Sun, 24 Mar 2013 12:50:42 +0000 Subject: [PATCH 2/9] shuf: adjust resevoir batch size to 1024 Since this is a memory management batch size 1024 is a more natural boundary than 1000 --- src/shuf.c | 2 +- tests/misc/shuf-reservoir.sh | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/shuf.c b/src/shuf.c index 91b475b..762d2fc 100644 --- a/src/shuf.c +++ b/src/shuf.c @@ -40,7 +40,7 @@ #define AUTHORS proper_name ("Paul Eggert") /* For reservoir-sampling, allocate the resevoir lines in batches */ -enum { RESERVOIR_LINES_INCREMENT = 1000 }; +enum { RESERVOIR_LINES_INCREMENT = 1024 }; void diff --git a/tests/misc/shuf-reservoir.sh b/tests/misc/shuf-reservoir.sh index 78bfd62..b695afc 100755 --- a/tests/misc/shuf-reservoir.sh +++ b/tests/misc/shuf-reservoir.sh @@ -54,9 +54,9 @@ run_shuf_n() # Test multiple combinations of input lines and output lines. # (e.g. small number of input lines and large number of output lines, -# and vice-versa. Also, each reservoir allocation uses a 1000-lines batch, -# so test 999/1000/1001 and related values). -TEST_LINES="0 1 5 999 1000 1001 2999 3000 3001" +# and vice-versa. Also, each reservoir allocation uses a 1024-lines batch, +# so test 1023/1024/1025 and related values). +TEST_LINES="0 1 5 1023 1024 1025 3071 3072 3073" for IN_N in $TEST_LINES; do for OUT_N in $TEST_LINES; do -- 1.7.7.6 >From 67847b6ec6ffa5225e3ab667e05355f0eec0dd4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1draig=20Brady?= Date: Sun, 24 Mar 2013 13:13:27 +0000 Subject: [PATCH 3/9] maint: adjust whitespace for new shuf resevoir sampling Run through indent(1) with adjustments --- src/shuf.c | 54 +++++++++++++++++++++++++++--------------------------- 1 files changed, 27 insertions(+), 27 deletions(-) diff --git a/src/shuf.c b/src/shuf.c index 762d2fc..820b5e4 100644 --- a/src/shuf.c +++ b/src/shuf.c @@ -39,7 +39,7 @@ #define AUTHORS proper_name ("Paul Eggert") -/* For reservoir-sampling, allocate the resevoir lines in batches */ +/* For reservoir-sampling, allocate the reservoir lines in batches */ enum { RESERVOIR_LINES_INCREMENT = 1024 }; @@ -145,23 +145,24 @@ read_input_reservoir_sampling (FILE *in, char eolbyte, size_t k, struct randint_source *s, struct linebuffer **out_rsrv) { - size_t n_lines=0; - size_t n_alloc_lines = MIN (k,RESERVOIR_LINES_INCREMENT); - struct linebuffer *line = NULL ; - struct linebuffer *rsrv ; + size_t n_lines = 0; + size_t n_alloc_lines = MIN (k, RESERVOIR_LINES_INCREMENT); + struct linebuffer *line = NULL; + struct linebuffer *rsrv; rsrv = xzalloc (n_alloc_lines * sizeof (struct linebuffer)); /* Fill the first K lines, directly into the reservoir */ - while ( n_lines < k - && (line=readlinebuffer_delim (&rsrv[n_lines], in, eolbyte))!=NULL ) + while (n_lines < k + && (line = + readlinebuffer_delim (&rsrv[n_lines], in, eolbyte)) != NULL) { ++n_lines; /* Enlarge reservoir */ if (n_lines >= n_alloc_lines) { - n_alloc_lines += RESERVOIR_LINES_INCREMENT ; + n_alloc_lines += RESERVOIR_LINES_INCREMENT; rsrv = xrealloc (rsrv, n_alloc_lines * sizeof (struct linebuffer)); memset (&rsrv[n_lines], 0, RESERVOIR_LINES_INCREMENT * sizeof (struct linebuffer)); @@ -173,7 +174,7 @@ read_input_reservoir_sampling (FILE *in, char eolbyte, size_t k, if (line != NULL) { struct linebuffer dummy; - initbuffer (&dummy); /* space for lines that won't be put in reservoir */ + initbuffer (&dummy); /* space for lines that won't be put in reservoir */ /* Choose the fate of the next line, with decreasing probability (as n_lines increases in size). @@ -184,16 +185,16 @@ read_input_reservoir_sampling (FILE *in, char eolbyte, size_t k, With 'struct linebuffer', storing into existing buffer will reduce re-allocations (will only re-allocate if the new line is longer than the currently allocated space. */ - randint j = randint_choose (s, n_lines+1); - line = ( j < k ) ? (&rsrv[j]) : (&dummy) ; + randint j = randint_choose (s, n_lines + 1); + line = (j < k) ? (&rsrv[j]) : (&dummy); - while ( readlinebuffer_delim (line, in, eolbyte)!=NULL ) + while (readlinebuffer_delim (line, in, eolbyte) != NULL) { n_lines++; /* Choose the fate of the next line, see comment above */ - j = randint_choose (s, n_lines+1); - line = ( j < k ) ? (&rsrv[j]) : (&dummy) ; + j = randint_choose (s, n_lines + 1); + line = (j < k) ? (&rsrv[j]) : (&dummy); } freebuffer (&dummy); @@ -209,17 +210,16 @@ read_input_reservoir_sampling (FILE *in, char eolbyte, size_t k, static int write_permuted_output_reservoir (size_t n_lines, struct linebuffer *lines, - size_t const *permutation) + size_t const *permutation) { size_t i; for (i = 0; i < n_lines; i++) - { - const struct linebuffer *p = &lines[permutation[i]]; - if (fwrite (p->buffer, sizeof (char), - p->length, stdout) != p->length) - return -1; - } + { + const struct linebuffer *p = &lines[permutation[i]]; + if (fwrite (p->buffer, sizeof (char), p->length, stdout) != p->length) + return -1; + } return 0; } @@ -263,7 +263,7 @@ read_input (FILE *in, char eolbyte, char ***pline) } static int -write_permuted_output (size_t n_lines, char * const *line, size_t lo_input, +write_permuted_output (size_t n_lines, char *const *line, size_t lo_input, size_t const *permutation, char eolbyte) { size_t i; @@ -271,7 +271,7 @@ write_permuted_output (size_t n_lines, char * const *line, size_t lo_input, if (line) for (i = 0; i < n_lines; i++) { - char * const *p = line + permutation[i]; + char *const *p = line + permutation[i]; size_t len = p[1] - p[0]; if (fwrite (p[0], sizeof *p[0], len, stdout) != len) return -1; @@ -436,7 +436,7 @@ main (int argc, char **argv) if (head_lines != SIZE_MAX) { use_reservoir_sampling = true; - n_lines = SIZE_MAX; /* unknown number of input lines, for now */ + n_lines = SIZE_MAX; /* unknown number of input lines, for now */ } else { @@ -456,9 +456,9 @@ main (int argc, char **argv) { /* Instead of reading the entire file into 'line', use reservoir-sampling to store just "head_lines" random lines. */ - head_lines = n_lines = read_input_reservoir_sampling (stdin, eolbyte, - head_lines, randint_source, - &reservoir); + n_lines = read_input_reservoir_sampling (stdin, eolbyte, head_lines, + randint_source, &reservoir); + head_lines = n_lines; } /* Close stdin now, rather than earlier, so that randint_all_new -- 1.7.7.6 >From df41aaf2c2e332aaebe634d7af2f5acebc0d23c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1draig=20Brady?= Date: Sun, 24 Mar 2013 15:12:18 +0000 Subject: [PATCH 4/9] maint: simplify shuf reservoir sampling loop using do..while Remove the redundant duplication outside the loop --- src/shuf.c | 11 +++-------- 1 files changed, 3 insertions(+), 8 deletions(-) diff --git a/src/shuf.c b/src/shuf.c index 820b5e4..41dad22 100644 --- a/src/shuf.c +++ b/src/shuf.c @@ -185,17 +185,12 @@ read_input_reservoir_sampling (FILE *in, char eolbyte, size_t k, With 'struct linebuffer', storing into existing buffer will reduce re-allocations (will only re-allocate if the new line is longer than the currently allocated space. */ - randint j = randint_choose (s, n_lines + 1); - line = (j < k) ? (&rsrv[j]) : (&dummy); - - while (readlinebuffer_delim (line, in, eolbyte) != NULL) + do { - n_lines++; - - /* Choose the fate of the next line, see comment above */ - j = randint_choose (s, n_lines + 1); + randint j = randint_choose (s, n_lines + 1); /* 0 .. n_lines. */ line = (j < k) ? (&rsrv[j]) : (&dummy); } + while (readlinebuffer_delim (line, in, eolbyte) != NULL && n_lines++); freebuffer (&dummy); } -- 1.7.7.6 >From 8694e2cc5670872b9c319dc3332501a3f6154f36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1draig=20Brady?= Date: Sun, 24 Mar 2013 15:22:58 +0000 Subject: [PATCH 5/9] maint: augment/adjust shuf resevoir sampling comments Mostly formtting, but added a short description for the read_input_reservoir_sampling() function. --- src/shuf.c | 24 +++++++++++++----------- 1 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/shuf.c b/src/shuf.c index 41dad22..bf6136c 100644 --- a/src/shuf.c +++ b/src/shuf.c @@ -39,7 +39,7 @@ #define AUTHORS proper_name ("Paul Eggert") -/* For reservoir-sampling, allocate the reservoir lines in batches */ +/* For reservoir-sampling, allocate the reservoir lines in batches. */ enum { RESERVOIR_LINES_INCREMENT = 1024 }; @@ -140,6 +140,9 @@ next_line (char *line, char eolbyte, size_t n) return p + 1; } +/* Read all lines and store up to K permuted lines in *OUT_RSRV. + Return the number of lines read, up to a maximum of K. */ + static size_t read_input_reservoir_sampling (FILE *in, char eolbyte, size_t k, struct randint_source *s, @@ -152,14 +155,14 @@ read_input_reservoir_sampling (FILE *in, char eolbyte, size_t k, rsrv = xzalloc (n_alloc_lines * sizeof (struct linebuffer)); - /* Fill the first K lines, directly into the reservoir */ + /* Fill the first K lines, directly into the reservoir. */ while (n_lines < k && (line = readlinebuffer_delim (&rsrv[n_lines], in, eolbyte)) != NULL) { - ++n_lines; + n_lines++; - /* Enlarge reservoir */ + /* Enlarge reservoir. */ if (n_lines >= n_alloc_lines) { n_alloc_lines += RESERVOIR_LINES_INCREMENT; @@ -169,12 +172,11 @@ read_input_reservoir_sampling (FILE *in, char eolbyte, size_t k, } } - - /* last line wasn't NULL - so there are more lines to read */ + /* last line wasn't NULL - so there are more lines to read. */ if (line != NULL) { struct linebuffer dummy; - initbuffer (&dummy); /* space for lines that won't be put in reservoir */ + initbuffer (&dummy); /* space for lines not put in reservoir. */ /* Choose the fate of the next line, with decreasing probability (as n_lines increases in size). @@ -184,7 +186,7 @@ read_input_reservoir_sampling (FILE *in, char eolbyte, size_t k, With 'struct linebuffer', storing into existing buffer will reduce re-allocations (will only re-allocate if the new line is longer than - the currently allocated space. */ + the currently allocated space). */ do { randint j = randint_choose (s, n_lines + 1); /* 0 .. n_lines. */ @@ -195,7 +197,7 @@ read_input_reservoir_sampling (FILE *in, char eolbyte, size_t k, freebuffer (&dummy); } - /* no more input lines, or an input error */ + /* no more input lines, or an input error. */ if (ferror (in)) error (EXIT_FAILURE, errno, _("read error")); @@ -431,7 +433,7 @@ main (int argc, char **argv) if (head_lines != SIZE_MAX) { use_reservoir_sampling = true; - n_lines = SIZE_MAX; /* unknown number of input lines, for now */ + n_lines = SIZE_MAX; /* unknown number of input lines, for now. */ } else { @@ -450,7 +452,7 @@ main (int argc, char **argv) if (use_reservoir_sampling) { /* Instead of reading the entire file into 'line', - use reservoir-sampling to store just "head_lines" random lines. */ + use reservoir-sampling to store just "head_lines" random lines. */ n_lines = read_input_reservoir_sampling (stdin, eolbyte, head_lines, randint_source, &reservoir); head_lines = n_lines; -- 1.7.7.6 >From e20d930c47ffc35541e83551e1d3fc90393d1f94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1draig=20Brady?= Date: Sun, 24 Mar 2013 15:57:04 +0000 Subject: [PATCH 6/9] shuf: fix overflow handling in reservoir sampling Handle multiplicative overflow by using xcalloc and xnrealloc, rather than xzalloc and xrealloc respectively. Notice where we overflow n_lines and diagnose appropriately. Fix the type of n_lines to randint which is more correct. If randint is smaller than size_t, then overflow is correctly detected. In the more likely case where randint is larger than size_t, like on 32 bit platforms for example, this will support up to uintmax_t input lines rather than being restricted to size_t. --- src/shuf.c | 9 ++++++--- 1 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/shuf.c b/src/shuf.c index bf6136c..a62526f 100644 --- a/src/shuf.c +++ b/src/shuf.c @@ -148,12 +148,12 @@ read_input_reservoir_sampling (FILE *in, char eolbyte, size_t k, struct randint_source *s, struct linebuffer **out_rsrv) { - size_t n_lines = 0; + randint n_lines = 0; size_t n_alloc_lines = MIN (k, RESERVOIR_LINES_INCREMENT); struct linebuffer *line = NULL; struct linebuffer *rsrv; - rsrv = xzalloc (n_alloc_lines * sizeof (struct linebuffer)); + rsrv = xcalloc (n_alloc_lines, sizeof (struct linebuffer)); /* Fill the first K lines, directly into the reservoir. */ while (n_lines < k @@ -166,7 +166,7 @@ read_input_reservoir_sampling (FILE *in, char eolbyte, size_t k, if (n_lines >= n_alloc_lines) { n_alloc_lines += RESERVOIR_LINES_INCREMENT; - rsrv = xrealloc (rsrv, n_alloc_lines * sizeof (struct linebuffer)); + rsrv = xnrealloc (rsrv, n_alloc_lines, sizeof (struct linebuffer)); memset (&rsrv[n_lines], 0, RESERVOIR_LINES_INCREMENT * sizeof (struct linebuffer)); } @@ -194,6 +194,9 @@ read_input_reservoir_sampling (FILE *in, char eolbyte, size_t k, } while (readlinebuffer_delim (line, in, eolbyte) != NULL && n_lines++); + if (! n_lines) + error (EXIT_FAILURE, EOVERFLOW, _("too many input lines")); + freebuffer (&dummy); } -- 1.7.7.6 >From d3d09e6fb3cca2277709f028e0cdfffc91b263e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1draig=20Brady?= Date: Mon, 25 Mar 2013 00:02:01 +0000 Subject: [PATCH 7/9] shuf: when reservoir sampling, read more random data upfront Since we call randint_choose() per input line, read as much random data as possible. --- src/shuf.c | 1 + 1 files changed, 1 insertions(+), 0 deletions(-) diff --git a/src/shuf.c b/src/shuf.c index a62526f..ab47a09 100644 --- a/src/shuf.c +++ b/src/shuf.c @@ -448,6 +448,7 @@ main (int argc, char **argv) head_lines = MIN (head_lines, n_lines); randint_source = randint_all_new (random_source, + use_reservoir_sampling ? SIZE_MAX : randperm_bound (head_lines, n_lines)); if (! randint_source) error (EXIT_FAILURE, errno, "%s", quotearg_colon (random_source)); -- 1.7.7.6 >From 737294f20b5dd9cab9e7732d2fc7e863d40a1fc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1draig=20Brady?= Date: Mon, 25 Mar 2013 01:40:55 +0000 Subject: [PATCH 8/9] doc: add a NEWS entry for shuf reservoir sampling Meintion the memory usage benefits. --- NEWS | 4 ++++ 1 files changed, 4 insertions(+), 0 deletions(-) diff --git a/NEWS b/NEWS index 483669d..0c2daad 100644 --- a/NEWS +++ b/NEWS @@ -25,6 +25,10 @@ GNU coreutils NEWS -*- outline -*- inotify for files on those file systems, rather than the default (for unknown file system types) of issuing a warning and reverting to polling. + shuf outputs subsets of large inputs much more efficiently. + Reservoir sampling is used to limit memory usage based on the number of + outputs, rather than the number of inputs. + ** Build-related factor now builds on aarch64 based systems [bug introduced in coreutils-8.20] -- 1.7.7.6 >From 43f41d90ff8b6c87e86ba8b532c10c794d933948 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1draig=20Brady?= Date: Mon, 25 Mar 2013 03:16:36 +0000 Subject: [PATCH 9/9] shuf: only enable reservoir sampling for large or unknown inputs Reservoir sampling was seen to have up to double the CPU overhead when used with smaller inputs. So don't enable if for input files less than 8MiB. --- src/shuf.c | 34 +++++++++++++++++++++++++++++++++- 1 files changed, 33 insertions(+), 1 deletions(-) diff --git a/src/shuf.c b/src/shuf.c index ab47a09..50891c0 100644 --- a/src/shuf.c +++ b/src/shuf.c @@ -42,6 +42,14 @@ /* For reservoir-sampling, allocate the reservoir lines in batches. */ enum { RESERVOIR_LINES_INCREMENT = 1024 }; +/* reservoir-sampling introduces CPU overhead for small inputs. + So only enable it for inputs >= this limit. + This limit was determined using these commands: + $ for p in $(seq 7); do src/seq $((10**$p)) > 10p$p.in; done + $ for p in $(seq 7); do time shuf-nores -n10 10p$p.in >/dev/null; done + $ for p in $(seq 7); do time shuf -n10 10p$p.in >/dev/null; done .*/ +enum { RESERVOIR_MIN_INPUT = 8192 * 1024 }; + void usage (int status) @@ -140,6 +148,30 @@ next_line (char *line, char eolbyte, size_t n) return p + 1; } +/* Return the size of the input if possible or OFF_T_MAX if not. */ + +static off_t +input_size (void) +{ + off_t file_size; + + struct stat stat_buf; + if (fstat (STDIN_FILENO, &stat_buf) != 0) + return OFF_T_MAX; + if (usable_st_size (&stat_buf)) + file_size = stat_buf.st_size; + else + return OFF_T_MAX; + + off_t input_offset = lseek (STDIN_FILENO, 0, SEEK_CUR); + if (input_offset < 0) + return OFF_T_MAX; + + file_size -= input_offset; + + return file_size; +} + /* Read all lines and store up to K permuted lines in *OUT_RSRV. Return the number of lines read, up to a maximum of K. */ @@ -433,7 +465,7 @@ main (int argc, char **argv) fadvise (stdin, FADVISE_SEQUENTIAL); - if (head_lines != SIZE_MAX) + if (head_lines != SIZE_MAX && input_size() > RESERVOIR_MIN_INPUT) { use_reservoir_sampling = true; n_lines = SIZE_MAX; /* unknown number of input lines, for now. */ -- 1.7.7.6