From 1adfd08cd3a52c373932b0f1039755a240d2c0b8 Mon Sep 17 00:00:00 2001
From: Assaf Gordon <assafgordon@gmail.com>
Date: Thu, 7 Mar 2013 01:57:57 -0500
Subject: [PATCH 1/2] shuf: add (expensive) test for randomness
To run manually:
make check TESTS=tests/misc/shuf-randomness.sh \
SUBDIRS=. RUN_VERY_EXPENSIVE_TESTS=yes
* tests/misc/shuf-randomness.sh: run 'shuf' repeatedly, and check if the
output is uniformly distributed "enough".
* tests/local.mk: add new test script.
---
tests/local.mk | 1 +
tests/misc/shuf-randomness.sh | 186 +++++++++++++++++++++++++++++++++++++++++
2 files changed, 187 insertions(+), 0 deletions(-)
create mode 100755 tests/misc/shuf-randomness.sh
diff --git a/tests/local.mk b/tests/local.mk
index 607ddc4..d3923f8 100644
--- a/tests/local.mk
+++ b/tests/local.mk
@@ -313,6 +313,7 @@ all_tests = \
tests/misc/shred-passes.sh \
tests/misc/shred-remove.sh \
tests/misc/shuf.sh \
+ tests/misc/shuf-randomness.sh \
tests/misc/sort.pl \
tests/misc/sort-benchmark-random.sh \
tests/misc/sort-compress.sh \
diff --git a/tests/misc/shuf-randomness.sh b/tests/misc/shuf-randomness.sh
new file mode 100755
index 0000000..3e35cca
--- /dev/null
+++ b/tests/misc/shuf-randomness.sh
@@ -0,0 +1,186 @@
+#!/bin/sh
+# Test shuf for somewhat uniform randomness
+
+# Copyright (C) 2013 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ shuf
+getlimits_
+
+# Don't run these tests by default.
+very_expensive_
+
+# Number of trials
+T=1000
+
+# Number of categories
+N=100
+REQUIRED_CHI_SQUARED=200 # Be extremely lenient:
+ # don't require "great" goodness of fit
+ # even for our assumed 99 degrees of freedom
+
+# K - when testing reservoir-sampling, print K lines
+K=20
+REQUIRED_CHI_SQUARED_K=50 # Be extremely lenient:
+ # don't require "great" goodness of fit
+ # even for our assumed 19 degrees of freedom
+
+
+
+# The input: many zeros followed by 1 one
+(yes 0 | head -n $((N-1)) ; echo 1 ) > in || framework_failure_
+
+
+is_uniform()
+{
+ # Input is assumed to be a string of $T space-separated values
+ # between 1 and $N
+ LINES="$1"
+
+ # Convert spaces to new-lines
+ LINES=$(echo "$LINES" | tr ' ' '\n' | sed '/^$/d') || framework_failure_
+
+ # Require exactly $T values
+ COUNT=$(echo "$LINES" | wc -l)
+ test "$COUNT" -eq "$T" || framework_failure_
+
+ # HIST is the histogram of counts per categories
+ # ( categories are between 1 and $N )
+ HIST=$(echo "$LINES" | sort -n | uniq -c)
+
+ #DEBUG
+ #echo "HIST=$HIST" 1>&2
+
+ ## Calculate Chi-Squared
+ CHI=$( echo "$HIST" |
+ awk -v n=$N -v t=$T '{ counts[$2] = $1 }
+ END {
+ exptd = ((1.0)*t)/n
+ chi = 0
+ for (i=1;i<=n;++i)
+ {
+ if (i in counts)
+ obs = counts[i]
+ else
+ obs= 0;
+ chi += ((obs-exptd)*(obs-exptd))/exptd
+ }
+ printf "%7.0f\n", chi
+ }' ) || framework_failure_
+ #DEBUG
+ #echo "CHI = $CHI" 1>&2
+
+ # If CHI is "small enough" assume uniformity.
+ test "$CHI" -le "$REQUIRED_CHI_SQUARED" && return 0
+
+ # assume not uniform "enough"
+ return 1
+}
+
+is_uniform_k()
+{
+ K="$1"
+ LINES="$2"
+
+ # Convert spaces to new-lines
+ LINES=$(echo "$LINES" | tr ' ' '\n' | sed '/^$/d') || framework_failure_
+
+ # HIST is the histogram of counts per categories
+ # There should be K numeric categories (e.g. 1/2/3/4/5 for K=5)
+ # and one X category (when the first K results from 'shuf' didn't
+ # contain "1").
+ HIST=$(echo "$LINES" | sort | uniq -c) || framework_failure_
+
+
+ #DEBUG
+ #echo "HIST=$HIST" 1>&2
+
+ ## Calculate Chi-Squared
+ CHI=$( echo "$HIST" |
+ awk -v n=$N \
+ -v t=$T \
+ -v k=$K \
+ '{ counts[$2] = $1 }
+ END {
+ # each of the first K categories is expected
+ # to appear as if they are uniformly distributed
+ exptd_k = ((1.0)*t)/n
+
+ chi = 0
+ # Sum chi-squared for the K categories
+ for (i=1;i<=k;++i)
+ {
+ if (i in counts)
+ obs = counts[i]
+ else
+ obs= 0;
+ chi += ((obs-exptd_k)*(obs-exptd_k))/exptd_k
+ }
+
+ # Print Chi-Squared
+ printf "%7.0f\n",chi
+ }' ) || framework_failure_
+ #DEBUG
+ #echo "CHI = $CHI" 1>&2
+
+ # If CHI is "small enough" assume uniformity.
+ test "$CHI" -le "$REQUIRED_CHI_SQUARED_K" && return 0
+
+ # assume not uniform "enough"
+ return 1
+}
+
+
+##
+## Step 1:
+## Run a control: the "1"s in the input file should not be uniformly distributed
+##
+LINE=$(cat in | sed -n '/1/=') || framework_failure_
+for loop in $(seq $T) ;
+do
+ LINES="$LINES $LINE"
+done
+is_uniform "$LINES" && framework_failure_
+
+##
+## Step 2:
+## Shuffle the input file $T times.
+## The resulting position (=line number) of the "1" should be uniformly distributed.
+## This tests the "load entire file" control flow in the C code.
+LINES=""
+for loop in $(seq $T) ;
+do
+ LINE=$(shuf in | sed -n '/1/=') || framework_failure_
+ LINES="$LINES $LINE"
+done
+is_uniform "$LINES" ||
+ { fail=1 ; echo "step 2 failed - not uniformly distributed?" 1>&2 ; }
+
+##
+## Step 3:
+## Shuffle the input file $T times, selecting the first K lines.
+## This tests the "reservoir sampling" control flow in the C code.
+LINES=""
+for loop in $(seq $T) ;
+do
+ LINE=$(shuf -n $K in | sed -n '/1/=') || framework_failure_
+ LINES="$LINES $LINE"
+done
+is_uniform_k "$K" "$LINES" ||
+ { fail=1 ; echo "step 3 failed - not uniformly distributed?" 1>&2 ; }
+
+
+Exit $fail
--
1.7.7.4