From 1adfd08cd3a52c373932b0f1039755a240d2c0b8 Mon Sep 17 00:00:00 2001
From: Assaf Gordon <assafgordon@gmail.com>
Date: Thu, 7 Mar 2013 01:57:57 -0500
Subject: [PATCH 1/2] shuf: add (expensive) test for randomness
To run manually:
make check TESTS=tests/misc/shuf-randomness.sh \
SUBDIRS=. RUN_VERY_EXPENSIVE_TESTS=yes
* tests/misc/shuf-randomness.sh: run 'shuf' repeatedly, and check if the
output is uniformly distributed "enough".
* tests/local.mk: add new test script.
---
tests/local.mk | 1 +
tests/misc/shuf-randomness.sh | 186 +++++++++++++++++++++++++++++++++++++++++
2 files changed, 187 insertions(+), 0 deletions(-)
create mode 100755 tests/misc/shuf-randomness.sh
diff --git a/tests/local.mk b/tests/local.mk
index 607ddc4..d3923f8 100644
--- a/tests/local.mk
+++ b/tests/local.mk
@@ -313,6 +313,7 @@ all_tests = \
tests/misc/shred-passes.sh \
tests/misc/shred-remove.sh \
tests/misc/shuf.sh \
+ tests/misc/shuf-randomness.sh \
tests/misc/sort.pl \
tests/misc/sort-benchmark-random.sh \
tests/misc/sort-compress.sh \
diff --git a/tests/misc/shuf-randomness.sh b/tests/misc/shuf-randomness.sh
new file mode 100755
index 0000000..3e35cca
--- /dev/null
+++ b/tests/misc/shuf-randomness.sh
@@ -0,0 +1,186 @@
+#!/bin/sh
+# Test shuf for somewhat uniform randomness
+
+# Copyright (C) 2013 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ shuf
+getlimits_
+
+# Don't run these tests by default.
+very_expensive_
+
+# Number of trials
+T=1000
+
+# Number of categories
+N=100
+REQUIRED_CHI_SQUARED=200 # Be extremely lenient:
+ # don't require "great" goodness of fit
+ # even for our assumed 99 degrees of freedom
+
+# K - when testing reservoir-sampling, print K lines
+K=20
+REQUIRED_CHI_SQUARED_K=50 # Be extremely lenient:
+ # don't require "great" goodness of fit
+ # even for our assumed 19 degrees of freedom
+
+
+
+# The input: many zeros followed by 1 one
+(yes 0 | head -n $((N-1)) ; echo 1 ) > in || framework_failure_
+
+
+is_uniform()
+{
+ # Input is assumed to be a string of $T space-separated values
+ # between 1 and $N
+ LINES="$1"
+
+ # Convert spaces to new-lines
+ LINES=$(echo "$LINES" | tr ' ' '\n' | sed '/^$/d') || framework_failure_
+
+ # Require exactly $T values
+ COUNT=$(echo "$LINES" | wc -l)
+ test "$COUNT" -eq "$T" || framework_failure_
+
+ # HIST is the histogram of counts per categories
+ # ( categories are between 1 and $N )
+ HIST=$(echo "$LINES" | sort -n | uniq -c)
+
+ #DEBUG
+ #echo "HIST=$HIST" 1>&2
+
+ ## Calculate Chi-Squared
+ CHI=$( echo "$HIST" |
+ awk -v n=$N -v t=$T '{ counts[$2] = $1 }
+ END {
+ exptd = ((1.0)*t)/n
+ chi = 0
+ for (i=1;i<=n;++i)
+ {
+ if (i in counts)
+ obs = counts[i]
+ else
+ obs= 0;
+ chi += ((obs-exptd)*(obs-exptd))/exptd
+ }
+ printf "%7.0f\n", chi
+ }' ) || framework_failure_
+ #DEBUG
+ #echo "CHI = $CHI" 1>&2
+
+ # If CHI is "small enough" assume uniformity.
+ test "$CHI" -le "$REQUIRED_CHI_SQUARED" && return 0
+
+ # assume not uniform "enough"
+ return 1
+}
+
+is_uniform_k()
+{
+ K="$1"
+ LINES="$2"
+
+ # Convert spaces to new-lines
+ LINES=$(echo "$LINES" | tr ' ' '\n' | sed '/^$/d') || framework_failure_
+
+ # HIST is the histogram of counts per categories
+ # There should be K numeric categories (e.g. 1/2/3/4/5 for K=5)
+ # and one X category (when the first K results from 'shuf' didn't
+ # contain "1").
+ HIST=$(echo "$LINES" | sort | uniq -c) || framework_failure_
+
+
+ #DEBUG
+ #echo "HIST=$HIST" 1>&2
+
+ ## Calculate Chi-Squared
+ CHI=$( echo "$HIST" |
+ awk -v n=$N \
+ -v t=$T \
+ -v k=$K \
+ '{ counts[$2] = $1 }
+ END {
+ # each of the first K categories is expected
+ # to appear as if they are uniformly distributed
+ exptd_k = ((1.0)*t)/n
+
+ chi = 0
+ # Sum chi-squared for the K categories
+ for (i=1;i<=k;++i)
+ {
+ if (i in counts)
+ obs = counts[i]
+ else
+ obs= 0;
+ chi += ((obs-exptd_k)*(obs-exptd_k))/exptd_k
+ }
+
+ # Print Chi-Squared
+ printf "%7.0f\n",chi
+ }' ) || framework_failure_
+ #DEBUG
+ #echo "CHI = $CHI" 1>&2
+
+ # If CHI is "small enough" assume uniformity.
+ test "$CHI" -le "$REQUIRED_CHI_SQUARED_K" && return 0
+
+ # assume not uniform "enough"
+ return 1
+}
+
+
+##
+## Step 1:
+## Run a control: the "1"s in the input file should not be uniformly distributed
+##
+LINE=$(cat in | sed -n '/1/=') || framework_failure_
+for loop in $(seq $T) ;
+do
+ LINES="$LINES $LINE"
+done
+is_uniform "$LINES" && framework_failure_
+
+##
+## Step 2:
+## Shuffle the input file $T times.
+## The resulting position (=line number) of the "1" should be uniformly distributed.
+## This tests the "load entire file" control flow in the C code.
+LINES=""
+for loop in $(seq $T) ;
+do
+ LINE=$(shuf in | sed -n '/1/=') || framework_failure_
+ LINES="$LINES $LINE"
+done
+is_uniform "$LINES" ||
+ { fail=1 ; echo "step 2 failed - not uniformly distributed?" 1>&2 ; }
+
+##
+## Step 3:
+## Shuffle the input file $T times, selecting the first K lines.
+## This tests the "reservoir sampling" control flow in the C code.
+LINES=""
+for loop in $(seq $T) ;
+do
+ LINE=$(shuf -n $K in | sed -n '/1/=') || framework_failure_
+ LINES="$LINES $LINE"
+done
+is_uniform_k "$K" "$LINES" ||
+ { fail=1 ; echo "step 3 failed - not uniformly distributed?" 1>&2 ; }
+
+
+Exit $fail
--
1.7.7.4