>From 0ff2403dde869af3f9a44dd7418aae3082d8c0aa Mon Sep 17 00:00:00 2001 From: Assaf Gordon Date: Thu, 7 Mar 2013 01:57:57 -0500 Subject: [PATCH 1/2] shuf: add (expensive) test for randomness To run manually: make check TESTS=tests/misc/shuf-randomess.sh \ SUBDIRS=. RUN_VERY_EXPENSIVE_TESTS=yes * tests/misc/shuf-randomness.sh: run 'shuf' repeatedly, and check if the output is uniformly distributed "enough". * tests/local.mk: add new test script. --- tests/local.mk | 1 + tests/misc/shuf-randomness.sh | 187 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 188 insertions(+), 0 deletions(-) create mode 100755 tests/misc/shuf-randomness.sh diff --git a/tests/local.mk b/tests/local.mk index 607ddc4..d3923f8 100644 --- a/tests/local.mk +++ b/tests/local.mk @@ -313,6 +313,7 @@ all_tests = \ tests/misc/shred-passes.sh \ tests/misc/shred-remove.sh \ tests/misc/shuf.sh \ + tests/misc/shuf-randomness.sh \ tests/misc/sort.pl \ tests/misc/sort-benchmark-random.sh \ tests/misc/sort-compress.sh \ diff --git a/tests/misc/shuf-randomness.sh b/tests/misc/shuf-randomness.sh new file mode 100755 index 0000000..c0b9e2e --- /dev/null +++ b/tests/misc/shuf-randomness.sh @@ -0,0 +1,187 @@ +#!/bin/sh +# Test shuf for somewhat uniform randomness + +# Copyright (C) 2013 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src +print_ver_ shuf +getlimits_ + +# Don't run these tests by default. +very_expensive_ + +# Number of trails +T=1000 + +# Number of categories +N=100 +REQUIRED_CHI_SQUARED=200 # Be extremely leniet: + # don't require "great" goodness of fit + # even for our assumed 99 degrees of freedom + +# K - when testing reservoir-sampling, print K lines +K=20 +REQUIRED_CHI_SQUARED_K=50 # Be extremely leniet: + # don't require "great" goodness of fit + # even for our assumed 19 degrees of freedom + + + +# The input: many zeros followed by 1 one +(yes 0 | head -n $((N-1)) ; echo 1 ) > in || framework_failure_ + + +is_uniform() +{ + # Input is assumed to be a string of $T spaces-separated-values + # between 1 and $N + LINES="$1" + + # Convert spaces to new-lines + LINES=$(echo "$LINES" | tr ' ' '\n' | sed '/^$/d') || framework_failure_ + + # Requre exactly $T values + COUNT=$(echo "$LINES" | wc -l) + test "$COUNT" -eq "$T" || framework_failure_ + + # HIST is the histogram of counts per categories + # ( categories are between 1 and $N ) + HIST=$(echo "$LINES" | sort -n | uniq -c) + + #DEBUG + #echo "HIST=$HIST" 1>&2 + + ## Calculate Chi-Squared + CHI=$( echo "$HIST" | + awk -v n=$N -v t=$T '{ counts[$2] = $1 } + END { + exptd = ((1.0)*t)/n + chi = 0 + for (i=1;i<=n;++i) + { + if (i in counts) + obs = counts[i] + else + obs= 0; + chi += ((obs-exptd)*(obs-exptd))/exptd + } + printf "%7.0f\n", chi + }' ) || framework_failure_ + #DEBUG + #echo "CHI = $CHI" 1>&2 + + # If CHI is "small enough" assume uniformity. + test "$CHI" -le "$REQUIRED_CHI_SQUARED" && return 0 + + # assume not uniform "enough" + return 1 +} + +is_uniform_k() +{ + K="$1" + LINES="$2" + + # Convert spaces to new-lines + LINES=$(echo "$LINES" | tr ' ' '\n' | sed '/^$/d') || framework_failure_ + + # HIST is the histogram of counts per categories + # There should be K numeric categories (e.g 1/2/3/4/5 for K=5) + # and one X category (when the first K results from 'shuf' didn't + # contain "1"). + HIST=$(echo "$LINES" | sort | uniq -c) || framework_failure_ + + + #DEBUG + #echo "HIST=$HIST" 1>&2 + + ## Calculate Chi-Squared + CHI=$( echo "$HIST" | + awk -v n=$N \ + -v t=$T \ + -v k=$K \ + '{ counts[$2] = $1 } + END { + # each of the first K categories is expected + # to appear as if they are uniformaly distributed + exptd_k = ((1.0)*t)/n + + chi = 0 + # Sum chi-squared for the K categories + for (i=1;i<=k;++i) + { + if (i in counts) + obs = counts[i] + else + obs= 0; + chi += ((obs-exptd_k)*(obs-exptd_k))/exptd_k + } + + # Print Chi-Squared + printf "%7.0f\n",chi + }' ) || framework_failure_ + #DEBUG + #echo "CHI = $CHI" 1>&2 + + # If CHI is "small enough" assume uniformity. + test "$CHI" -le "$REQUIRED_CHI_SQUARED_K" && return 0 + + # assume not uniform "enough" + return 1 +} + + +## +## Step 1: +## Run a control: the "1"s in the input file should not be +## uniformly distributed +LINE=$(cat in | sed -n '/1/=') || framework_failure_ +for loop in $(seq $T) ; +do + LINES="$LINES $LINE" +done +is_uniform "$LINES" && framework_failure_ + +## +## Step 2: +## Shuffle the input files $T times. +## The resulting position (=line number) of the "1" should be +## U niformly distributed. +## This tests the "load entire file" control flow in the C code. +LINES="" +for loop in $(seq $T) ; +do + LINE=$(shuf in | sed -n '/1/=') || framework_failure_ + LINES="$LINES $LINE" +done +is_uniform "$LINES" || + { fail=1 ; echo "step 2 failed - not uniformly distributed?" 1>&2 ; } + +## +## Step 3: +## Shuffle the input files $T times, selecting the first K lines. +## This tests the "reservoir sampling" control flow in the C code. +LINES="" +for loop in $(seq $T) ; +do + LINE=$(shuf -n $K in | sed -n '/1/=') || framework_failure_ + LINES="$LINES $LINE" +done +is_uniform_k "$K" "$LINES" || + { fail=1 ; echo "step 3 failed - not uniformly distributed?" 1>&2 ; } + + +Exit $fail -- 1.7.7.4