#include #include #include #include /* Encourage users to pre-sort p[] monotonic descending for large K or small N. * Improves early-out behaviour with fewer calls to gsl_ran_binomial(). * * Returns 'last': least index to all-zero tail sequence. * last==0 && N>0 implies degeneracy * if norm==0.0, degenerate by request (note: K==0 => norm==0.0) * elsewise, freelance degeneracy. */ size_t /* last */ gsl_ran_multinomial_early_out (const gsl_rng * r, const size_t K, const unsigned int N, const double p[], unsigned int n[]) { size_t last; /* return value */ size_t k; double norm = 0.0; double sum_p = 0.0; unsigned int tail_sum_n; /* p[k] may contain non-negative weights that do not sum to 1.0. * Even a probability distribution will not exactly sum to 1.0 * due to rounding errors. * * Where users sort monotonic descending as encouraged for efficiency * for large K, summation accuracy would improve by reverse summation * from least to greatest; however, early-out requires traversal * from greatest to least, and forward summation ensures sum_p converges * on norm, perhaps more important, even if norm ends up less * accurate summing in the foward direction. * * If norm sums to 0 (iff each p[k] <= 0.0), * then not possible to place N elements. */ for (k = 0; k < K; ++k) { if (p[k] > 0.0) norm += p[k]; } tail_sum_n = N; /* iterate until all elements placed: ie. tail_sum_n exhausted * for well-conditioned inputs tail_sum_n > 0 => k < K */ for (k = 0; tail_sum_n > 0 && k < K; ++k) { /* tempting to reflect sum_p into tail_sum_p and thus eliminate * subtraction from 'norm' term, however, repeated subtraction * from tail_sum_p could yield negative epsilon artifacts */ if (p[k] > 0.0) { n[k] = gsl_ran_binomial (r, p[k] / (norm - sum_p), tail_sum_n); sum_p += p[k]; } else { n[k] = 0; } tail_sum_n -= n[k]; } /* if tail_sum > 0 we have a bogus partial result * norm == 0.0 => tail_sum_n == N on loop exit * other cases might arise from bugs or floating point inaccuracy */ last = (tail_sum_n > 0) ? 0 : k; /* establish that last is *least* index to all-zero tail sequence * from above last != 0 => k > 0, but let's not code brittle invariants */ assert (last == 0 || ((k > 0) && n[k-1] > 0)); /* tail clear early-out or bogus-result * in the case of early-out, this is unnecessary work if each use * takes full acount of the return value, which we neither demand nor expect */ for (k = last; k < K; ++k) n[k] = 0; /* 1) logical post-condition: last > 0 => sum n[0..last) == N * 2) defensive post-condition: last==0 => each n[0..K) == 0 * 3) historical post-condition: each n[last..K) == 0 * unless the API is refactored to pre-sum norm, no point relaxing (3) * since we are O(K) already, even for N << K * notation: [) expresses semi-open interval */ return last; } /* WARNING: potential C++isms ahead; might require g++ */ void violation (const char * testname, const char * msg) { printf ("%20s: VIOLATION *** %s ***\n", testname, msg); } size_t /* last */ multinomial_post_check (const char * testname, const gsl_rng * r, const size_t K, const unsigned int N, const double p[], unsigned int n[]) { size_t last = gsl_ran_multinomial_early_out (r, K, N, p, n); size_t sum = 0; for (size_t i = 0; i < last; ++i) sum += n[i]; if (last > 0 && sum != N) violation (testname, "partial placement"); if (last > 0 && n[last-1] == 0) violation (testname, "non-maximal all-zero tail"); { /* ISO scope error: loop-predicate amnesia */ size_t i; for (i = last; i < K && n[i] == 0; ++i) ; if (i < K) violation (testname, "all-zero tail not all-zero\n"); } return last; } // don't have %z for size_t from C99 in g++ and not using cout #define FORMAT_Z "u" #define UNSIZE_T(x) ((unsigned int)(x)) void multinomial_test (const char * testname, const size_t reps, const gsl_rng * r, const size_t K, const unsigned int N, const double p[], unsigned int n[]) { printf ("Begin <%s: K=(", testname); for (size_t i = 0; i < K; ++i) printf ("%s%f", i?", ":"", p[i]); printf ("), N=%u>\n", N); for (size_t i = 0; i < reps; ++i) { size_t last = multinomial_post_check (testname, r, K, N, p, n); printf (" ++ %2" FORMAT_Z ": ", UNSIZE_T(last)); for (size_t k = 0; k < K; ++k) printf ("%d ", n[k]); printf ("\n"); } for (size_t i = 0; i < reps; ++i) { gsl_ran_multinomial (r, K, N, p, n); printf (" -- : "); for (size_t k = 0; k < K; ++k) printf ("%d ", n[k]); printf ("\n"); } printf ("End <%s>\n", testname); } int main (int argc, char* argv[]) { const gsl_rng_type * T; gsl_rng * r; gsl_rng_env_setup(); T = gsl_rng_default; r = gsl_rng_alloc (T); const size_t K = 3; unsigned int s[K]; size_t reps = 0; unsigned int N = 0; double p_null[K] = { 0.0, 0.0, 0.0 }; N = 40; reps = 1; multinomial_test ("empty dist", reps, r, 0, N, p_null, s); multinomial_test ("null dist", reps, r, K, N, p_null, s); double p_sloppy[K] = { 0.5, 0.5, -0.25 }; N = 100; reps = 10; multinomial_test ("bad addition", reps, r, K, N, p_sloppy, s); double p_small[K] = { 0.01, -100.0, 0.01 }; N = 100; reps = 10; multinomial_test ("eschew negatives", reps, r, K, N, p_small, s); double p_fast[K] = { 0.8, 0.16, 0.04 }; N = 10; reps = 40; multinomial_test ("early out", reps, r, K, N, p_fast, s); gsl_rng_free (r); return 0; }