[4648] | 1 | ;; |
---|
| 2 | ;; |
---|
| 3 | ;; Perform some simple randomness tests on a sequence of numerical |
---|
| 4 | ;; values. |
---|
| 5 | ;; |
---|
| 6 | ;; Based on the ENT program by John Walker |
---|
| 7 | ;; |
---|
| 8 | ;; |
---|
| 9 | ;; Copyright 2007 Ivan Raikov |
---|
| 10 | ;; |
---|
| 11 | ;; |
---|
[4881] | 12 | ;; This program is free software: you can redistribute it and/or modify |
---|
| 13 | ;; it under the terms of the GNU General Public License as published by |
---|
| 14 | ;; the Free Software Foundation, either version 3 of the License, or (at |
---|
| 15 | ;; your option) any later version. |
---|
[4648] | 16 | ;; |
---|
[4881] | 17 | ;; This program is distributed in the hope that it will be useful, but |
---|
| 18 | ;; WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
| 19 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
---|
| 20 | ;; General Public License for more details. |
---|
[4648] | 21 | ;; |
---|
[4881] | 22 | ;; A full copy of the GPL license can be found at |
---|
| 23 | ;; <http://www.gnu.org/licenses/>.")))) |
---|
[4648] | 24 | ;; |
---|
[4881] | 25 | ;; |
---|
[4648] | 26 | |
---|
| 27 | |
---|
| 28 | |
---|
| 29 | ;; |
---|
| 30 | ;; |
---|
| 31 | ;; Chi-Square Test |
---|
| 32 | ;; |
---|
| 33 | ;; In general, the Chi-Square distribution for an experiment with k |
---|
| 34 | ;; possible outcomes, performed n times, in which Y1, Y2,... Yk are |
---|
| 35 | ;; the number of experiments which resulted in each possible outcome, |
---|
| 36 | ;; and probabilities of each outcome p1, p2,... pk, is given as: |
---|
| 37 | ;; |
---|
| 38 | ;; \Chi^2 = \sum_{1 <= i <= k}{(Y_i - m * p_i)^2 / np_i} |
---|
| 39 | ;; |
---|
| 40 | ;; \Chi^2 will grow larger as the measured results diverge from those |
---|
| 41 | ;; expected by pure chance. The probability Q that a Chi-Square value |
---|
| 42 | ;; calculated for an experiment with d degrees of freedom (where |
---|
| 43 | ;; d=k-1, one less the number of possible outcomes) is due to chance |
---|
| 44 | ;; is: |
---|
| 45 | ;; |
---|
| 46 | ;; Q(\Chi^2,d) = |
---|
| 47 | ;; [2^{d/2} * \Gamma(d/2)]^{-1} * |
---|
| 48 | ;; \int_{\Chi^2}^{\infty}(t)^{d/2-1} * exp(-t/2) * dt |
---|
| 49 | ;; |
---|
| 50 | ;; Where Gamma is the generalisation of the factorial function to real |
---|
| 51 | ;; and complex arguments: |
---|
| 52 | ;; |
---|
| 53 | ;; Gamma(x) = \int_{0}^{\infty} t^{x-1} * exp(-t) * dt |
---|
| 54 | ;; |
---|
| 55 | ;; |
---|
| 56 | ;; There is no closed form solution for Q, so it must be evaluated |
---|
| 57 | ;; numerically. Note that the probability calculated from the \Chi^2 |
---|
| 58 | ;; is an approximation which is valid only for large values of n, and |
---|
| 59 | ;; is therefore only meaningful when calculated from a large number of |
---|
| 60 | ;; independent experiments. |
---|
| 61 | ;; |
---|
| 62 | ;; In this implementation, the Chi-Square distribution is calculated |
---|
| 63 | ;; for the list of values given as argument to the random-test |
---|
| 64 | ;; procedure and expressed as an absolute number and a percentage |
---|
| 65 | ;; which indicates how frequently a truly random sequence would exceed |
---|
| 66 | ;; the value calculated. |
---|
| 67 | ;; |
---|
| 68 | ;; The percentage can be interpreted as the degree to which the |
---|
| 69 | ;; sequence tested is suspected of being non-random. If the percentage |
---|
| 70 | ;; is greater than 99% or less than 1%, the sequence is almost |
---|
| 71 | ;; certainly not random. If the percentage is between 99% and 95% or |
---|
| 72 | ;; between 1% and 5%, the sequence is suspect. Percentages between 90% |
---|
| 73 | ;; and 95% and 5% and 10% indicate the sequence is "almost |
---|
| 74 | ;; suspect". |
---|
| 75 | ;; |
---|
| 76 | ;; |
---|
| 77 | ;; Arithmetic Mean |
---|
| 78 | ;; |
---|
| 79 | ;; This is simply the result of summing the all the values in the |
---|
| 80 | ;; sequence and dividing by the sequence length. If the data are close |
---|
| 81 | ;; to random, the mean should be about (2^b - 1)/2 where b is the |
---|
| 82 | ;; number of bits used to represent a value. If the mean departs from |
---|
| 83 | ;; this value, the values are consistently high or low. |
---|
| 84 | ;; |
---|
| 85 | ;; |
---|
| 86 | ;; Monte Carlo Value for Pi |
---|
| 87 | ;; |
---|
| 88 | ;; Each pair of two values in the input sequence is used as X and |
---|
| 89 | ;; Y coordinates within a square with side N (the length of the input |
---|
| 90 | ;; sequence). If the distance of the randomly-generated point is less |
---|
| 91 | ;; than the radius of a circle inscribed within the square, the pair |
---|
| 92 | ;; of values is considered a "hit". The percentage of hits can be used |
---|
| 93 | ;; to calculate the value of Pi: |
---|
| 94 | ;; |
---|
| 95 | ;; |
---|
| 96 | ;; # points within circle 1/4 * pi * r^2 |
---|
| 97 | ;; ---------------------- = -------------- = 1/4 * pi |
---|
| 98 | ;; # points within square r^2 |
---|
| 99 | ;; |
---|
| 100 | ;; |
---|
| 101 | ;; Therefore, |
---|
| 102 | ;; # points within circle |
---|
| 103 | ;; pi = 4 * ---------------------- |
---|
| 104 | ;; # points within square |
---|
| 105 | ;; |
---|
| 106 | ;; |
---|
| 107 | ;; For very long sequences (this approximation converges very |
---|
| 108 | ;; slowly), the value will approach the correct value of Pi if the |
---|
| 109 | ;; sequence is close to random. |
---|
| 110 | ;; |
---|
| 111 | ;; |
---|
| 112 | ;; Serial Correlation Coefficient |
---|
| 113 | ;; |
---|
| 114 | ;; This statistic measures the extent to which each value in the |
---|
| 115 | ;; sequence depends upon the previous value. For random sequences, |
---|
| 116 | ;; this value (which can be positive or negative) must be close to |
---|
| 117 | ;; zero. |
---|
| 118 | ;; |
---|
| 119 | ;; |
---|
| 120 | |
---|
| 121 | (require-extension srfi-4) |
---|
| 122 | (define-extension random-test) |
---|
| 123 | |
---|
| 124 | (declare (export make-random-test format-random-stats)) |
---|
| 125 | |
---|
| 126 | |
---|
| 127 | ; Parse & embed |
---|
| 128 | #>! |
---|
| 129 | double pochisq (double x, double df); |
---|
| 130 | double log2 (double x); |
---|
| 131 | <# |
---|
| 132 | |
---|
| 133 | ;; Number of bins for Chi-Square test |
---|
| 134 | (define N 1024) |
---|
| 135 | |
---|
| 136 | ;; Circle radius for the Monte Carlo calculation of Pi |
---|
| 137 | (define R N) |
---|
| 138 | |
---|
| 139 | ;; Number of coordinates for the Monte Carlo calculation of Pi |
---|
| 140 | (define M 2) |
---|
| 141 | |
---|
| 142 | |
---|
| 143 | (define PI 3.14159265358979323846) |
---|
| 144 | |
---|
| 145 | (define (make-random-test . rest) |
---|
| 146 | (let-optionals rest ((scar car) (scdr cdr) (snull? null?)) |
---|
[4656] | 147 | (lambda (seq) |
---|
[4648] | 148 | |
---|
| 149 | ;; Bins to count occurrences of values |
---|
| 150 | (define bins (make-u32vector N 0)) |
---|
| 151 | |
---|
| 152 | ;; Monte Carlo coordinates |
---|
| 153 | (define mcoord (make-u32vector M 0)) |
---|
| 154 | |
---|
| 155 | (let ((nmin 0.0) ;; Minimum and maximum values encountered |
---|
| 156 | (nmax 0.0) |
---|
| 157 | (mp 0) ;; Monte Carlo accumulator pointer |
---|
| 158 | (mcount 0) ;; Monte Carlo tries |
---|
| 159 | (incircle 0)) ;; Monte Carlo circle inside count |
---|
| 160 | (let* ;; Serial correlation terms |
---|
| 161 | ((sccn0 (and seq (scar seq))) |
---|
| 162 | (sccn-1 #f) |
---|
| 163 | (scct1 0.0) |
---|
| 164 | (scct2 0.0) |
---|
| 165 | (scct3 0.0)) |
---|
| 166 | |
---|
| 167 | (let loop ((total 0) (seq seq)) |
---|
| 168 | (if (snull? seq) |
---|
[4656] | 169 | (random-stats total nmin nmax bins incircle mcount sccn0 scct1 scct2 scct3 sccn-1 ) |
---|
[4648] | 170 | (let ((n (scar seq))) |
---|
| 171 | ;; Updated minimum and maximum |
---|
| 172 | ;; Update bin counters |
---|
[4653] | 173 | (let ((nbin (inexact->exact (modulo n N)))) |
---|
[4648] | 174 | (u32vector-set! bins nbin (fx+ 1 (u32vector-ref bins nbin))) |
---|
[4656] | 175 | (let ((bi (u32vector-ref bins nbin))) |
---|
| 176 | (if (< bi nmin) (set! nmin bi)) |
---|
| 177 | (if (> bi nmax) (set! nmax bi))) |
---|
[4648] | 178 | ;; Update inside / outside circle counts for Monte Carlo |
---|
| 179 | ;; calculation of Pi |
---|
| 180 | (u32vector-set! mcoord mp nbin) |
---|
| 181 | (set! mp (fx+ 1 mp)) |
---|
| 182 | (if (fx<= M mp) |
---|
| 183 | (begin (set! mcount (fx+ mcount 1)) |
---|
| 184 | (set! mp 0) |
---|
| 185 | (let ((x (u32vector-ref mcoord 0)) |
---|
| 186 | (y (u32vector-ref mcoord 1))) |
---|
| 187 | ;; determine if the random "coordinates" are |
---|
| 188 | ;; inside or outside the circle |
---|
| 189 | (let ((dist (sqrt (+ (expt x 2) (expt y 2))))) |
---|
| 190 | (if (<= dist R) (set! incircle (fx+ 1 incircle))))))) |
---|
| 191 | ;; Update calculation of serial correlation coefficient |
---|
| 192 | (if sccn-1 (set! scct1 (+ scct1 (* sccn-1 n)))) |
---|
| 193 | (set! scct2 (+ scct2 n)) |
---|
| 194 | (set! scct3 (+ scct3 (* n n))) |
---|
| 195 | (set! sccn-1 n) |
---|
| 196 | (loop (fx+ 1 total) (scdr seq))))))))))) |
---|
| 197 | |
---|
[4656] | 198 | (define (random-stats total nmin nmax bins incircle mcount sccn0 scct1 scct2 scct3 sccn) |
---|
[4648] | 199 | |
---|
| 200 | ;; Probabilities per bin for entropy |
---|
| 201 | (define probs (make-f64vector N 0)) |
---|
| 202 | |
---|
| 203 | ;; Compute serial correlation coefficient |
---|
| 204 | (let* ((scct1 (+ scct1 (* sccn sccn0))) |
---|
| 205 | (scct2 (* scct2 scct2)) |
---|
| 206 | (sccdenom (- (* total scct3) scct2)) |
---|
| 207 | (scc (if (zero? sccdenom) |
---|
| 208 | -10000.0 |
---|
| 209 | (/ (- (* total scct1) scct2) sccdenom)))) |
---|
| 210 | |
---|
| 211 | ;; Scan bins and calculate probability for each bin and |
---|
| 212 | ;; Chi-Square distribution |
---|
| 213 | (let ((cexp (/ total N))) ;; Expected count per bin |
---|
| 214 | (let-values (((chisq pochisq datasum) |
---|
| 215 | (let loop ((i 0) (chisq 0.0) (datasum 0.0)) |
---|
| 216 | (if (fx< i N) |
---|
| 217 | (let ((bi (u32vector-ref bins i))) |
---|
| 218 | (let ((a (- bi cexp)) (p (/ bi total))) |
---|
| 219 | (f64vector-set! probs i p) |
---|
| 220 | (loop (fx+ 1 i) |
---|
| 221 | (+ chisq (/ (* a a) cexp)) |
---|
| 222 | (+ datasum (* i bi))))) |
---|
| 223 | (values chisq (pochisq chisq (- total 1)) datasum))))) |
---|
| 224 | |
---|
| 225 | ;; Calculate entropy |
---|
| 226 | (let ((ent (let loop ((i 0) (ent 0)) |
---|
| 227 | (if (fx< i N) |
---|
| 228 | (let ((p (f64vector-ref probs i))) |
---|
| 229 | (loop (fx+ 1 i) (if (< 0.0 p) (+ ent (* p (log2 (/ 1.0 p)))) ent))) |
---|
| 230 | ent)))) |
---|
| 231 | |
---|
| 232 | ;; Calculate Monte Carlo value for Pi from percentage of hits |
---|
| 233 | ;; within the circle |
---|
| 234 | (let ((montepi (* 4 (/ incircle mcount)))) |
---|
[4656] | 235 | `((total . ,total) |
---|
[4648] | 236 | (ent . ,ent) |
---|
| 237 | (chisq . ,chisq) |
---|
| 238 | (pochisq . ,pochisq) |
---|
| 239 | (mean . ,(/ datasum total)) |
---|
| 240 | (min . ,nmin) |
---|
| 241 | (max . ,nmax) |
---|
| 242 | (montepi . ,montepi) |
---|
| 243 | (scc . ,scc)))))))) |
---|
| 244 | |
---|
| 245 | (define (format-random-stats out stats) |
---|
| 246 | (fprintf out "Entropy is ~S.\n" (alist-ref 'ent stats)) |
---|
| 247 | (fprintf out "Chi-Square distribution for ~S samples is ~S (~S percent random).\n" |
---|
| 248 | (alist-ref 'total stats) |
---|
| 249 | (alist-ref 'chisq stats) |
---|
| 250 | (* 100 (alist-ref 'pochisq stats))) |
---|
| 251 | (fprintf out "Arithmetic mean value of the sequence is ~S (~S = random).\n" |
---|
| 252 | (alist-ref 'mean stats) |
---|
[4656] | 253 | (/ (alist-ref 'total stats) N)) |
---|
[4648] | 254 | (fprintf out "Monte-Carlo value for Pi is ~S (error ~S percent).\n" |
---|
| 255 | (alist-ref 'montepi stats) |
---|
| 256 | (* 100 (/ (abs (- PI (alist-ref 'montepi stats))) PI))) |
---|
| 257 | (fprintf out "Serial correlation coefficient is ~S.\n" |
---|
| 258 | (let ((scc (alist-ref 'scc stats))) |
---|
| 259 | (if (>= scc -99999) scc "undefined (all values equal)")))) |
---|
| 260 | |
---|
| 261 | |
---|
| 262 | ; Include into generated code, but don't parse: |
---|
| 263 | #> |
---|
| 264 | |
---|
| 265 | #include <math.h> |
---|
| 266 | double poz (double z); |
---|
| 267 | |
---|
| 268 | #define LOG_SQRT_PI 0.5723649429247000870717135 /* log (sqrt (pi)) */ |
---|
| 269 | #define I_SQRT_PI 0.5641895835477562869480795 /* 1 / sqrt (pi) */ |
---|
| 270 | #define BIGX 20.0 /* max value to represent exp (x) */ |
---|
| 271 | #define ex(x) (((x) < -BIGX) ? 0.0 : exp (x)) |
---|
| 272 | |
---|
| 273 | |
---|
| 274 | /* ALGORITHM Compute probability of chi square value. |
---|
| 275 | ;; Adapted from: |
---|
| 276 | ;; Hill, I. D. and Pike, M. C. Algorithm 299 |
---|
| 277 | ;; Collected Algorithms for the CACM 1967 p. 243 |
---|
| 278 | ;; Updated for rounding errors based on remark in |
---|
| 279 | ;; ACM TOMS June 1985, page 185 |
---|
| 280 | */ |
---|
| 281 | |
---|
| 282 | /* |
---|
| 283 | ;; x: obtained chi-square value |
---|
| 284 | ;; df: degrees of freedom |
---|
| 285 | */ |
---|
| 286 | |
---|
| 287 | double pochisq (double x, double df) |
---|
| 288 | { |
---|
| 289 | |
---|
| 290 | double a, y, s; |
---|
| 291 | double e, c, z; |
---|
| 292 | int even; /* true if df is an even number */ |
---|
| 293 | |
---|
| 294 | if (x <= 0.0 || df < 1) |
---|
| 295 | return (1.0); |
---|
| 296 | |
---|
| 297 | a = 0.5 * x; |
---|
| 298 | even = (2*(df/2)) == df; |
---|
| 299 | if (df > 1) |
---|
| 300 | y = ex (-a); |
---|
| 301 | s = (even ? y : (2.0 * poz (-sqrt (x)))); |
---|
| 302 | if (df > 2) |
---|
| 303 | { |
---|
| 304 | x = 0.5 * (df - 1.0); |
---|
| 305 | z = (even ? 1.0 : 0.5); |
---|
| 306 | if (a > BIGX) |
---|
| 307 | { |
---|
| 308 | e = (even ? 0.0 : LOG_SQRT_PI); |
---|
| 309 | c = log (a); |
---|
| 310 | while (z <= x) |
---|
| 311 | { |
---|
| 312 | e = log (z) + e; |
---|
| 313 | s += ex (c*z-a-e); |
---|
| 314 | z += 1.0; |
---|
| 315 | } |
---|
| 316 | return (s); |
---|
| 317 | } |
---|
| 318 | else |
---|
| 319 | { |
---|
| 320 | e = (even ? 1.0 : (I_SQRT_PI / sqrt (a))); |
---|
| 321 | c = 0.0; |
---|
| 322 | while (z <= x) |
---|
| 323 | { |
---|
| 324 | e = e * (a / z); |
---|
| 325 | c = c + e; |
---|
| 326 | z += 1.0; |
---|
| 327 | } |
---|
| 328 | return (c * y + s); |
---|
| 329 | } |
---|
| 330 | } |
---|
| 331 | else |
---|
| 332 | return (s); |
---|
| 333 | } |
---|
| 334 | |
---|
| 335 | /* Maximum meaningful z value */ |
---|
| 336 | #define Z_MAX 6.0 |
---|
| 337 | |
---|
| 338 | |
---|
| 339 | /* |
---|
| 340 | ;; FUNCTION poz: probability of normal z value |
---|
| 341 | ;; |
---|
| 342 | ;; Adapted from a polynomial approximation in: |
---|
| 343 | ;; Ibbetson D, Algorithm 209 |
---|
| 344 | ;; Collected Algorithms of the CACM 1963 p. 616 |
---|
| 345 | ;; Note: |
---|
| 346 | ;; |
---|
| 347 | ;; This routine has six digit accuracy, so it is only |
---|
| 348 | ;; useful for absolute z values < 6. For z values >= to |
---|
| 349 | ;; 6.0, poz() returns 0.0. |
---|
| 350 | ;; |
---|
| 351 | */ |
---|
| 352 | double poz (double z) |
---|
| 353 | { |
---|
| 354 | |
---|
| 355 | double y, x, w; |
---|
| 356 | |
---|
| 357 | if (z == 0.0) |
---|
| 358 | x = 0.0; |
---|
| 359 | else |
---|
| 360 | { |
---|
| 361 | y = 0.5 * fabs (z); |
---|
| 362 | if (y >= (Z_MAX * 0.5)) |
---|
| 363 | { |
---|
| 364 | x = 1.0; |
---|
| 365 | } |
---|
| 366 | else if (y < 1.0) |
---|
| 367 | { |
---|
| 368 | w = y * y; |
---|
| 369 | x = ((((((((0.000124818987 * w |
---|
| 370 | -0.001075204047) * w +0.005198775019) * w |
---|
| 371 | -0.019198292004) * w +0.059054035642) * w |
---|
| 372 | -0.151968751364) * w +0.319152932694) * w |
---|
| 373 | -0.531923007300) * w +0.797884560593) * y * 2.0; |
---|
| 374 | } |
---|
| 375 | else |
---|
| 376 | { |
---|
| 377 | y -= 2.0; |
---|
| 378 | x = (((((((((((((-0.000045255659 * y |
---|
| 379 | +0.000152529290) * y -0.000019538132) * y |
---|
| 380 | -0.000676904986) * y +0.001390604284) * y |
---|
| 381 | -0.000794620820) * y -0.002034254874) * y |
---|
| 382 | +0.006549791214) * y -0.010557625006) * y |
---|
| 383 | +0.011630447319) * y -0.009279453341) * y |
---|
| 384 | +0.005353579108) * y -0.002141268741) * y |
---|
| 385 | +0.000535310849) * y +0.999936657524; |
---|
| 386 | } |
---|
| 387 | } |
---|
| 388 | |
---|
| 389 | return (z > 0.0 ? ((x + 1.0) * 0.5) : ((1.0 - x) * 0.5)); |
---|
| 390 | } |
---|
| 391 | |
---|
| 392 | <# |
---|