1 | |
---|
2 | (use eggdoc) |
---|
3 | |
---|
4 | (define doc |
---|
5 | `((eggdoc:begin |
---|
6 | (name "random-test") |
---|
7 | (description "Some simple randomness tests for a sequence of numbers.") |
---|
8 | (author (url "http://chicken.wiki.br/ivan raikov" "Ivan Raikov")) |
---|
9 | |
---|
10 | (history |
---|
11 | (version "1.6" "Removed testeez dependency") |
---|
12 | (version "1.5" "Build script updated for better cross-platform compatibility") |
---|
13 | (version "1.4" "eggdoc documentation fix") |
---|
14 | (version "1.3" "License upgrade to GPL v3.") |
---|
15 | (version "1.2" "Simplified the interface of random-test procedure") |
---|
16 | (version "1.1" "Bug fix in the bin update code") |
---|
17 | (version "1.0" "Initial release")) |
---|
18 | |
---|
19 | (requires easyffi) |
---|
20 | |
---|
21 | (usage "(require-extension random-test)") |
---|
22 | |
---|
23 | (download "random-test.egg") |
---|
24 | |
---|
25 | (documentation |
---|
26 | |
---|
27 | (p "The " (tt "random-test") " library provides a procedure that applies various " |
---|
28 | "statistical tests to a sequence of random numerical values, and a procedure to " |
---|
29 | "reports the results of those tests in convenient form. ") |
---|
30 | |
---|
31 | (p "The library is useful for evaluating pseudorandom number generators for " |
---|
32 | "statistical sampling applications, compression algorithms, and other applications " |
---|
33 | "where the properties of a random sequence are of interest. The code in the library " |
---|
34 | "is based on the " (url "http://www.fourmilab.ch/random/" "ent program") |
---|
35 | " by John Walker.") |
---|
36 | |
---|
37 | |
---|
38 | (subsection "Procedures" |
---|
39 | |
---|
40 | (procedure "make-random-test:: [CAR CDR NULL?] -> (SEQ -> RANDOM-STATS)" |
---|
41 | (p "This procedure creates a procedure that reads in a sequence " |
---|
42 | "of numerical values, and performs statistical tests to tests the " |
---|
43 | "randomness of the elements of the sequence.") |
---|
44 | (p "By default, the sequence is expected to be a list; however, if a " |
---|
45 | "different sequential data structure is used (e.g. a stream), " |
---|
46 | "the optional arguments " (tt "CAR, CDR, NULL?") " may be used to " |
---|
47 | "specify procedures that perform the corresponding operations on " |
---|
48 | "the input sequence. ") |
---|
49 | (p "The returned procedure is of the form " (tt "SEQ -> RANDOM-STATS") |
---|
50 | ", where " (tt "SEQ") " is the sequence and the returned value is an " |
---|
51 | "alist with the following fields: " |
---|
52 | (symbol-table |
---|
53 | (describe "'ent" |
---|
54 | "entropy measure") |
---|
55 | (describe "'chisq" |
---|
56 | "the result of the Chi-Square test (see next section)") |
---|
57 | (describe "'pochisq" |
---|
58 | "the calculated probability of the Chi-Square test (see next section)") |
---|
59 | (describe "'mean" |
---|
60 | "the mean of the values in the input sequence") |
---|
61 | (describe "'min" |
---|
62 | "the minimum of the values in the input sequence") |
---|
63 | (describe "'max" |
---|
64 | "the maximum of the values in the input sequence") |
---|
65 | (describe "'montepi" |
---|
66 | "Monte Carlo value of pi (see next section)") |
---|
67 | (describe "'scc" |
---|
68 | "the serial correlation coefficient (see next section)")))) |
---|
69 | |
---|
70 | (procedure "format-random-stats:: OUT * RANDOM-STATS -> UNDEFINED" |
---|
71 | |
---|
72 | (p "Given an output port, and the value returned by the random test " |
---|
73 | "procedure, this procedure outputs a human readable interpretation " |
---|
74 | "of the test results."))) |
---|
75 | |
---|
76 | (subsection "Tests" |
---|
77 | |
---|
78 | (ol |
---|
79 | (li "Chi-Square Test" |
---|
80 | (p "In general, the Chi-Square distribution for an experiment with " |
---|
81 | (tt "k") " possible outcomes, performed " (tt "n") " times, " |
---|
82 | "in which " (tt "Y1, Y2,... Yk") " are the number of experiments " |
---|
83 | "which resulted in each possible outcome, and probabilities of each " |
---|
84 | "outcome " (tt "p1, p2,... pk")", is given as: " |
---|
85 | (img (@ (src "random-test.fig1.png") |
---|
86 | (alt "\\chi^{2} = \\sum_{1 <= i <= k}{\\frac{(Y_{i} - m p_{i})^{2}}{np_{i}}}")))) |
---|
87 | |
---|
88 | (p (tt "\\Chi^2") " will grow larger as the measured results diverge from those " |
---|
89 | "expected by pure chance. The probability " (tt "Q") " that a Chi-Square value " |
---|
90 | "calculated for an experiment with d degrees of freedom (where " (tt "d=k-1") |
---|
91 | ", one less the number of possible outcomes) is due to chance is: " |
---|
92 | (img (@ (src "random-test.fig2.png") |
---|
93 | (alt "Q(\\Chi^2,d) = [2^{d/2} * \\Gamma(d/2)]^{-1} * \\int_{\\Chi^2}^{\\infty}(t)^{d/2-1} * exp(-t/2) * dt")))) |
---|
94 | |
---|
95 | (p "Where Gamma is the generalization of the factorial function to real " |
---|
96 | "and complex arguments: " |
---|
97 | (img (@ (src "random-test.fig3.png") |
---|
98 | (alt "\\Gamma(x) = \\int_{0}^{\\infty} t^{x-1} * exp(-t) * dt")))) |
---|
99 | |
---|
100 | (p "There is no closed form solution for Q, so it must be evaluated " |
---|
101 | "numerically. Note that the probability calculated from the " |
---|
102 | (tt "\\Chi^2") " is an approximation which is valid only for large " |
---|
103 | "values of n, and is therefore only meaningful when calculated " |
---|
104 | "from a large number of independent experiments. ") |
---|
105 | |
---|
106 | (p "In this implementation, the Chi-Square distribution is calculated " |
---|
107 | "for the list of values given as argument to the random-test " |
---|
108 | "procedure and expressed as an absolute number and a percentage " |
---|
109 | "which indicates how frequently a truly random sequence would exceed " |
---|
110 | "the value calculated. ") |
---|
111 | |
---|
112 | (p "The percentage can be interpreted as the degree to which the sequence " |
---|
113 | "tested is suspected of being non-random. If the percentage is greater " |
---|
114 | "than 99% or less than 1%, the sequence is almost certainly not random. " |
---|
115 | "If the percentage is between 99% and 95% or between 1% and 5%, " |
---|
116 | "the sequence is suspect. Percentages between 90% and 95% and 5% and 10% " |
---|
117 | "indicate the sequence is almost suspect.")) |
---|
118 | |
---|
119 | (li "Arithmetic Mean" |
---|
120 | (p "This is simply the result of summing the all the values in the sequence " |
---|
121 | "and dividing by the sequence length. If the data are close to random, " |
---|
122 | "the mean should be about " (tt "(2^b - 1)/2") " where " (tt "b") |
---|
123 | " is the number of bits used to represent a value. " |
---|
124 | "If the mean departs from this value, the values are consistently high or low.")) |
---|
125 | |
---|
126 | (li "Monte Carlo Value for Pi" |
---|
127 | (p "Each pair of two values in the input sequence is used as X and Y coordinates " |
---|
128 | "within a square with side N (the length of the input sequence). " |
---|
129 | "If the distance of the randomly-generated point is less than the radius of a " |
---|
130 | "circle inscribed within the square, the pair of values is considered a hit. " |
---|
131 | "The percentage of hits can be used to calculate the value of pi: " (pre #<<EOF |
---|
132 | # points within circle 1/4 * pi * r^2 |
---|
133 | ---------------------- = -------------- = 1/4 * pi |
---|
134 | # points within square r^2 |
---|
135 | EOF |
---|
136 | ) |
---|
137 | "Therefore, " (pre #<<EOF |
---|
138 | # points within circle |
---|
139 | pi = 4 * ---------------------- |
---|
140 | # points within square |
---|
141 | EOF |
---|
142 | )) |
---|
143 | |
---|
144 | (p "For very long sequences (this approximation converges very slowly), " |
---|
145 | "the value will approach the correct value of Pi if the sequence is close to random.")) |
---|
146 | |
---|
147 | (li "Serial Correlation Coefficient" |
---|
148 | (p "This quantity measures the extent to which each value in the sequence " |
---|
149 | "depends upon the previous value. For random sequences, this metric " |
---|
150 | "(which can be positive or negative) will be close to zero. " |
---|
151 | "A non-random sequence such as a text file will yield a serial " |
---|
152 | "correlation coefficient of about 0.5. Predictable data will exhibit " |
---|
153 | "serial correlation coefficients approaching 1."))))) |
---|
154 | |
---|
155 | |
---|
156 | (examples (pre #<<EOF |
---|
157 | (require-extension random-test) |
---|
158 | (require-extension srfi-1) |
---|
159 | |
---|
160 | (randomize (current-milliseconds)) |
---|
161 | |
---|
162 | (define random-test (make-random-test)) |
---|
163 | (define lst (list-tabulate 1000000 (lambda (x) (random 1000000)))) |
---|
164 | |
---|
165 | (random-test lst 1000000) |
---|
166 | |
---|
167 | (define stats (random-test lst 1000000)) |
---|
168 | |
---|
169 | (format-random-stats stats) |
---|
170 | EOF |
---|
171 | )) |
---|
172 | |
---|
173 | (license |
---|
174 | "Copyright 2007 Ivan Raikov. |
---|
175 | |
---|
176 | This program is free software: you can redistribute it and/or modify |
---|
177 | it under the terms of the GNU General Public License as published by |
---|
178 | the Free Software Foundation, either version 3 of the License, or (at |
---|
179 | your option) any later version. |
---|
180 | |
---|
181 | This program is distributed in the hope that it will be useful, but |
---|
182 | WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
183 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
---|
184 | General Public License for more details. |
---|
185 | |
---|
186 | A full copy of the GPL license can be found at |
---|
187 | <http://www.gnu.org/licenses/>.")))) |
---|
188 | |
---|
189 | (if (eggdoc->html doc) (void)) |
---|