@@ -20,6 +20,8 @@ package org.apache.spark.util.random
2020import org .scalatest .FunSuite
2121import org .scalatest .matchers .ShouldMatchers
2222
23+ import org .apache .commons .math3 .stat .inference .ChiSquareTest
24+
2325import org .apache .spark .util .Utils .times
2426
2527import scala .language .reflectiveCalls
@@ -33,45 +35,30 @@ class XORShiftRandomSuite extends FunSuite with ShouldMatchers {
3335 }
3436
3537 /*
36- * This test is based on a chi-squared test for randomness. The values are hard-coded
37- * so as not to create Spark's dependency on apache.commons.math3 just to call one
38- * method for calculating the exact p-value for a given number of random numbers
39- * and bins. In case one would want to move to a full-fledged test based on
40- * apache.commons.math3, the relevant class is here:
41- * org.apache.commons.math3.stat.inference.ChiSquareTest
38+ * This test is based on a chi-squared test for randomness.
4239 */
4340 test (" XORShift generates valid random numbers" ) {
4441
4542 val f = fixture
4643
47- val numBins = 10
48- // create 10 bins
49- val bins = Array .fill( numBins)( 0 )
44+ val numBins = 10 // create 10 bins
45+ val numRows = 5 // create 5 rows
46+ val bins = Array .ofDim[ Long ](numRows, numBins)
5047
51- // populate bins based on modulus of the random number
52- times(f.hundMil) {bins(math.abs(f.xorRand.nextInt) % 10 ) += 1 }
48+ // populate bins based on modulus of the random number for each row
49+ for (r <- 0 to numRows- 1 ) {
50+ times(f.hundMil) {bins(r)(math.abs(f.xorRand.nextInt) % numBins) += 1 }
51+ }
5352
54- /* since the seed is deterministic, until the algorithm is changed, we know the result will be
55- * exactly this: Array(10004908, 9993136, 9994600, 10000744, 10000091, 10002474, 10002272,
56- * 10000790, 10002286, 9998699), so the test will never fail at the prespecified (5%)
57- * significance level. However, should the RNG implementation change, the test should still
58- * pass at the same significance level. The chi-squared test done in R gave the following
59- * results:
60- * > chisq.test(c(10004908, 9993136, 9994600, 10000744, 10000091, 10002474, 10002272,
61- * 10000790, 10002286, 9998699))
62- * Chi-squared test for given probabilities
63- * data: c(10004908, 9993136, 9994600, 10000744, 10000091, 10002474, 10002272, 10000790,
64- * 10002286, 9998699)
65- * X-squared = 11.975, df = 9, p-value = 0.2147
66- * Note that the p-value was ~0.22. The test will fail if alpha < 0.05, which for 100 million
67- * random numbers
68- * and 10 bins will happen at X-squared of ~16.9196. So, the test will fail if X-squared
69- * is greater than or equal to that number.
53+ /*
54+ * Perform the chi square test on the 5 rows of randomly generated numbers evenly divided into
55+ * 10 bins. chiSquareTest returns true iff the null hypothesis (that the classifications
56+ * represented by the counts in the columns of the input 2-way table are independent of the
57+ * rows) can be rejected with 100 * (1 - alpha) percent confidence, where alpha is prespecified
58+ * as 0.05
7059 */
71- val binSize = f.hundMil/ numBins
72- val xSquared = bins.map(x => math.pow((binSize - x), 2 )/ binSize).sum
73- xSquared should be < (16.9196 )
74-
60+ val chiTest = new ChiSquareTest
61+ assert(chiTest.chiSquareTest(bins, 0.05 ) === false )
7562 }
7663
7764 test (" XORShift with zero seed" ) {
0 commit comments