1717
1818package org .apache .spark .sql
1919
20- import java .sql .Timestamp
20+ import java .lang .Double .longBitsToDouble
21+ import java .lang .Float .intBitsToFloat
22+ import java .math .MathContext
2123
22- import org . scalacheck .{ Arbitrary , Gen }
24+ import scala . util . Random
2325
2426import org .apache .spark .sql .types ._
2527
/**
 * Random data generators for Spark SQL DataTypes. These generators do not generate uniformly random
 * values; instead, they're biased to return "interesting" values (such as maximum / minimum values)
 * with higher probability.
 */
object RandomDataGenerator {

  /**
   * The conditional probability of a non-null value being drawn from a set of "interesting" values
   * instead of being chosen uniformly at random.
   */
  private val PROBABILITY_OF_INTERESTING_VALUE: Float = 0.5f

  /**
   * The probability of the generated value being null
   */
  private val PROBABILITY_OF_NULL: Float = 0.1f

  /** Exclusive upper bound on the length of generated strings and binary arrays. */
  private val MAX_STR_LEN: Int = 1024

  /** Exclusive upper bound on the number of elements in generated arrays. */
  private val MAX_ARR_SIZE: Int = 128

  /** Exclusive upper bound on the number of key/value pairs drawn for generated maps. */
  private val MAX_MAP_SIZE: Int = 128

  /**
   * Helper function for constructing a biased random number generator which returns "interesting"
   * values with a higher probability.
   *
   * @param rand the random number generator that drives every draw
   * @param uniformRand produces a value from `rand` when no interesting value is selected
   * @param interestingValues the values to favour; when empty, every value comes from `uniformRand`
   * @return a generator function, always wrapped in `Some` so it can be returned directly from
   *         [[forType]]
   */
  private def randomNumeric[T](
      rand: Random,
      uniformRand: Random => T,
      interestingValues: Seq[T]): Some[() => T] = {
    // Indexed copy so that picking a random element is a constant-time lookup.
    val candidates = interestingValues.toIndexedSeq
    val f = () => {
      // nextFloat() lies in [0.0, 1.0), so a strict `<` fires with exactly the stated probability.
      // The emptiness check guards against nextInt(0), which would throw.
      if (candidates.nonEmpty && rand.nextFloat() < PROBABILITY_OF_INTERESTING_VALUE) {
        candidates(rand.nextInt(candidates.length))
      } else {
        uniformRand(rand)
      }
    }
    Some(f)
  }

  /**
   * Returns a function which generates random values for the given [[DataType]], or `None` if no
   * random data generator is defined for that data type. The generated values use external
   * (non-Catalyst) representations: for example, the generator for [[DateType]] returns
   * `java.sql.Date` instances, the generator for [[TimestampType]] returns `java.sql.Timestamp`
   * instances and the generator for [[StructType]] returns a [[Row]].
   *
   * Generators for nested types (array elements, map keys / values, struct fields) are seeded from
   * this generator's own random number generator, so supplying a `seed` fixes the whole sequence.
   *
   * @param dataType the type to generate values for
   * @param nullable whether null values should be generated
   * @param seed an optional seed for the random number generator
   * @return a function which can be called to generate random values.
   */
  def forType(
      dataType: DataType,
      nullable: Boolean = true,
      seed: Option[Long] = None): Option[() => Any] = {
    val rand = new Random()
    seed.foreach(rand.setSeed)

    val valueGenerator: Option[() => Any] = dataType match {
      case StringType => Some(() => rand.nextString(rand.nextInt(MAX_STR_LEN)))
      case BinaryType => Some(() => {
        val arr = new Array[Byte](rand.nextInt(MAX_STR_LEN))
        rand.nextBytes(arr)
        arr
      })
      case BooleanType => Some(() => rand.nextBoolean())
      // NOTE(review): the Int is interpreted as milliseconds since the epoch, so generated dates
      // fall within roughly 25 days of 1970-01-01 -- confirm this narrow range is intended.
      case DateType => Some(() => new java.sql.Date(rand.nextInt()))
      case TimestampType => Some(() => new java.sql.Timestamp(rand.nextLong()))
      case DecimalType.Unlimited => Some(
        () => BigDecimal.apply(rand.nextLong(), rand.nextInt(), MathContext.UNLIMITED))
      // Floating point values are built from random bit patterns, so the "uniform" branch can
      // itself produce NaNs, infinities and subnormal numbers.
      case DoubleType => randomNumeric[Double](
        rand, r => longBitsToDouble(r.nextLong()), Seq(Double.MinValue, Double.MinPositiveValue,
          Double.MaxValue, Double.PositiveInfinity, Double.NegativeInfinity, Double.NaN, 0.0))
      case FloatType => randomNumeric[Float](
        rand, r => intBitsToFloat(r.nextInt()), Seq(Float.MinValue, Float.MinPositiveValue,
          Float.MaxValue, Float.PositiveInfinity, Float.NegativeInfinity, Float.NaN, 0.0f))
      case ByteType => randomNumeric[Byte](
        rand, _.nextInt().toByte, Seq(Byte.MinValue, Byte.MaxValue, 0.toByte))
      case IntegerType => randomNumeric[Int](
        rand, _.nextInt(), Seq(Int.MinValue, Int.MaxValue, 0))
      case LongType => randomNumeric[Long](
        rand, _.nextLong(), Seq(Long.MinValue, Long.MaxValue, 0L))
      case ShortType => randomNumeric[Short](
        rand, _.nextInt().toShort, Seq(Short.MinValue, Short.MaxValue, 0.toShort))
      case NullType => Some(() => null)
      case ArrayType(elementType, containsNull) => {
        forType(elementType, nullable = containsNull, seed = Some(rand.nextLong())).map {
          elementGenerator => () => Array.fill(rand.nextInt(MAX_ARR_SIZE))(elementGenerator())
        }
      }
      case MapType(keyType, valueType, valueContainsNull) => {
        for (
          keyGenerator <- forType(keyType, nullable = false, seed = Some(rand.nextLong()));
          valueGenerator <-
            forType(valueType, nullable = valueContainsNull, seed = Some(rand.nextLong()))
        ) yield {
          () => {
            // Duplicate keys collapse in toMap, so the map may hold fewer entries than were drawn.
            Seq.fill(rand.nextInt(MAX_MAP_SIZE))((keyGenerator(), valueGenerator())).toMap
          }
        }
      }
      case StructType(fields) => {
        val maybeFieldGenerators: Seq[Option[() => Any]] = fields.map { field =>
          forType(field.dataType, nullable = field.nullable, seed = Some(rand.nextLong()))
        }
        // A struct is only supported when every one of its fields is supported.
        if (maybeFieldGenerators.forall(_.isDefined)) {
          val fieldGenerators: Seq[() => Any] = maybeFieldGenerators.flatten
          Some(() => Row.fromSeq(fieldGenerators.map(_.apply())))
        } else {
          None
        }
      }
      case unsupportedType => None
    }
    // Handle nullability by wrapping the non-null value generator:
    valueGenerator.map { nonNullGenerator =>
      if (nullable) {
        () => {
          // nextFloat() lies in [0.0, 1.0); strict `<` yields null with the stated probability.
          if (rand.nextFloat() < PROBABILITY_OF_NULL) {
            null
          } else {
            nonNullGenerator()
          }
        }
      } else {
        nonNullGenerator
      }
    }
  }
}
0 commit comments