From d2b4a4a9a2139b1a6c2be5d1f1aa3d98a6c9ed99 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Wed, 1 Jul 2015 20:18:05 -0700 Subject: [PATCH 1/8] Add random data generator test utilities to Spark SQL. --- .../spark/sql/test/DataTypeTestUtils.scala | 59 +++++++ .../spark/sql/test/RandomDataGenerator.scala | 151 ++++++++++++++++++ .../sql/test/RandomDataGeneratorSuite.scala | 77 +++++++++ 3 files changed, 287 insertions(+) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/test/DataTypeTestUtils.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/test/RandomDataGenerator.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/test/RandomDataGeneratorSuite.scala diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/DataTypeTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/DataTypeTestUtils.scala new file mode 100644 index 000000000000..d862eb7293d6 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/DataTypeTestUtils.scala @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.test + +import org.apache.spark.sql.types._ + +/** + * Utility functions for working with DataTypes in tests. 
+ */ +object DataTypeTestUtils { + + /** + * Instances of all [[IntegralType]]s. + */ + val integralType: Set[IntegralType] = Set( + ByteType, ShortType, IntegerType, LongType + ) + + /** + * Instances of all [[FractionalType]]s, including both fixed- and unlimited-precision + * decimal types. + */ + val fractionalTypes: Set[FractionalType] = Set( + DecimalType(precisionInfo = None), + DecimalType(2, 1), + DoubleType, + FloatType + ) + + /** + * Instances of all [[NumericType]]s. + */ + val numericTypes: Set[NumericType] = integralType ++ fractionalTypes + + /** + * Instances of all [[AtomicType]]s. + */ + val atomicTypes: Set[DataType] = Set(BinaryType, StringType, TimestampType) ++ numericTypes + + /** + * Instances of [[ArrayType]] for all [[AtomicType]]s. Arrays of these types may contain null. + */ + val atomicArrayTypes: Set[ArrayType] = atomicTypes.map(ArrayType(_, containsNull = true)) +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/RandomDataGenerator.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/RandomDataGenerator.scala new file mode 100644 index 000000000000..6ac2ba155655 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/RandomDataGenerator.scala @@ -0,0 +1,151 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.test + +import org.apache.spark.sql.Row + +import scala.util.Random + +import org.apache.spark.sql.types._ + +/** + * Random data generators for Spark SQL DataTypes. These generators do not generate uniformly random + * values; instead, they're biased to return "interesting" values (such as maximum / minimum values) + * with higher probability. + */ +object RandomDataGenerator { + + /** + * The conditional probability of a non-null value being drawn from a set of "interesting" values + * instead of being chosen uniformly at random. + */ + private val PROBABILITY_OF_INTERESTING_VALUE: Float = 0.25f + + /** + * The probability of the generated value being null + */ + private val PROBABILITY_OF_NULL: Float = 0.1f + + private val MAX_STR_LEN: Int = 1024 + private val MAX_ARR_SIZE: Int = 128 + private val MAX_MAP_SIZE: Int = 128 + + /** + * Helper function for constructing a biased random number generator which returns "interesting" + * values with a higher probability. + */ + private def randomNumeric[T]( + rand: Random, + uniformRand: Random => T, + interestingValues: Seq[T]): Some[() => T] = { + val f = () => { + if (rand.nextFloat() <= PROBABILITY_OF_INTERESTING_VALUE) { + interestingValues(rand.nextInt(interestingValues.length)) + } else { + uniformRand(rand) + } + } + Some(f) + } + + /** + * Returns a function which generates random values for the given [[DataType]], or `None` if no + * random data generator is defined for that data type. The generated values will use an external + * representation of the data type; for example, the random generator for [[DateType]] will return + * instances of [[java.sql.Date]] and the generator for [[StructType]] will return a + * [[org.apache.spark.sql.Row]].
+ * + * @param dataType the type to generate values for + * @param nullable whether null values should be generated + * @param seed an optional seed for the random number generator + * @return a function which can be called to generate random values. + */ + def forType( + dataType: DataType, + nullable: Boolean = true, + seed: Option[Long] = None): Option[() => Any] = { + val rand = new Random() + seed.foreach(rand.setSeed) + + val valueGenerator: Option[() => Any] = dataType match { + case StringType => Some(() => rand.nextString(rand.nextInt(MAX_STR_LEN))) + case BinaryType => Some(() => { + val arr = new Array[Byte](rand.nextInt(MAX_STR_LEN)) + rand.nextBytes(arr) + arr + }) + case BooleanType => Some(() => rand.nextBoolean()) + case DateType => Some(() => new java.sql.Date(rand.nextInt(Int.MaxValue))) + case DoubleType => randomNumeric[Double]( + rand, _.nextDouble(), Seq(Double.MinValue, Double.MinPositiveValue, Double.MaxValue, 0.0)) + case FloatType => randomNumeric[Float]( + rand, _.nextFloat(), Seq(Float.MinValue, Float.MinPositiveValue, Float.MaxValue, 0.0f)) + case ByteType => randomNumeric[Byte]( + rand, _.nextInt().toByte, Seq(Byte.MinValue, Byte.MaxValue, 0.toByte)) + case IntegerType => randomNumeric[Int]( + rand, _.nextInt(), Seq(Int.MinValue, Int.MaxValue, 0)) + case LongType => randomNumeric[Long]( + rand, _.nextLong(), Seq(Long.MinValue, Long.MaxValue, 0L)) + case ShortType => randomNumeric[Short]( + rand, _.nextInt().toShort, Seq(Short.MinValue, Short.MaxValue, 0.toShort)) + case NullType => Some(() => null) + case ArrayType(elementType, containsNull) => { + forType(elementType, nullable = containsNull, seed = Some(rand.nextLong())).map { + elementGenerator => () => Array.fill(rand.nextInt(MAX_ARR_SIZE))(elementGenerator()) + } + } + case MapType(keyType, valueType, valueContainsNull) => { + for ( + keyGenerator <- forType(keyType, nullable = false, seed = Some(rand.nextLong())); + valueGenerator <- + forType(valueType, nullable = 
valueContainsNull, seed = Some(rand.nextLong())) + ) yield { + () => { + Seq.fill(rand.nextInt(MAX_MAP_SIZE))((keyGenerator(), valueGenerator())).toMap + } + } + } + case StructType(fields) => { + val maybeFieldGenerators: Seq[Option[() => Any]] = fields.map { field => + forType(field.dataType, nullable = field.nullable, seed = Some(rand.nextLong())) + } + if (maybeFieldGenerators.forall(_.isDefined)) { + val fieldGenerators: Seq[() => Any] = maybeFieldGenerators.map(_.get) + Some(() => Row.fromSeq(fieldGenerators.map(_.apply()))) + } else { + None + } + } + case unsupportedType => None + } + // Handle nullability by wrapping the non-null value generator: + valueGenerator.map { valueGenerator => + if (nullable) { + () => { + if (rand.nextFloat() <= PROBABILITY_OF_NULL) { + null + } else { + valueGenerator() + } + } + } else { + valueGenerator + } + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/RandomDataGeneratorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/RandomDataGeneratorSuite.scala new file mode 100644 index 000000000000..fb4ed9028c2c --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/RandomDataGeneratorSuite.scala @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.test + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.CatalystTypeConverters +import org.apache.spark.sql.types.{StructField, StructType, MapType, DataType} + +/** + * Tests of [[RandomDataGenerator]]. + */ +class RandomDataGeneratorSuite extends SparkFunSuite { + + /** + * Tests random data generation for the given type by using it to generate random values then + * converting those values into their Catalyst equivalents using CatalystTypeConverters. + */ + def testRandomDataGeneration(dataType: DataType, nullable: Boolean = true): Unit = { + val toCatalyst = CatalystTypeConverters.createToCatalystConverter(dataType) + RandomDataGenerator.forType(dataType, nullable, Some(42L)).foreach { generator => + for (_ <- 1 to 10) { + val generatedValue = generator() + val convertedValue = toCatalyst(generatedValue) + if (!nullable) { + assert(convertedValue !== null) + } + } + } + + } + + // Basic types: + + (DataTypeTestUtils.atomicTypes ++ DataTypeTestUtils.atomicArrayTypes).foreach { dataType => + test(s"$dataType") { + testRandomDataGeneration(dataType) + } + } + + // Complex types: + + for ( + keyType <- DataTypeTestUtils.atomicTypes; + valueType <- DataTypeTestUtils.atomicTypes + ) { + val mapType = MapType(keyType, valueType) + test(s"$mapType") { + testRandomDataGeneration(mapType) + } + } + + for ( + colOneType <- DataTypeTestUtils.atomicTypes; + colTwoType <- DataTypeTestUtils.atomicTypes + ) { + val structType = StructType(StructField("a", colOneType) :: StructField("b", colTwoType) :: Nil) + test(s"$structType") { + testRandomDataGeneration(structType) + } + } + +} From ab76cbd89bf800d590b7833f5a25c62df4ec2a95 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Wed, 1 Jul 2015 21:37:38 -0700 Subject: [PATCH 2/8] Move code to Catalyst package. 
--- .../scala/org/apache/spark/sql}/RandomDataGenerator.scala | 6 ++---- .../org/apache/spark/sql}/RandomDataGeneratorSuite.scala | 4 ++-- .../org/apache/spark/sql/types}/DataTypeTestUtils.scala | 4 +--- 3 files changed, 5 insertions(+), 9 deletions(-) rename sql/{core/src/test/scala/org/apache/spark/sql/test => catalyst/src/test/scala/org/apache/spark/sql}/RandomDataGenerator.scala (98%) rename sql/{core/src/test/scala/org/apache/spark/sql/test => catalyst/src/test/scala/org/apache/spark/sql}/RandomDataGeneratorSuite.scala (95%) rename sql/{core/src/test/scala/org/apache/spark/sql/test => catalyst/src/test/scala/org/apache/spark/sql/types}/DataTypeTestUtils.scala (96%) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/RandomDataGenerator.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala similarity index 98% rename from sql/core/src/test/scala/org/apache/spark/sql/test/RandomDataGenerator.scala rename to sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala index 6ac2ba155655..f167557be818 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/RandomDataGenerator.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala @@ -15,14 +15,12 @@ * limitations under the License. */ -package org.apache.spark.sql.test +package org.apache.spark.sql -import org.apache.spark.sql.Row +import org.apache.spark.sql.types._ import scala.util.Random -import org.apache.spark.sql.types._ - /** * Random data generators for Spark SQL DataTypes. 
These generators do not generate uniformly random * values; instead, they're biased to return "interesting" values (such as maximum / minimum values) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/RandomDataGeneratorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGeneratorSuite.scala similarity index 95% rename from sql/core/src/test/scala/org/apache/spark/sql/test/RandomDataGeneratorSuite.scala rename to sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGeneratorSuite.scala index fb4ed9028c2c..ea70fe03eb91 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/RandomDataGeneratorSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGeneratorSuite.scala @@ -15,11 +15,11 @@ * limitations under the License. */ -package org.apache.spark.sql.test +package org.apache.spark.sql import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.CatalystTypeConverters -import org.apache.spark.sql.types.{StructField, StructType, MapType, DataType} +import org.apache.spark.sql.types._ /** * Tests of [[RandomDataGenerator]]. diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/DataTypeTestUtils.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeTestUtils.scala similarity index 96% rename from sql/core/src/test/scala/org/apache/spark/sql/test/DataTypeTestUtils.scala rename to sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeTestUtils.scala index d862eb7293d6..0b7ed54c681e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/DataTypeTestUtils.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeTestUtils.scala @@ -15,9 +15,7 @@ * limitations under the License. */ -package org.apache.spark.sql.test - -import org.apache.spark.sql.types._ +package org.apache.spark.sql.types /** * Utility functions for working with DataTypes in tests. 
From 5acdd5ccf36487ba49815e8e0429f4c99558d427 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Wed, 1 Jul 2015 22:15:13 -0700 Subject: [PATCH 3/8] Infinity and NaN are interesting. --- .../scala/org/apache/spark/sql/RandomDataGenerator.scala | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala index f167557be818..cd4ffdfd4517 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala @@ -32,7 +32,7 @@ object RandomDataGenerator { * The conditional probability of a non-null value being drawn from a set of "interesting" values * instead of being chosen uniformly at random. */ - private val PROBABILITY_OF_INTERESTING_VALUE: Float = 0.25f + private val PROBABILITY_OF_INTERESTING_VALUE: Float = 0.5f /** * The probability of the generated value being null @@ -90,9 +90,11 @@ object RandomDataGenerator { case BooleanType => Some(() => rand.nextBoolean()) case DateType => Some(() => new java.sql.Date(rand.nextInt(Int.MaxValue))) case DoubleType => randomNumeric[Double]( - rand, _.nextDouble(), Seq(Double.MinValue, Double.MinPositiveValue, Double.MaxValue, 0.0)) + rand, _.nextDouble(), Seq(Double.MinValue, Double.MinPositiveValue, Double.MaxValue, + Double.PositiveInfinity, Double.NegativeInfinity, Double.NaN, 0.0)) case FloatType => randomNumeric[Float]( - rand, _.nextFloat(), Seq(Float.MinValue, Float.MinPositiveValue, Float.MaxValue, 0.0f)) + rand, _.nextFloat(), Seq(Float.MinValue, Float.MinPositiveValue, Float.MaxValue, + Float.PositiveInfinity, Float.NegativeInfinity, Float.NaN, 0.0f)) case ByteType => randomNumeric[Byte]( rand, _.nextInt().toByte, Seq(Byte.MinValue, Byte.MaxValue, 0.toByte)) case IntegerType => randomNumeric[Int]( From b55875a05e4805cfdf2c3468a6cd50eec6a30578 Mon Sep 17 
00:00:00 2001 From: Josh Rosen Date: Wed, 1 Jul 2015 22:23:55 -0700 Subject: [PATCH 4/8] Generate doubles and floats over entire possible range. --- .../org/apache/spark/sql/RandomDataGenerator.scala | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala index cd4ffdfd4517..26437c45eb41 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala @@ -17,10 +17,13 @@ package org.apache.spark.sql -import org.apache.spark.sql.types._ +import java.lang.Double.longBitsToDouble +import java.lang.Float.intBitsToFloat import scala.util.Random +import org.apache.spark.sql.types._ + /** * Random data generators for Spark SQL DataTypes. These generators do not generate uniformly random * values; instead, they're biased to return "interesting" values (such as maximum / minimum values) @@ -90,11 +93,11 @@ object RandomDataGenerator { case BooleanType => Some(() => rand.nextBoolean()) case DateType => Some(() => new java.sql.Date(rand.nextInt(Int.MaxValue))) case DoubleType => randomNumeric[Double]( - rand, _.nextDouble(), Seq(Double.MinValue, Double.MinPositiveValue, Double.MaxValue, - Double.PositiveInfinity, Double.NegativeInfinity, Double.NaN, 0.0)) + rand, r => longBitsToDouble(r.nextLong()), Seq(Double.MinValue, Double.MinPositiveValue, + Double.MaxValue, Double.PositiveInfinity, Double.NegativeInfinity, Double.NaN, 0.0)) case FloatType => randomNumeric[Float]( - rand, _.nextFloat(), Seq(Float.MinValue, Float.MinPositiveValue, Float.MaxValue, - Float.PositiveInfinity, Float.NegativeInfinity, Float.NaN, 0.0f)) + rand, r => intBitsToFloat(r.nextInt()), Seq(Float.MinValue, Float.MinPositiveValue, + Float.MaxValue, Float.PositiveInfinity, Float.NegativeInfinity, Float.NaN, 0.0f)) case ByteType => 
randomNumeric[Byte]( rand, _.nextInt().toByte, Seq(Byte.MinValue, Byte.MaxValue, 0.toByte)) case IntegerType => randomNumeric[Int]( From 0c209051777620b62a6cab4b18673dd145ca91c8 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Thu, 2 Jul 2015 14:58:30 -0700 Subject: [PATCH 5/8] Initial attempt at using ScalaCheck. --- .../spark/sql/RandomDataGenerator.scala | 127 +++++------------- .../spark/sql/RandomDataGeneratorSuite.scala | 58 +++++--- 2 files changed, 74 insertions(+), 111 deletions(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala index 26437c45eb41..9479c9b1a078 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala @@ -17,12 +17,10 @@ package org.apache.spark.sql -import java.lang.Double.longBitsToDouble -import java.lang.Float.intBitsToFloat - -import scala.util.Random +import java.sql.Timestamp import org.apache.spark.sql.types._ +import org.scalacheck.{Arbitrary, Gen} /** * Random data generators for Spark SQL DataTypes. These generators do not generate uniformly random @@ -31,39 +29,6 @@ import org.apache.spark.sql.types._ */ object RandomDataGenerator { - /** - * The conditional probability of a non-null value being drawn from a set of "interesting" values - * instead of being chosen uniformly at random. - */ - private val PROBABILITY_OF_INTERESTING_VALUE: Float = 0.5f - - /** - * The probability of the generated value being null - */ - private val PROBABILITY_OF_NULL: Float = 0.1f - - private val MAX_STR_LEN: Int = 1024 - private val MAX_ARR_SIZE: Int = 128 - private val MAX_MAP_SIZE: Int = 128 - - /** - * Helper function for constructing a biased random number generator which returns "interesting" - * values with a higher probability. 
- */ - private def randomNumeric[T]( - rand: Random, - uniformRand: Random => T, - interestingValues: Seq[T]): Some[() => T] = { - val f = () => { - if (rand.nextFloat() <= PROBABILITY_OF_INTERESTING_VALUE) { - interestingValues(rand.nextInt(interestingValues.length)) - } else { - uniformRand(rand) - } - } - Some(f) - } - /** * Returns a function which generates random values for the given [[DataType]], or `None` if no * random data generator is defined for that data type. The generated values will use an external @@ -73,82 +38,58 @@ object RandomDataGenerator { * * @param dataType the type to generate values for * @param nullable whether null values should be generated - * @param seed an optional seed for the random number generator - * @return a function which can be called to generate random values. + * @return a ScalaCheck [[Gen]] which can be used to produce random values. */ def forType( dataType: DataType, - nullable: Boolean = true, - seed: Option[Long] = None): Option[() => Any] = { - val rand = new Random() - seed.foreach(rand.setSeed) - - val valueGenerator: Option[() => Any] = dataType match { - case StringType => Some(() => rand.nextString(rand.nextInt(MAX_STR_LEN))) - case BinaryType => Some(() => { - val arr = new Array[Byte](rand.nextInt(MAX_STR_LEN)) - rand.nextBytes(arr) - arr - }) - case BooleanType => Some(() => rand.nextBoolean()) - case DateType => Some(() => new java.sql.Date(rand.nextInt(Int.MaxValue))) - case DoubleType => randomNumeric[Double]( - rand, r => longBitsToDouble(r.nextLong()), Seq(Double.MinValue, Double.MinPositiveValue, - Double.MaxValue, Double.PositiveInfinity, Double.NegativeInfinity, Double.NaN, 0.0)) - case FloatType => randomNumeric[Float]( - rand, r => intBitsToFloat(r.nextInt()), Seq(Float.MinValue, Float.MinPositiveValue, - Float.MaxValue, Float.PositiveInfinity, Float.NegativeInfinity, Float.NaN, 0.0f)) - case ByteType => randomNumeric[Byte]( - rand, _.nextInt().toByte, Seq(Byte.MinValue, Byte.MaxValue, 0.toByte)) - 
case IntegerType => randomNumeric[Int]( - rand, _.nextInt(), Seq(Int.MinValue, Int.MaxValue, 0)) - case LongType => randomNumeric[Long]( - rand, _.nextLong(), Seq(Long.MinValue, Long.MaxValue, 0L)) - case ShortType => randomNumeric[Short]( - rand, _.nextInt().toShort, Seq(Short.MinValue, Short.MaxValue, 0.toShort)) - case NullType => Some(() => null) + nullable: Boolean = true): Option[Gen[Any]] = { + val valueGenerator: Option[Gen[Any]] = dataType match { + case StringType => Some(Arbitrary.arbitrary[String]) + case BinaryType => Some(Gen.listOf(Arbitrary.arbitrary[Byte]).map(_.toArray)) + case BooleanType => Some(Arbitrary.arbitrary[Boolean]) + case DateType => Some(Arbitrary.arbitrary[Int].suchThat(_ >= 0).map(new java.sql.Date(_))) + case DoubleType => Some(Arbitrary.arbitrary[Double]) + case FloatType => Some(Arbitrary.arbitrary[Float]) + case ByteType => Some(Arbitrary.arbitrary[Byte]) + case IntegerType => Some(Arbitrary.arbitrary[Int]) + case LongType => Some(Arbitrary.arbitrary[Long]) + case ShortType => Some(Arbitrary.arbitrary[Short]) + case NullType => Some(Gen.const[Any](null)) + case TimestampType => Some(Arbitrary.arbitrary[Long].suchThat(_ >= 0).map(new Timestamp(_))) + case DecimalType.Unlimited => Some(Arbitrary.arbitrary[BigDecimal]) case ArrayType(elementType, containsNull) => { - forType(elementType, nullable = containsNull, seed = Some(rand.nextLong())).map { - elementGenerator => () => Array.fill(rand.nextInt(MAX_ARR_SIZE))(elementGenerator()) + forType(elementType, nullable = containsNull).map { elementGen => + Gen.listOf(elementGen).map(_.toArray) } } case MapType(keyType, valueType, valueContainsNull) => { for ( - keyGenerator <- forType(keyType, nullable = false, seed = Some(rand.nextLong())); - valueGenerator <- - forType(valueType, nullable = valueContainsNull, seed = Some(rand.nextLong())) + keyGenerator <- forType(keyType, nullable = false); + valueGenerator <- forType(valueType, nullable = valueContainsNull) + // Scala's 
BigDecimal.hashCode can lead to OutOfMemoryError on Scala 2.10 (see SI-6173) + // and Spark can hit NumberFormatException errors converting certain BigDecimals + // (SPARK-8802). For these reasons, we don't support generation of maps with decimal keys. + if !keyType.isInstanceOf[DecimalType] ) yield { - () => { - Seq.fill(rand.nextInt(MAX_MAP_SIZE))((keyGenerator(), valueGenerator())).toMap - } + Gen.listOf(Gen.zip(keyGenerator, valueGenerator)).map(_.toMap) } } case StructType(fields) => { - val maybeFieldGenerators: Seq[Option[() => Any]] = fields.map { field => - forType(field.dataType, nullable = field.nullable, seed = Some(rand.nextLong())) + val maybeFieldGenerators: Seq[Option[Gen[Any]]] = fields.map { field => + forType(field.dataType, nullable = field.nullable) } if (maybeFieldGenerators.forall(_.isDefined)) { - val fieldGenerators: Seq[() => Any] = maybeFieldGenerators.map(_.get) - Some(() => Row.fromSeq(fieldGenerators.map(_.apply()))) + Some(Gen.sequence[Seq[Any], Any](maybeFieldGenerators.flatten).map(vs => Row.fromSeq(vs))) } else { None } } case unsupportedType => None } - // Handle nullability by wrapping the non-null value generator: - valueGenerator.map { valueGenerator => - if (nullable) { - () => { - if (rand.nextFloat() <= PROBABILITY_OF_NULL) { - null - } else { - valueGenerator() - } - } - } else { - valueGenerator - } + if (nullable) { + valueGenerator.map(Gen.oneOf(_, Gen.const[Any](null))) + } else { + valueGenerator } } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGeneratorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGeneratorSuite.scala index ea70fe03eb91..dc07a732cdb1 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGeneratorSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGeneratorSuite.scala @@ -17,6 +17,9 @@ package org.apache.spark.sql +import org.scalacheck.Prop.{exists, forAll, secure} +import org.scalatest.prop.Checkers + 
import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.types._ @@ -24,7 +27,7 @@ import org.apache.spark.sql.types._ /** * Tests of [[RandomDataGenerator]]. */ -class RandomDataGeneratorSuite extends SparkFunSuite { +class RandomDataGeneratorSuite extends SparkFunSuite with Checkers { /** * Tests random data generation for the given type by using it to generate random values then @@ -32,31 +35,50 @@ class RandomDataGeneratorSuite extends SparkFunSuite { */ def testRandomDataGeneration(dataType: DataType, nullable: Boolean = true): Unit = { val toCatalyst = CatalystTypeConverters.createToCatalystConverter(dataType) - RandomDataGenerator.forType(dataType, nullable, Some(42L)).foreach { generator => - for (_ <- 1 to 10) { - val generatedValue = generator() - val convertedValue = toCatalyst(generatedValue) - if (!nullable) { - assert(convertedValue !== null) - } - } + val generator = RandomDataGenerator.forType(dataType, nullable).getOrElse { + fail(s"Random data generator was not defined for $dataType") } - + if (nullable) { + check(exists(generator) { _ == null }) + } + if (!nullable) { + check(forAll(generator) { _ != null }) + } + check(secure(forAll(generator) { v => { toCatalyst(v); true } })) } // Basic types: - - (DataTypeTestUtils.atomicTypes ++ DataTypeTestUtils.atomicArrayTypes).foreach { dataType => - test(s"$dataType") { + for ( + dataType <- DataTypeTestUtils.atomicTypes; + nullable <- Seq(true, false) + if !dataType.isInstanceOf[DecimalType] || + dataType.asInstanceOf[DecimalType].precisionInfo.isEmpty + ) { + test(s"$dataType (nullable=$nullable)") { testRandomDataGeneration(dataType) } } - // Complex types: + for ( + arrayType <- DataTypeTestUtils.atomicArrayTypes + if RandomDataGenerator.forType(arrayType.elementType, arrayType.containsNull).isDefined + ) { + test(s"$arrayType") { + testRandomDataGeneration(arrayType) + } + } + + val atomicTypesWithDataGenerators = + 
DataTypeTestUtils.atomicTypes.filter(RandomDataGenerator.forType(_).isDefined) + // Complex types: for ( - keyType <- DataTypeTestUtils.atomicTypes; - valueType <- DataTypeTestUtils.atomicTypes + keyType <- atomicTypesWithDataGenerators; + valueType <- atomicTypesWithDataGenerators + // Scala's BigDecimal.hashCode can lead to OutOfMemoryError on Scala 2.10 (see SI-6173) and + // Spark can hit NumberFormatException errors when converting certain BigDecimals (SPARK-8802). + // For these reasons, we don't support generation of maps with decimal keys. + if !keyType.isInstanceOf[DecimalType] ) { val mapType = MapType(keyType, valueType) test(s"$mapType") { @@ -65,8 +87,8 @@ class RandomDataGeneratorSuite extends SparkFunSuite { } for ( - colOneType <- DataTypeTestUtils.atomicTypes; - colTwoType <- DataTypeTestUtils.atomicTypes + colOneType <- atomicTypesWithDataGenerators; + colTwoType <- atomicTypesWithDataGenerators ) { val structType = StructType(StructField("a", colOneType) :: StructField("b", colTwoType) :: Nil) test(s"$structType") { From 89d86b1f65f5db7044280d916b07d1622b4662a0 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Thu, 2 Jul 2015 17:09:15 -0700 Subject: [PATCH 6/8] Bump ScalaCheck version. 
--- pom.xml | 2 +- .../scala/org/apache/spark/sql/RandomDataGenerator.scala | 7 +++---- .../org/apache/spark/sql/types/DataTypeTestUtils.scala | 8 +++++++- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/pom.xml b/pom.xml index 211da9ee74a3..6a41aa4f6510 100644 --- a/pom.xml +++ b/pom.xml @@ -689,7 +689,7 @@ org.scalacheck scalacheck_${scala.binary.version} - 1.11.3 + 1.12.4 test diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala index 9479c9b1a078..ccf171543f8e 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala @@ -19,13 +19,12 @@ package org.apache.spark.sql import java.sql.Timestamp -import org.apache.spark.sql.types._ import org.scalacheck.{Arbitrary, Gen} +import org.apache.spark.sql.types._ + /** - * Random data generators for Spark SQL DataTypes. These generators do not generate uniformly random - * values; instead, they're biased to return "interesting" values (such as maximum / minimum values) - * with higher probability. + * ScalaCheck random data generators for Spark SQL DataTypes. */ object RandomDataGenerator { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeTestUtils.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeTestUtils.scala index 0b7ed54c681e..32632b5d6e34 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeTestUtils.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeTestUtils.scala @@ -48,7 +48,13 @@ object DataTypeTestUtils { /** * Instances of all [[AtomicType]]s. 
*/ - val atomicTypes: Set[DataType] = Set(BinaryType, StringType, TimestampType) ++ numericTypes + val atomicTypes: Set[DataType] = numericTypes ++ Set( + BinaryType, + BooleanType, + DateType, + StringType, + TimestampType + ) /** * Instances of [[ArrayType]] for all [[AtomicType]]s. Arrays of these types may contain null. From e0d7d49023f7cd63d963f6147ce5db6b6bd94f99 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Thu, 2 Jul 2015 17:10:17 -0700 Subject: [PATCH 7/8] Bump ScalaCheck version in LICENSE --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index f9e412cade34..2a41ea294e46 100644 --- a/LICENSE +++ b/LICENSE @@ -922,7 +922,7 @@ The following components are provided under a BSD-style license. See project lin (BSD-like) Scala Compiler (org.scala-lang:scala-reflect:2.10.4 - http://www.scala-lang.org/) (BSD-like) Scala Library (org.scala-lang:scala-library:2.10.4 - http://www.scala-lang.org/) (BSD-like) Scalap (org.scala-lang:scalap:2.10.4 - http://www.scala-lang.org/) - (BSD-style) scalacheck (org.scalacheck:scalacheck_2.10:1.10.0 - http://www.scalacheck.org) + (BSD-style) scalacheck (org.scalacheck:scalacheck_2.10:1.12.4 - http://www.scalacheck.org) (BSD-style) spire (org.spire-math:spire_2.10:0.7.1 - http://spire-math.org) (BSD-style) spire-macros (org.spire-math:spire-macros_2.10:0.7.1 - http://spire-math.org) (New BSD License) Kryo (com.esotericsoftware.kryo:kryo:2.21 - http://code.google.com/p/kryo/) From f71634d73470189cfe45a89d2a69ea9c5ffa9e29 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Thu, 2 Jul 2015 22:14:33 -0700 Subject: [PATCH 8/8] Roll back ScalaCheck usage --- LICENSE | 2 +- pom.xml | 2 +- .../spark/sql/RandomDataGenerator.scala | 134 +++++++++++++----- .../spark/sql/RandomDataGeneratorSuite.scala | 15 +- 4 files changed, 108 insertions(+), 45 deletions(-) diff --git a/LICENSE b/LICENSE index 2a41ea294e46..f9e412cade34 100644 --- a/LICENSE +++ b/LICENSE @@ -922,7 +922,7 @@ The following 
components are provided under a BSD-style license. See project lin (BSD-like) Scala Compiler (org.scala-lang:scala-reflect:2.10.4 - http://www.scala-lang.org/) (BSD-like) Scala Library (org.scala-lang:scala-library:2.10.4 - http://www.scala-lang.org/) (BSD-like) Scalap (org.scala-lang:scalap:2.10.4 - http://www.scala-lang.org/) - (BSD-style) scalacheck (org.scalacheck:scalacheck_2.10:1.12.4 - http://www.scalacheck.org) + (BSD-style) scalacheck (org.scalacheck:scalacheck_2.10:1.10.0 - http://www.scalacheck.org) (BSD-style) spire (org.spire-math:spire_2.10:0.7.1 - http://spire-math.org) (BSD-style) spire-macros (org.spire-math:spire-macros_2.10:0.7.1 - http://spire-math.org) (New BSD License) Kryo (com.esotericsoftware.kryo:kryo:2.21 - http://code.google.com/p/kryo/) diff --git a/pom.xml b/pom.xml index 6a41aa4f6510..211da9ee74a3 100644 --- a/pom.xml +++ b/pom.xml @@ -689,7 +689,7 @@ org.scalacheck scalacheck_${scala.binary.version} - 1.12.4 + 1.11.3 test diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala index ccf171543f8e..13aad467fa57 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala @@ -17,17 +17,54 @@ package org.apache.spark.sql -import java.sql.Timestamp +import java.lang.Double.longBitsToDouble +import java.lang.Float.intBitsToFloat +import java.math.MathContext -import org.scalacheck.{Arbitrary, Gen} +import scala.util.Random import org.apache.spark.sql.types._ /** - * ScalaCheck random data generators for Spark SQL DataTypes. + * Random data generators for Spark SQL DataTypes. These generators do not generate uniformly random + * values; instead, they're biased to return "interesting" values (such as maximum / minimum values) + * with higher probability. 
*/ object RandomDataGenerator { + /** + * The conditional probability of a non-null value being drawn from a set of "interesting" values + * instead of being chosen uniformly at random. + */ + private val PROBABILITY_OF_INTERESTING_VALUE: Float = 0.5f + + /** + * The probability of the generated value being null + */ + private val PROBABILITY_OF_NULL: Float = 0.1f + + private val MAX_STR_LEN: Int = 1024 + private val MAX_ARR_SIZE: Int = 128 + private val MAX_MAP_SIZE: Int = 128 + + /** + * Helper function for constructing a biased random number generator which returns "interesting" + * values with a higher probability. + */ + private def randomNumeric[T]( + rand: Random, + uniformRand: Random => T, + interestingValues: Seq[T]): Some[() => T] = { + val f = () => { + if (rand.nextFloat() <= PROBABILITY_OF_INTERESTING_VALUE) { + interestingValues(rand.nextInt(interestingValues.length)) + } else { + uniformRand(rand) + } + } + Some(f) + } + /** * Returns a function which generates random values for the given [[DataType]], or `None` if no * random data generator is defined for that data type. The generated values will use an external @@ -37,58 +74,85 @@ object RandomDataGenerator { * * @param dataType the type to generate values for * @param nullable whether null values should be generated - * @return a ScalaCheck [[Gen]] which can be used to produce random values. + * @param seed an optional seed for the random number generator + * @return a function which can be called to generate random values. 
*/ def forType( dataType: DataType, - nullable: Boolean = true): Option[Gen[Any]] = { - val valueGenerator: Option[Gen[Any]] = dataType match { - case StringType => Some(Arbitrary.arbitrary[String]) - case BinaryType => Some(Gen.listOf(Arbitrary.arbitrary[Byte]).map(_.toArray)) - case BooleanType => Some(Arbitrary.arbitrary[Boolean]) - case DateType => Some(Arbitrary.arbitrary[Int].suchThat(_ >= 0).map(new java.sql.Date(_))) - case DoubleType => Some(Arbitrary.arbitrary[Double]) - case FloatType => Some(Arbitrary.arbitrary[Float]) - case ByteType => Some(Arbitrary.arbitrary[Byte]) - case IntegerType => Some(Arbitrary.arbitrary[Int]) - case LongType => Some(Arbitrary.arbitrary[Long]) - case ShortType => Some(Arbitrary.arbitrary[Short]) - case NullType => Some(Gen.const[Any](null)) - case TimestampType => Some(Arbitrary.arbitrary[Long].suchThat(_ >= 0).map(new Timestamp(_))) - case DecimalType.Unlimited => Some(Arbitrary.arbitrary[BigDecimal]) + nullable: Boolean = true, + seed: Option[Long] = None): Option[() => Any] = { + val rand = new Random() + seed.foreach(rand.setSeed) + + val valueGenerator: Option[() => Any] = dataType match { + case StringType => Some(() => rand.nextString(rand.nextInt(MAX_STR_LEN))) + case BinaryType => Some(() => { + val arr = new Array[Byte](rand.nextInt(MAX_STR_LEN)) + rand.nextBytes(arr) + arr + }) + case BooleanType => Some(() => rand.nextBoolean()) + case DateType => Some(() => new java.sql.Date(rand.nextInt())) + case TimestampType => Some(() => new java.sql.Timestamp(rand.nextLong())) + case DecimalType.Unlimited => Some( + () => BigDecimal.apply(rand.nextLong, rand.nextInt, MathContext.UNLIMITED)) + case DoubleType => randomNumeric[Double]( + rand, r => longBitsToDouble(r.nextLong()), Seq(Double.MinValue, Double.MinPositiveValue, + Double.MaxValue, Double.PositiveInfinity, Double.NegativeInfinity, Double.NaN, 0.0)) + case FloatType => randomNumeric[Float]( + rand, r => intBitsToFloat(r.nextInt()), Seq(Float.MinValue, 
Float.MinPositiveValue, + Float.MaxValue, Float.PositiveInfinity, Float.NegativeInfinity, Float.NaN, 0.0f)) + case ByteType => randomNumeric[Byte]( + rand, _.nextInt().toByte, Seq(Byte.MinValue, Byte.MaxValue, 0.toByte)) + case IntegerType => randomNumeric[Int]( + rand, _.nextInt(), Seq(Int.MinValue, Int.MaxValue, 0)) + case LongType => randomNumeric[Long]( + rand, _.nextLong(), Seq(Long.MinValue, Long.MaxValue, 0L)) + case ShortType => randomNumeric[Short]( + rand, _.nextInt().toShort, Seq(Short.MinValue, Short.MaxValue, 0.toShort)) + case NullType => Some(() => null) case ArrayType(elementType, containsNull) => { - forType(elementType, nullable = containsNull).map { elementGen => - Gen.listOf(elementGen).map(_.toArray) + forType(elementType, nullable = containsNull, seed = Some(rand.nextLong())).map { + elementGenerator => () => Array.fill(rand.nextInt(MAX_ARR_SIZE))(elementGenerator()) } } case MapType(keyType, valueType, valueContainsNull) => { for ( - keyGenerator <- forType(keyType, nullable = false); - valueGenerator <- forType(valueType, nullable = valueContainsNull) - // Scala's BigDecimal.hashCode can lead to OutOfMemoryError on Scala 2.10 (see SI-6173) - // and Spark can hit NumberFormatException errors converting certain BigDecimals - // (SPARK-8802). For these reasons, we don't support generation of maps with decimal keys. 
- if !keyType.isInstanceOf[DecimalType] + keyGenerator <- forType(keyType, nullable = false, seed = Some(rand.nextLong())); + valueGenerator <- + forType(valueType, nullable = valueContainsNull, seed = Some(rand.nextLong())) ) yield { - Gen.listOf(Gen.zip(keyGenerator, valueGenerator)).map(_.toMap) + () => { + Seq.fill(rand.nextInt(MAX_MAP_SIZE))((keyGenerator(), valueGenerator())).toMap + } } } case StructType(fields) => { - val maybeFieldGenerators: Seq[Option[Gen[Any]]] = fields.map { field => - forType(field.dataType, nullable = field.nullable) + val maybeFieldGenerators: Seq[Option[() => Any]] = fields.map { field => + forType(field.dataType, nullable = field.nullable, seed = Some(rand.nextLong())) } if (maybeFieldGenerators.forall(_.isDefined)) { - Some(Gen.sequence[Seq[Any], Any](maybeFieldGenerators.flatten).map(vs => Row.fromSeq(vs))) + val fieldGenerators: Seq[() => Any] = maybeFieldGenerators.map(_.get) + Some(() => Row.fromSeq(fieldGenerators.map(_.apply()))) } else { None } } case unsupportedType => None } - if (nullable) { - valueGenerator.map(Gen.oneOf(_, Gen.const[Any](null))) - } else { - valueGenerator + // Handle nullability by wrapping the non-null value generator: + valueGenerator.map { valueGenerator => + if (nullable) { + () => { + if (rand.nextFloat() <= PROBABILITY_OF_NULL) { + null + } else { + valueGenerator() + } + } + } else { + valueGenerator + } } } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGeneratorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGeneratorSuite.scala index dc07a732cdb1..dbba93dba668 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGeneratorSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGeneratorSuite.scala @@ -17,9 +17,6 @@ package org.apache.spark.sql -import org.scalacheck.Prop.{exists, forAll, secure} -import org.scalatest.prop.Checkers - import org.apache.spark.SparkFunSuite import 
org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.types._ @@ -27,7 +24,7 @@ import org.apache.spark.sql.types._ /** * Tests of [[RandomDataGenerator]]. */ -class RandomDataGeneratorSuite extends SparkFunSuite with Checkers { +class RandomDataGeneratorSuite extends SparkFunSuite { /** * Tests random data generation for the given type by using it to generate random values then @@ -39,12 +36,14 @@ class RandomDataGeneratorSuite extends SparkFunSuite with Checkers { fail(s"Random data generator was not defined for $dataType") } if (nullable) { - check(exists(generator) { _ == null }) + assert(Iterator.fill(100)(generator()).contains(null)) + } else { + assert(Iterator.fill(100)(generator()).forall(_ != null)) } - if (!nullable) { - check(forAll(generator) { _ != null }) + for (_ <- 1 to 10) { + val generatedValue = generator() + toCatalyst(generatedValue) } - check(secure(forAll(generator) { v => { toCatalyst(v); true } })) } // Basic types: