From 606b959486d5ef7c955ca8c381656b73ef9d6437 Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Mon, 1 Feb 2021 09:22:47 +0000 Subject: [PATCH 01/47] Random integers within a range. --- .../spark/ml/tuning/ParamGridBuilder.scala | 2 +- .../spark/ml/tuning/ParamRandomBuilder.scala | 68 +++++++++++++++++++ .../ml/tuning/ParamRandomBuilderSuite.scala | 24 +++++++ 3 files changed, 93 insertions(+), 1 deletion(-) create mode 100644 mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala create mode 100644 mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamGridBuilder.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamGridBuilder.scala index d369e7a61cdc..231f1b981a63 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamGridBuilder.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamGridBuilder.scala @@ -27,7 +27,7 @@ import org.apache.spark.ml.param._ * Builder for a param grid used in grid search-based model selection. */ @Since("1.2.0") -class ParamGridBuilder @Since("1.2.0") { +class ParamGridBuilder { private val paramGrid = mutable.Map.empty[Param[_], Iterable[_]] diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala new file mode 100644 index 000000000000..ca837f945278 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.tuning + +import org.apache.spark.ml.param._ + +trait RandomT[T] { + def randomT(): T +} + +object RandomRanges { + + val rnd = new scala.util.Random + + def randomBigInt0To(diff: BigInt): BigInt = { + var randVal = BigInt(diff.bitLength, rnd) + while (randVal > diff) { + randVal = BigInt(diff.bitLength, rnd) + } + randVal + } + + implicit class RandomInt(limits: Limits[Int]) extends RandomT[Int] { + def randomT(): Int = { + import limits._ + val lower = BigInt(math.min(x, y)) + val upper = BigInt(math.max(x, y)) + val diff: BigInt = upper - lower + val randVal: BigInt = randomBigInt0To(diff.bitLength) + lower + randVal.intValue() + } + } +} + + +case class Limits[T: Numeric](x: T, y: T) + +class ParamRandomBuilder { + + def addGrid(param: DoubleParam, values: Array[Double]): this.type = ??? + + def addGrid(param: IntParam, values: Array[Int]): this.type = ??? + + def addGrid(param: FloatParam, values: Array[Float]): this.type = ??? + + def addGrid(param: LongParam, values: Array[Long]): this.type = ??? + + def addGrid[T](param: Param[T], values: Iterable[T]): this.type = ??? + + def build(): Array[ParamMap] = { + ??? 
+ } +} diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala new file mode 100644 index 000000000000..17ac3fb00fab --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala @@ -0,0 +1,24 @@ +package org.apache.spark.ml.tuning + +import org.apache.spark.SparkFunSuite +import org.scalacheck.Arbitrary._ +import org.scalatest.matchers.must.Matchers +import org.scalatestplus.scalacheck.ScalaCheckDrivenPropertyChecks + +class ParamRandomBuilderSuite extends SparkFunSuite with ScalaCheckDrivenPropertyChecks with Matchers { + + import RandomRanges._ + + test("random BigInt generation does not go into infinite loop") { + assert(randomBigInt0To(0) == BigInt(0)) + } + + test("random ints") { + forAll { (x: Int, y: Int) => + val intLimit: Limits[Int] = Limits(x, y) + val result: Int = intLimit.randomT() + assert(result >= math.min(x, y) && result <= math.max(x, y)) + } + } + +} From c518e5f129b36eb3a1607734af1fce641c7d5dcb Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Mon, 1 Feb 2021 10:13:22 +0000 Subject: [PATCH 02/47] Refactored. --- .../spark/ml/tuning/ParamRandomBuilder.scala | 33 ++++++++++++++----- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala index ca837f945278..ba758347fc90 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala @@ -27,22 +27,23 @@ object RandomRanges { val rnd = new scala.util.Random - def randomBigInt0To(diff: BigInt): BigInt = { - var randVal = BigInt(diff.bitLength, rnd) - while (randVal > diff) { - randVal = BigInt(diff.bitLength, rnd) + private[tuning] def randomBigInt0To(x: BigInt): BigInt = { + var randVal = BigInt(x.bitLength, rnd) + while (randVal > x) { + randVal = BigInt(x.bitLength, rnd) } randVal } + def bigIntBetween(lower: BigInt, upper: BigInt): BigInt = { + val diff: BigInt = upper - lower + randomBigInt0To(diff.bitLength) + lower + } + implicit class RandomInt(limits: Limits[Int]) extends RandomT[Int] { def randomT(): Int = { import limits._ - val lower = BigInt(math.min(x, y)) - val upper = BigInt(math.max(x, y)) - val diff: BigInt = upper - lower - val randVal: BigInt = randomBigInt0To(diff.bitLength) + lower - randVal.intValue() + bigIntBetween(BigInt(math.min(x, y)), BigInt(math.max(x, y))).intValue() } } } @@ -50,8 +51,16 @@ object RandomRanges { case class Limits[T: Numeric](x: T, y: T) +/** + * "For any distribution over a sample space with a finite maximum, the maximum of 60 random + * observations lies within the top 5% of the true maximum, with 95% probability" + * - Evaluating Machine Learning Models by Alice Zheng + * https://www.oreilly.com/library/view/evaluating-machine-learning/9781492048756/ch04.html + */ class ParamRandomBuilder { + // Java interface + def addGrid(param: DoubleParam, values: Array[Double]): this.type = ??? def addGrid(param: IntParam, values: Array[Int]): this.type = ??? @@ -60,6 +69,12 @@ class ParamRandomBuilder { def addGrid(param: LongParam, values: Array[Long]): this.type = ??? + // Scala interface + + def addGrid[T: RandomT](param: Param[T], values: Iterable[T]): this.type = { + ??? + } + def addGrid[T](param: Param[T], values: Iterable[T]): this.type = ??? 
def build(): Array[ParamMap] = { From 37f32c2912702e2b6a6b5ec5292358ffa052852f Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Mon, 1 Feb 2021 11:09:06 +0000 Subject: [PATCH 03/47] Random longs. --- .../apache/spark/ml/tuning/ParamRandomBuilder.scala | 10 +++++++++- .../spark/ml/tuning/ParamRandomBuilderSuite.scala | 8 ++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala index ba758347fc90..f1aa5f275952 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala @@ -19,7 +19,7 @@ package org.apache.spark.ml.tuning import org.apache.spark.ml.param._ -trait RandomT[T] { +abstract class RandomT[T: Numeric] { def randomT(): T } @@ -46,6 +46,14 @@ object RandomRanges { bigIntBetween(BigInt(math.min(x, y)), BigInt(math.max(x, y))).intValue() } } + + implicit class RandomLong(limits: Limits[Long]) extends RandomT[Long] { + def randomT(): Long = { + import limits._ + bigIntBetween(BigInt(math.min(x, y)), BigInt(math.max(x, y))).longValue() + } + } + } diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala index 17ac3fb00fab..fb5f6809b80a 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala @@ -21,4 +21,12 @@ class ParamRandomBuilderSuite extends SparkFunSuite with ScalaCheckDrivenPropert } } + test("random longs") { + forAll { (x: Long, y: Long) => + val longLimit: Limits[Long] = Limits(x, y) + val result: Long = longLimit.randomT() + assert(result >= math.min(x, y) && result <= math.max(x, y)) + } + } + } From 77cf67816a4d8061d3c670d54ec1c7bf4b724246 Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Mon, 1 Feb 2021 14:51:29 +0000 Subject: [PATCH 04/47] Better use of type classes. 
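A quick check of the "60 random observations" figure quoted in the comment above: a single uniform draw misses the top 5% of the search space with probability 0.95, so n independent draws all miss it with probability 0.95^n. Requiring

    1 - 0.95^n >= 0.95   =>   n >= ln(0.05) / ln(0.95) ≈ 58.4

shows that 59 draws already suffice; 60 is the round figure usually quoted, giving 1 - 0.95^60 ≈ 0.954.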
--- .../spark/ml/tuning/ParamRandomBuilder.scala | 31 ++++++++++++------- .../ml/tuning/ParamRandomBuilderSuite.scala | 15 +++++++-- 2 files changed, 33 insertions(+), 13 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala index f1aa5f275952..2a5acbfcc32d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala @@ -19,10 +19,16 @@ package org.apache.spark.ml.tuning import org.apache.spark.ml.param._ +case class Limits[T: Numeric](x: T, y: T) + abstract class RandomT[T: Numeric] { def randomT(): T } +abstract class Generator[T: Numeric] { + def apply(lim: Limits[T]): RandomT[T] +} + object RandomRanges { val rnd = new scala.util.Random @@ -40,24 +46,27 @@ object RandomRanges { randomBigInt0To(diff.bitLength) + lower } - implicit class RandomInt(limits: Limits[Int]) extends RandomT[Int] { - def randomT(): Int = { - import limits._ - bigIntBetween(BigInt(math.min(x, y)), BigInt(math.max(x, y))).intValue() + implicit object IntGenerator extends Generator[Int] { + def apply(limits: Limits[Int]): RandomT[Int] = new RandomT[Int] { + override def randomT(): Int = { + import limits._ + bigIntBetween(BigInt(math.min(x, y)), BigInt(math.max(x, y))).intValue() + } } } - implicit class RandomLong(limits: Limits[Long]) extends RandomT[Long] { - def randomT(): Long = { - import limits._ - bigIntBetween(BigInt(math.min(x, y)), BigInt(math.max(x, y))).longValue() + implicit object LongGenerator extends Generator[Long] { + def apply(limits: Limits[Long]): RandomT[Long] = new RandomT[Long] { + override def randomT(): Long = { + import limits._ + bigIntBetween(BigInt(math.min(x, y)), BigInt(math.max(x, y))).longValue() + } } } -} - + def apply[T: Generator](lim: Limits[T])(implicit t: Generator[T]): RandomT[T] = t(lim) -case class Limits[T: Numeric](x: T, y: T) +} /** * "For any distribution over a sample space with a finite maximum, the maximum of 60 random diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala index fb5f6809b80a..b536c2d3944f 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala @@ -16,15 +16,26 @@ class ParamRandomBuilderSuite extends SparkFunSuite with ScalaCheckDrivenPropert test("random ints") { forAll { (x: Int, y: Int) => val intLimit: Limits[Int] = Limits(x, y) - val result: Int = intLimit.randomT() + val gen: RandomT[Int] = RandomRanges(intLimit) + val result: Int = gen.randomT() assert(result >= math.min(x, y) && result <= math.max(x, y)) } } + def assertEvenDistribution[T: Numeric: RandomT](n: Int) = { + val gen = implicitly[RandomT[T]] + val ops = implicitly[Numeric[T]] + val xs = (0 to n).map(_ => gen.randomT()) + val mean = ops.toDouble(xs.sum) / xs.length + val squaredDiff = xs.map(x => math.pow(ops.toDouble(x) - mean, 2)) + val stdDev = math.pow(squaredDiff.sum / n - 1, 0.5) + } + test("random longs") { forAll { (x: Long, y: Long) => val longLimit: Limits[Long] = Limits(x, y) - val result: Long = longLimit.randomT() + val gen: RandomT[Long] = RandomRanges(longLimit) + val result: Long = gen.randomT() assert(result >= math.min(x, y) && result <= math.max(x, y)) } } From 8bd0dd76dbeb5151b4b9bcd17d0933656bf0a9c3 Mon Sep 
17 00:00:00 2001 From: Phillip Henry Date: Mon, 1 Feb 2021 16:21:50 +0000 Subject: [PATCH 05/47] Checks distribution. --- .../spark/ml/tuning/ParamRandomBuilder.scala | 2 +- .../ml/tuning/ParamRandomBuilderSuite.scala | 30 +++++++++++++++---- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala index 2a5acbfcc32d..42109ec7ce68 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala @@ -43,7 +43,7 @@ object RandomRanges { def bigIntBetween(lower: BigInt, upper: BigInt): BigInt = { val diff: BigInt = upper - lower - randomBigInt0To(diff.bitLength) + lower + randomBigInt0To(diff) + lower } implicit object IntGenerator extends Generator[Int] { diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala index b536c2d3944f..b0ff2ac026dc 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala @@ -2,6 +2,9 @@ package org.apache.spark.ml.tuning import org.apache.spark.SparkFunSuite import org.scalacheck.Arbitrary._ +import org.scalacheck.Gen +import org.scalacheck.Gen.Choose +import org.scalatest.Assertion import org.scalatest.matchers.must.Matchers import org.scalatestplus.scalacheck.ScalaCheckDrivenPropertyChecks @@ -22,13 +25,28 @@ class ParamRandomBuilderSuite extends SparkFunSuite with ScalaCheckDrivenPropert } } - def assertEvenDistribution[T: Numeric: RandomT](n: Int) = { - val gen = implicitly[RandomT[T]] - val ops = implicitly[Numeric[T]] - val xs = (0 to n).map(_ => gen.randomT()) - val mean = ops.toDouble(xs.sum) / xs.length + test("random int distribution") { + val range = 1000 + val gen = for { + x <- Gen.choose(-range, range) + y <- Gen.choose(-range, range) + } yield (x, y) + forAll(gen) { case (x, y) => + assertEvenDistribution(10000, Limits(x, y + 2 * range)) + } + } + + def assertEvenDistribution[T: Numeric: Generator](n: Int, lim: Limits[T]): Assertion = { + val gen = RandomRanges(lim) + val ops = implicitly[Numeric[T]] + val xs = (0 to n).map(_ => gen.randomT()) + val mean = ops.toDouble(xs.sum) / xs.length val squaredDiff = xs.map(x => math.pow(ops.toDouble(x) - mean, 2)) - val stdDev = math.pow(squaredDiff.sum / n - 1, 0.5) + val stdDev = math.pow(squaredDiff.sum / n - 1, 0.5) + val halfWay = ops.toDouble(lim.x) + ops.toDouble(lim.y) / 2 + println(s"halfWay = $halfWay, stdDev = $stdDev, squaredDiff = ${squaredDiff.sum}, lim = $lim, mean = $mean, xs = ${xs.take(10).mkString(", ")}") + val tolerance = 5 * stdDev + assert(mean > halfWay - tolerance && mean < halfWay + tolerance) } test("random longs") { From 6c9a918d329ae27535e7b1ee6304d91000efe406 Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Tue, 2 Feb 2021 12:10:39 +0000 Subject: [PATCH 06/47] Refactored. 
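A note on randomBigInt0To above: BigInt(x.bitLength, rnd) draws uniformly from [0, 2^bitLength - 1] and anything above x is rejected. Because 2^(bitLength - 1) <= x for every positive x, each draw is accepted with probability greater than 1/2, so the loop finishes in fewer than two iterations on average; for x = 0 the bit length is zero and BigInt(0, rnd) is always zero, which is exactly the case the "does not go into infinite loop" test pins down. The one-argument change in the previous patch also fixes a real bug: randomBigInt0To(diff.bitLength) sampled from the tiny range [0, bitLength(diff)] instead of from [0, diff].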
--- .../ml/tuning/ParamRandomBuilderSuite.scala | 53 ++++++++++++------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala index b0ff2ac026dc..585a2105ef67 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala @@ -27,26 +27,8 @@ class ParamRandomBuilderSuite extends SparkFunSuite with ScalaCheckDrivenPropert test("random int distribution") { val range = 1000 - val gen = for { - x <- Gen.choose(-range, range) - y <- Gen.choose(-range, range) - } yield (x, y) - forAll(gen) { case (x, y) => - assertEvenDistribution(10000, Limits(x, y + 2 * range)) - } - } - - def assertEvenDistribution[T: Numeric: Generator](n: Int, lim: Limits[T]): Assertion = { - val gen = RandomRanges(lim) - val ops = implicitly[Numeric[T]] - val xs = (0 to n).map(_ => gen.randomT()) - val mean = ops.toDouble(xs.sum) / xs.length - val squaredDiff = xs.map(x => math.pow(ops.toDouble(x) - mean, 2)) - val stdDev = math.pow(squaredDiff.sum / n - 1, 0.5) - val halfWay = ops.toDouble(lim.x) + ops.toDouble(lim.y) / 2 - println(s"halfWay = $halfWay, stdDev = $stdDev, squaredDiff = ${squaredDiff.sum}, lim = $lim, mean = $mean, xs = ${xs.take(10).mkString(", ")}") - val tolerance = 5 * stdDev - assert(mean > halfWay - tolerance && mean < halfWay + tolerance) + val fn: RangeToLimitsFn[Int] = { case (x, y) => Limits(x, y + 2 * range) } + checkDistributionOf(range, fn) } test("random longs") { @@ -58,4 +40,35 @@ class ParamRandomBuilderSuite extends SparkFunSuite with ScalaCheckDrivenPropert } } + test("random long distribution") { + val range = 1000L + val fn: RangeToLimitsFn[Long] = { case (x, y) => Limits(x, y + 2 * range) } + checkDistributionOf(range, fn) + } + + type RangeToLimitsFn[T] = (T, T) => Limits[T] + + def checkDistributionOf[T: Numeric: Generator: Choose](range: T, limFn: RangeToLimitsFn[T]): Unit = { + val ops: Numeric[T] = implicitly[Numeric[T]] + val gen: Gen[(T, T)] = for { + x <- Gen.choose(ops.negate(range), range) + y <- Gen.choose(ops.negate(range), range) + } yield (x, y) + forAll(gen) { case (x, y) => + assertEvenDistribution(10000, limFn(x, y)) + } + } + + def assertEvenDistribution[T: Numeric: Generator](n: Int, lim: Limits[T]): Assertion = { + val gen: RandomT[T] = RandomRanges(lim) + val ops: Numeric[T] = implicitly[Numeric[T]] + val xs: Seq[T] = (0 to n).map(_ => gen.randomT()) + val mean: Double = ops.toDouble(xs.sum) / xs.length + val squaredDiff: Seq[Double] = xs.map(x => math.pow(ops.toDouble(x) - mean, 2)) + val stdDev: Double = math.pow(squaredDiff.sum / n - 1, 0.5) + val halfWay: Double = ops.toDouble(lim.x) + ops.toDouble(lim.y) / 2 + val tolerance: Double = 5 * stdDev + assert(mean > halfWay - tolerance && mean < halfWay + tolerance) + } + } From 5489de6bdfb580944f5b341582ede263ece49d3c Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Thu, 4 Feb 2021 10:00:53 +0000 Subject: [PATCH 07/47] Formatting. 
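One thing this formatting pass leaves untouched: in assertEvenDistribution, math.pow(squaredDiff.sum / n - 1, 0.5) parses as (squaredDiff.sum / n) - 1 under the square root, not as the Bessel-corrected sample variance squaredDiff.sum / (n - 1), and math.pow returns NaN whenever that subtraction goes negative. A minimal sketch of what was presumably intended, assuming n and squaredDiff as defined in the test above:

    val variance: Double = squaredDiff.sum / (n - 1)  // sample variance with Bessel's correction
    val stdDev: Double = math.sqrt(variance)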
--- .../apache/spark/ml/tuning/ParamRandomBuilderSuite.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala index 585a2105ef67..9e430a8eff34 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala @@ -33,9 +33,9 @@ class ParamRandomBuilderSuite extends SparkFunSuite with ScalaCheckDrivenPropert test("random longs") { forAll { (x: Long, y: Long) => - val longLimit: Limits[Long] = Limits(x, y) - val gen: RandomT[Long] = RandomRanges(longLimit) - val result: Long = gen.randomT() + val longLimit: Limits[Long] = Limits(x, y) + val gen: RandomT[Long] = RandomRanges(longLimit) + val result: Long = gen.randomT() assert(result >= math.min(x, y) && result <= math.max(x, y)) } } From 3d1f46c91c2ace71a462448a91125a043c66d860 Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Thu, 4 Feb 2021 11:04:27 +0000 Subject: [PATCH 08/47] Linear random doubles added. --- .../spark/ml/tuning/ParamRandomBuilder.scala | 19 +++++++++++++++ .../ml/tuning/ParamRandomBuilderSuite.scala | 23 ++++++++++++++++--- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala index 42109ec7ce68..23a491535d1f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala @@ -46,6 +46,25 @@ object RandomRanges { randomBigInt0To(diff) + lower } + def lowerUpper[T: Numeric](x: T, y: T): (T, T) = { + val ops: Numeric[T] = implicitly[Numeric[T]] + (ops.min(x, y), ops.max(x, y)) + } + + implicit object DoubleGenerator extends Generator[Double] { + def apply(limits: Limits[Double]): RandomT[Double] = new RandomT[Double] { + override def randomT(): Double = { + import limits._ + val upper: BigDecimal = BigDecimal(math.max(x, y)) + val lower: BigDecimal = BigDecimal(math.min(x, y)) + val zeroCenteredRnd: BigDecimal = BigDecimal(rnd.nextDouble() - 0.5) + val range: BigDecimal = upper - lower + val halfWay: BigDecimal = lower + range / 2 + ((zeroCenteredRnd * range) + halfWay).doubleValue() + } + } + } + implicit object IntGenerator extends Generator[Int] { def apply(limits: Limits[Int]): RandomT[Int] = new RandomT[Int] { override def randomT(): Int = { diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala index 9e430a8eff34..e0133dc2bd12 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala @@ -46,6 +46,21 @@ class ParamRandomBuilderSuite extends SparkFunSuite with ScalaCheckDrivenPropert checkDistributionOf(range, fn) } + test("random doubles") { + forAll { (x: Double, y: Double) => + val limit: Limits[Double] = Limits(x, y) + val gen: RandomT[Double] = RandomRanges(limit) + val result: Double = gen.randomT() + assert(result >= math.min(x, y) && result <= math.max(x, y)) + } + } + + test("random double distribution") { + val range = 1000d + val fn: RangeToLimitsFn[Double] = { case (x, y) => Limits(x, y + 2 * range) } + checkDistributionOf(range, fn) + } + 
type RangeToLimitsFn[T] = (T, T) => Limits[T] def checkDistributionOf[T: Numeric: Generator: Choose](range: T, limFn: RangeToLimitsFn[T]): Unit = { @@ -62,11 +77,13 @@ class ParamRandomBuilderSuite extends SparkFunSuite with ScalaCheckDrivenPropert def assertEvenDistribution[T: Numeric: Generator](n: Int, lim: Limits[T]): Assertion = { val gen: RandomT[T] = RandomRanges(lim) val ops: Numeric[T] = implicitly[Numeric[T]] - val xs: Seq[T] = (0 to n).map(_ => gen.randomT()) + val xs: Seq[T] = (0 to n).map { _: Int => gen.randomT() } val mean: Double = ops.toDouble(xs.sum) / xs.length - val squaredDiff: Seq[Double] = xs.map(x => math.pow(ops.toDouble(x) - mean, 2)) + val squaredDiff: Seq[Double] = xs.map { x: T => math.pow(ops.toDouble(x) - mean, 2) } val stdDev: Double = math.pow(squaredDiff.sum / n - 1, 0.5) - val halfWay: Double = ops.toDouble(lim.x) + ops.toDouble(lim.y) / 2 + val ordered: (T, T) = lowerUpper(lim.x, lim.y) + val range: T = ops.minus(ordered._2, ordered._1) + val halfWay: Double = (ops.toDouble(range) / 2) + ops.toDouble(ordered._1) val tolerance: Double = 5 * stdDev assert(mean > halfWay - tolerance && mean < halfWay + tolerance) } From 290c815ecd3d6eb1d6e508551310bfe4445247c7 Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Thu, 4 Feb 2021 13:55:01 +0000 Subject: [PATCH 09/47] Refactored. --- .../ml/tuning/ParamRandomBuilderSuite.scala | 27 +++++++++++++------ 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala index e0133dc2bd12..3db8b420c758 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala @@ -74,17 +74,28 @@ class ParamRandomBuilderSuite extends SparkFunSuite with ScalaCheckDrivenPropert } } - def assertEvenDistribution[T: Numeric: Generator](n: Int, lim: Limits[T]): Assertion = { - val gen: RandomT[T] = RandomRanges(lim) + def meanAndStandardDeviation[T: Numeric](xs: Seq[T]): (Double, Double) = { val ops: Numeric[T] = implicitly[Numeric[T]] - val xs: Seq[T] = (0 to n).map { _: Int => gen.randomT() } - val mean: Double = ops.toDouble(xs.sum) / xs.length + val n: Int = xs.length + val mean: Double = ops.toDouble(xs.sum) / n val squaredDiff: Seq[Double] = xs.map { x: T => math.pow(ops.toDouble(x) - mean, 2) } val stdDev: Double = math.pow(squaredDiff.sum / n - 1, 0.5) - val ordered: (T, T) = lowerUpper(lim.x, lim.y) - val range: T = ops.minus(ordered._2, ordered._1) - val halfWay: Double = (ops.toDouble(range) / 2) + ops.toDouble(ordered._1) - val tolerance: Double = 5 * stdDev + (mean, stdDev) + } + + def midPointOf[T: Numeric : Generator](lim: Limits[T]): Double = { + val ordered: (T, T) = lowerUpper(lim.x, lim.y) + val ops: Numeric[T] = implicitly[Numeric[T]] + val range: T = ops.minus(ordered._2, ordered._1) + (ops.toDouble(range) / 2) + ops.toDouble(ordered._1) + } + + def assertEvenDistribution[T: Numeric: Generator](n: Int, lim: Limits[T]): Assertion = { + val gen: RandomT[T] = RandomRanges(lim) + val xs: Seq[T] = (0 to n).map { _: Int => gen.randomT() } + val (mean, stdDev) = meanAndStandardDeviation(xs) + val tolerance: Double = 4 * stdDev + val halfWay: Double = midPointOf(lim) assert(mean > halfWay - tolerance && mean < halfWay + tolerance) } From dadbd54e58d915716809d23ff61da92c287322e4 Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Thu, 4 Feb 2021 14:02:44 +0000 
Subject: [PATCH 10/47] Added random floats. --- .../spark/ml/tuning/ParamRandomBuilder.scala | 24 ++++++++++++------- .../ml/tuning/ParamRandomBuilderSuite.scala | 20 ++++++++++++++++ 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala index 23a491535d1f..166c9c9edbcc 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala @@ -46,21 +46,27 @@ object RandomRanges { randomBigInt0To(diff) + lower } - def lowerUpper[T: Numeric](x: T, y: T): (T, T) = { - val ops: Numeric[T] = implicitly[Numeric[T]] - (ops.min(x, y), ops.max(x, y)) + private def randomBigIntIn(lower: BigDecimal, upper: BigDecimal): BigDecimal = { + val zeroCenteredRnd: BigDecimal = BigDecimal(rnd.nextDouble() - 0.5) + val range: BigDecimal = upper - lower + val halfWay: BigDecimal = lower + range / 2 + (zeroCenteredRnd * range) + halfWay } implicit object DoubleGenerator extends Generator[Double] { def apply(limits: Limits[Double]): RandomT[Double] = new RandomT[Double] { override def randomT(): Double = { import limits._ - val upper: BigDecimal = BigDecimal(math.max(x, y)) - val lower: BigDecimal = BigDecimal(math.min(x, y)) - val zeroCenteredRnd: BigDecimal = BigDecimal(rnd.nextDouble() - 0.5) - val range: BigDecimal = upper - lower - val halfWay: BigDecimal = lower + range / 2 - ((zeroCenteredRnd * range) + halfWay).doubleValue() + randomBigIntIn(BigDecimal(math.min(x, y)), BigDecimal(math.max(x, y))).doubleValue() + } + } + } + + implicit object FloatGenerator extends Generator[Float] { + def apply(limits: Limits[Float]): RandomT[Float] = new RandomT[Float] { + override def randomT(): Float = { + import limits._ + randomBigIntIn(BigDecimal(math.min(x, y)), BigDecimal(math.max(x, y))).floatValue() } } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala index 3db8b420c758..b8660f1c9e8d 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala @@ -61,6 +61,21 @@ class ParamRandomBuilderSuite extends SparkFunSuite with ScalaCheckDrivenPropert checkDistributionOf(range, fn) } + test("random floats") { + forAll { (x: Float, y: Float) => + val limit: Limits[Float] = Limits(x, y) + val gen: RandomT[Float] = RandomRanges(limit) + val result: Float = gen.randomT() + assert(result >= math.min(x, y) && result <= math.max(x, y)) + } + } + + test("random float distribution") { + val range = 1000f + val fn: RangeToLimitsFn[Float] = { case (x, y) => Limits(x, y + 2 * range) } + checkDistributionOf(range, fn) + } + type RangeToLimitsFn[T] = (T, T) => Limits[T] def checkDistributionOf[T: Numeric: Generator: Choose](range: T, limFn: RangeToLimitsFn[T]): Unit = { @@ -83,6 +98,11 @@ class ParamRandomBuilderSuite extends SparkFunSuite with ScalaCheckDrivenPropert (mean, stdDev) } + def lowerUpper[T: Numeric](x: T, y: T): (T, T) = { + val ops: Numeric[T] = implicitly[Numeric[T]] + (ops.min(x, y), ops.max(x, y)) + } + def midPointOf[T: Numeric : Generator](lim: Limits[T]): Double = { val ordered: (T, T) = lowerUpper(lim.x, lim.y) val ops: Numeric[T] = implicitly[Numeric[T]] From f8339d71e0e76e135f04a5928a30d9b4befc4e76 Mon Sep 17 00:00:00 2001 From: 
Phillip Henry Date: Thu, 4 Feb 2021 14:09:27 +0000 Subject: [PATCH 11/47] Refactored. --- .../ml/tuning/ParamRandomBuilderSuite.scala | 27 +++++++------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala index b8660f1c9e8d..f65c778f3011 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala @@ -26,9 +26,7 @@ class ParamRandomBuilderSuite extends SparkFunSuite with ScalaCheckDrivenPropert } test("random int distribution") { - val range = 1000 - val fn: RangeToLimitsFn[Int] = { case (x, y) => Limits(x, y + 2 * range) } - checkDistributionOf(range, fn) + checkDistributionOf(1000) } test("random longs") { @@ -41,9 +39,7 @@ class ParamRandomBuilderSuite extends SparkFunSuite with ScalaCheckDrivenPropert } test("random long distribution") { - val range = 1000L - val fn: RangeToLimitsFn[Long] = { case (x, y) => Limits(x, y + 2 * range) } - checkDistributionOf(range, fn) + checkDistributionOf(1000L) } test("random doubles") { @@ -56,9 +52,7 @@ class ParamRandomBuilderSuite extends SparkFunSuite with ScalaCheckDrivenPropert } test("random double distribution") { - val range = 1000d - val fn: RangeToLimitsFn[Double] = { case (x, y) => Limits(x, y + 2 * range) } - checkDistributionOf(range, fn) + checkDistributionOf(1000d) } test("random floats") { @@ -71,21 +65,18 @@ class ParamRandomBuilderSuite extends SparkFunSuite with ScalaCheckDrivenPropert } test("random float distribution") { - val range = 1000f - val fn: RangeToLimitsFn[Float] = { case (x, y) => Limits(x, y + 2 * range) } - checkDistributionOf(range, fn) + checkDistributionOf(1000f) } - type RangeToLimitsFn[T] = (T, T) => Limits[T] - - def checkDistributionOf[T: Numeric: Generator: Choose](range: T, limFn: RangeToLimitsFn[T]): Unit = { + def checkDistributionOf[T: Numeric: Generator: Choose](range: T): Unit = { val ops: Numeric[T] = implicitly[Numeric[T]] + import ops._ val gen: Gen[(T, T)] = for { - x <- Gen.choose(ops.negate(range), range) - y <- Gen.choose(ops.negate(range), range) + x <- Gen.choose(negate(range), range) + y <- Gen.choose(range, times(range, plus(one, one))) } yield (x, y) forAll(gen) { case (x, y) => - assertEvenDistribution(10000, limFn(x, y)) + assertEvenDistribution(10000, Limits(x, y)) } } From 70cdf24ee43569972ae06c61fa1384015a96c0a5 Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Thu, 4 Feb 2021 14:21:24 +0000 Subject: [PATCH 12/47] Even more refactoring. 
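A note on the BigDecimal arithmetic in DoubleGenerator and FloatGenerator above: rnd.nextDouble() - 0.5 is uniform on [-0.5, 0.5), so zeroCenteredRnd * range is uniform on [-range/2, range/2), and adding halfWay = lower + range / 2 lands uniformly in [lower, upper). Algebraically that is just lower + rnd.nextDouble() * range; going through BigDecimal presumably avoids the overflow that plain Double arithmetic hits for extreme limits, for example

    Double.MaxValue - Double.MinValue  // Double.PositiveInfinity: the range overflows a Double

and ScalaCheck's forAll generates such extremes readily.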
--- .../ml/tuning/ParamRandomBuilderSuite.scala | 42 ++++++++----------- 1 file changed, 17 insertions(+), 25 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala index f65c778f3011..583512e04d8e 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala @@ -1,8 +1,9 @@ package org.apache.spark.ml.tuning +import scala.reflect.runtime.universe.TypeTag import org.apache.spark.SparkFunSuite import org.scalacheck.Arbitrary._ -import org.scalacheck.Gen +import org.scalacheck.{Arbitrary, Gen} import org.scalacheck.Gen.Choose import org.scalatest.Assertion import org.scalatest.matchers.must.Matchers @@ -17,12 +18,7 @@ class ParamRandomBuilderSuite extends SparkFunSuite with ScalaCheckDrivenPropert } test("random ints") { - forAll { (x: Int, y: Int) => - val intLimit: Limits[Int] = Limits(x, y) - val gen: RandomT[Int] = RandomRanges(intLimit) - val result: Int = gen.randomT() - assert(result >= math.min(x, y) && result <= math.max(x, y)) - } + checkRange[Int] } test("random int distribution") { @@ -30,12 +26,7 @@ class ParamRandomBuilderSuite extends SparkFunSuite with ScalaCheckDrivenPropert } test("random longs") { - forAll { (x: Long, y: Long) => - val longLimit: Limits[Long] = Limits(x, y) - val gen: RandomT[Long] = RandomRanges(longLimit) - val result: Long = gen.randomT() - assert(result >= math.min(x, y) && result <= math.max(x, y)) - } + checkRange[Long] } test("random long distribution") { @@ -43,12 +34,7 @@ class ParamRandomBuilderSuite extends SparkFunSuite with ScalaCheckDrivenPropert } test("random doubles") { - forAll { (x: Double, y: Double) => - val limit: Limits[Double] = Limits(x, y) - val gen: RandomT[Double] = RandomRanges(limit) - val result: Double = gen.randomT() - assert(result >= math.min(x, y) && result <= math.max(x, y)) - } + checkRange[Double] } test("random double distribution") { @@ -56,18 +42,24 @@ class ParamRandomBuilderSuite extends SparkFunSuite with ScalaCheckDrivenPropert } test("random floats") { - forAll { (x: Float, y: Float) => - val limit: Limits[Float] = Limits(x, y) - val gen: RandomT[Float] = RandomRanges(limit) - val result: Float = gen.randomT() - assert(result >= math.min(x, y) && result <= math.max(x, y)) - } + checkRange[Float] } test("random float distribution") { checkDistributionOf(1000f) } + def checkRange[T: Numeric: Generator: Choose: TypeTag: Arbitrary]: Assertion = { + forAll { (x: T, y: T) => + val ops: Numeric[T] = implicitly[Numeric[T]] + val limit: Limits[T] = Limits(x, y) + val gen: RandomT[T] = RandomRanges(limit) + val result: T = gen.randomT() + val ordered = lowerUpper(x, y) + assert(ops.gteq(result, ordered._1) && ops.lteq(result, ordered._2)) + } + } + def checkDistributionOf[T: Numeric: Generator: Choose](range: T): Unit = { val ops: Numeric[T] = implicitly[Numeric[T]] import ops._ From 8148a699f0c7b1c7cc6b17e9ed90fc2b81208843 Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Sun, 7 Feb 2021 10:54:58 +0000 Subject: [PATCH 13/47] RandomRange tests in a separate class. 
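A note on the context bounds these tests lean on: checkRange[T: Numeric: Generator: Choose: TypeTag: Arbitrary] asks the compiler for one implicit instance per bound, so a bare checkRange[Int] pulls Numeric[Int], Choose[Int] and Arbitrary[Int] from the standard libraries and IntGenerator from the import RandomRanges._ at the top of the file. A minimal sketch of how the Generator bound resolves, assuming that import is in scope:

    import RandomRanges._

    def sample[T: Numeric: Generator](lim: Limits[T]): T =
      implicitly[Generator[T]].apply(lim).randomT()  // the compiler supplies the Generator instance

    sample(Limits(1, 10))  // resolves IntGenerator; a uniform Int in [1, 10]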
--- .../ml/tuning/ParamRandomBuilderSuite.scala | 98 +--------------- .../spark/ml/tuning/RandomRangesSuite.scala | 105 ++++++++++++++++++ 2 files changed, 110 insertions(+), 93 deletions(-) create mode 100644 mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala index 583512e04d8e..5ee7943f7014 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala @@ -1,105 +1,17 @@ package org.apache.spark.ml.tuning -import scala.reflect.runtime.universe.TypeTag import org.apache.spark.SparkFunSuite -import org.scalacheck.Arbitrary._ -import org.scalacheck.{Arbitrary, Gen} -import org.scalacheck.Gen.Choose -import org.scalatest.Assertion +import org.apache.spark.ml.param.TestParams import org.scalatest.matchers.must.Matchers import org.scalatestplus.scalacheck.ScalaCheckDrivenPropertyChecks class ParamRandomBuilderSuite extends SparkFunSuite with ScalaCheckDrivenPropertyChecks with Matchers { - import RandomRanges._ + val solver = new TestParams() +// import solver.{inputCol, maxIter} - test("random BigInt generation does not go into infinite loop") { - assert(randomBigInt0To(0) == BigInt(0)) - } - - test("random ints") { - checkRange[Int] - } - - test("random int distribution") { - checkDistributionOf(1000) - } - - test("random longs") { - checkRange[Long] - } - - test("random long distribution") { - checkDistributionOf(1000L) - } - - test("random doubles") { - checkRange[Double] - } - - test("random double distribution") { - checkDistributionOf(1000d) - } - - test("random floats") { - checkRange[Float] - } - - test("random float distribution") { - checkDistributionOf(1000f) - } - - def checkRange[T: Numeric: Generator: Choose: TypeTag: Arbitrary]: Assertion = { - forAll { (x: T, y: T) => - val ops: Numeric[T] = implicitly[Numeric[T]] - val limit: Limits[T] = Limits(x, y) - val gen: RandomT[T] = RandomRanges(limit) - val result: T = gen.randomT() - val ordered = lowerUpper(x, y) - assert(ops.gteq(result, ordered._1) && ops.lteq(result, ordered._2)) - } - } - - def checkDistributionOf[T: Numeric: Generator: Choose](range: T): Unit = { - val ops: Numeric[T] = implicitly[Numeric[T]] - import ops._ - val gen: Gen[(T, T)] = for { - x <- Gen.choose(negate(range), range) - y <- Gen.choose(range, times(range, plus(one, one))) - } yield (x, y) - forAll(gen) { case (x, y) => - assertEvenDistribution(10000, Limits(x, y)) - } - } - - def meanAndStandardDeviation[T: Numeric](xs: Seq[T]): (Double, Double) = { - val ops: Numeric[T] = implicitly[Numeric[T]] - val n: Int = xs.length - val mean: Double = ops.toDouble(xs.sum) / n - val squaredDiff: Seq[Double] = xs.map { x: T => math.pow(ops.toDouble(x) - mean, 2) } - val stdDev: Double = math.pow(squaredDiff.sum / n - 1, 0.5) - (mean, stdDev) - } - - def lowerUpper[T: Numeric](x: T, y: T): (T, T) = { - val ops: Numeric[T] = implicitly[Numeric[T]] - (ops.min(x, y), ops.max(x, y)) - } - - def midPointOf[T: Numeric : Generator](lim: Limits[T]): Double = { - val ordered: (T, T) = lowerUpper(lim.x, lim.y) - val ops: Numeric[T] = implicitly[Numeric[T]] - val range: T = ops.minus(ordered._2, ordered._1) - (ops.toDouble(range) / 2) + ops.toDouble(ordered._1) - } - - def assertEvenDistribution[T: Numeric: Generator](n: Int, lim: Limits[T]): Assertion = { - val gen: RandomT[T] 
= RandomRanges(lim) - val xs: Seq[T] = (0 to n).map { _: Int => gen.randomT() } - val (mean, stdDev) = meanAndStandardDeviation(xs) - val tolerance: Double = 4 * stdDev - val halfWay: Double = midPointOf(lim) - assert(mean > halfWay - tolerance && mean < halfWay + tolerance) + test("random params") { + // TODO } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala new file mode 100644 index 000000000000..e940e73ac167 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala @@ -0,0 +1,105 @@ +package org.apache.spark.ml.tuning + +import scala.reflect.runtime.universe.TypeTag +import org.apache.spark.SparkFunSuite +import org.scalacheck.Arbitrary._ +import org.scalacheck.{Arbitrary, Gen} +import org.scalacheck.Gen.Choose +import org.scalatest.Assertion +import org.scalatest.matchers.must.Matchers +import org.scalatestplus.scalacheck.ScalaCheckDrivenPropertyChecks + +class RandomRangesSuite extends SparkFunSuite with ScalaCheckDrivenPropertyChecks with Matchers { + + import RandomRanges._ + + test("random BigInt generation does not go into infinite loop") { + assert(randomBigInt0To(0) == BigInt(0)) + } + + test("random ints") { + checkRange[Int] + } + + test("random int distribution") { + checkDistributionOf(1000) + } + + test("random longs") { + checkRange[Long] + } + + test("random long distribution") { + checkDistributionOf(1000L) + } + + test("random doubles") { + checkRange[Double] + } + + test("random double distribution") { + checkDistributionOf(1000d) + } + + test("random floats") { + checkRange[Float] + } + + test("random float distribution") { + checkDistributionOf(1000f) + } + + def checkRange[T: Numeric: Generator: Choose: TypeTag: Arbitrary]: Assertion = { + forAll { (x: T, y: T) => + val ops: Numeric[T] = implicitly[Numeric[T]] + val limit: Limits[T] = Limits(x, y) + val gen: RandomT[T] = RandomRanges(limit) + val result: T = gen.randomT() + val ordered = lowerUpper(x, y) + assert(ops.gteq(result, ordered._1) && ops.lteq(result, ordered._2)) + } + } + + def checkDistributionOf[T: Numeric: Generator: Choose](range: T): Unit = { + val ops: Numeric[T] = implicitly[Numeric[T]] + import ops._ + val gen: Gen[(T, T)] = for { + x <- Gen.choose(negate(range), range) + y <- Gen.choose(range, times(range, plus(one, one))) + } yield (x, y) + forAll(gen) { case (x, y) => + assertEvenDistribution(10000, Limits(x, y)) + } + } + + def meanAndStandardDeviation[T: Numeric](xs: Seq[T]): (Double, Double) = { + val ops: Numeric[T] = implicitly[Numeric[T]] + val n: Int = xs.length + val mean: Double = ops.toDouble(xs.sum) / n + val squaredDiff: Seq[Double] = xs.map { x: T => math.pow(ops.toDouble(x) - mean, 2) } + val stdDev: Double = math.pow(squaredDiff.sum / n - 1, 0.5) + (mean, stdDev) + } + + def lowerUpper[T: Numeric](x: T, y: T): (T, T) = { + val ops: Numeric[T] = implicitly[Numeric[T]] + (ops.min(x, y), ops.max(x, y)) + } + + def midPointOf[T: Numeric : Generator](lim: Limits[T]): Double = { + val ordered: (T, T) = lowerUpper(lim.x, lim.y) + val ops: Numeric[T] = implicitly[Numeric[T]] + val range: T = ops.minus(ordered._2, ordered._1) + (ops.toDouble(range) / 2) + ops.toDouble(ordered._1) + } + + def assertEvenDistribution[T: Numeric: Generator](n: Int, lim: Limits[T]): Assertion = { + val gen: RandomT[T] = RandomRanges(lim) + val xs: Seq[T] = (0 to n).map { _: Int => gen.randomT() } + val (mean, stdDev) = meanAndStandardDeviation(xs) + val tolerance: 
Double = 4 * stdDev + val halfWay: Double = midPointOf(lim) + assert(mean > halfWay - tolerance && mean < halfWay + tolerance) + } + +} From ea945c54672ee2769ff493662605c928f4873af7 Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Sun, 7 Feb 2021 11:40:13 +0000 Subject: [PATCH 14/47] Linear numerics supported. --- .../spark/ml/tuning/ParamRandomBuilder.scala | 24 +++------------- .../ml/tuning/ParamRandomBuilderSuite.scala | 28 +++++++++++++++---- 2 files changed, 27 insertions(+), 25 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala index 166c9c9edbcc..d0edc921edc3 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala @@ -99,27 +99,11 @@ object RandomRanges { * - Evaluating Machine Learning Models by Alice Zheng * https://www.oreilly.com/library/view/evaluating-machine-learning/9781492048756/ch04.html */ -class ParamRandomBuilder { +class ParamRandomBuilder extends ParamGridBuilder { - // Java interface - - def addGrid(param: DoubleParam, values: Array[Double]): this.type = ??? - - def addGrid(param: IntParam, values: Array[Int]): this.type = ??? - - def addGrid(param: FloatParam, values: Array[Float]): this.type = ??? - - def addGrid(param: LongParam, values: Array[Long]): this.type = ??? - - // Scala interface - - def addGrid[T: RandomT](param: Param[T], values: Iterable[T]): this.type = { - ??? + def addRandom[T: Generator](param: Param[T], lim: Limits[T], n: Int): this.type = { + val gen: RandomT[T] = RandomRanges(lim) + addGrid(param, (1 to n).map { _: Int => gen.randomT() }) } - def addGrid[T](param: Param[T], values: Iterable[T]): this.type = ??? - - def build(): Array[ParamMap] = { - ??? 
- } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala index 5ee7943f7014..97fbfb3af770 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala @@ -1,17 +1,35 @@ package org.apache.spark.ml.tuning import org.apache.spark.SparkFunSuite -import org.apache.spark.ml.param.TestParams +import org.apache.spark.ml.param.{Param, ParamMap, ParamPair, TestParams} import org.scalatest.matchers.must.Matchers import org.scalatestplus.scalacheck.ScalaCheckDrivenPropertyChecks class ParamRandomBuilderSuite extends SparkFunSuite with ScalaCheckDrivenPropertyChecks with Matchers { - val solver = new TestParams() -// import solver.{inputCol, maxIter} + val solver = new TestParams() { + val randomCol: Param[Double] = new Param[Double](this, "randomVal", "randomly generated value") + } + import solver.{inputCol, maxIter, randomCol} - test("random params") { - // TODO + test("random params mixed with fixed values") { + import RandomRanges._ + val maxIterations = 10 + val basedOn: Array[ParamPair[_]] = Array(maxIter -> maxIterations) + val inputCols: Array[String] = Array("input0", "input1") + val limit: Limits[Double] = Limits(0d, 100d) + val nRandoms = 5 + val paramMap: Array[ParamMap] = new ParamRandomBuilder() + .baseOn(basedOn: _*) + .addGrid(inputCol, inputCols) + .addRandom(randomCol, limit, nRandoms) + .build() + assert(paramMap.length == inputCols.length * nRandoms * basedOn.length) + paramMap.foreach { m: ParamMap => + assert(m(maxIter) == maxIterations) + assert(inputCols contains m(inputCol)) + assert(m(randomCol) >= limit.x && m(randomCol) <= limit.y) + } } } From cb4e4b8462531e92a3a289030e6dee228975723b Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Sun, 7 Feb 2021 12:50:46 +0000 Subject: [PATCH 15/47] Checkstyle. --- .../org/apache/spark/ml/tuning/ParamRandomBuilder.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala index d0edc921edc3..ec055a6819c4 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala @@ -47,9 +47,9 @@ object RandomRanges { } private def randomBigIntIn(lower: BigDecimal, upper: BigDecimal): BigDecimal = { - val zeroCenteredRnd: BigDecimal = BigDecimal(rnd.nextDouble() - 0.5) - val range: BigDecimal = upper - lower - val halfWay: BigDecimal = lower + range / 2 + val zeroCenteredRnd: BigDecimal = BigDecimal(rnd.nextDouble() - 0.5) + val range: BigDecimal = upper - lower + val halfWay: BigDecimal = lower + range / 2 (zeroCenteredRnd * range) + halfWay } From f558060b9072c257403201318cc557bd90743ad6 Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Sun, 7 Feb 2021 13:55:43 +0000 Subject: [PATCH 16/47] Log space. 
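Why sample in log space at all: randomTLog draws the exponent uniformly and then exponentiates, so every decade of the range receives equal probability mass. For Limits(1e-6, 1.0), a plain uniform draw lands in [0.1, 1.0] roughly 90% of the time and almost never probes the small values, whereas a log draw picks among the six decades 1e-6 to 1e-5, ..., 0.1 to 1.0 uniformly, which is usually what is wanted for hyperparameters such as learning rates and regularization strengths.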
--- .../spark/ml/tuning/ParamRandomBuilder.scala | 62 ++++++++++++----- .../spark/ml/tuning/RandomRangesSuite.scala | 69 +++++++++++++++---- 2 files changed, 102 insertions(+), 29 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala index ec055a6819c4..bf1b111516c4 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala @@ -23,6 +23,7 @@ case class Limits[T: Numeric](x: T, y: T) abstract class RandomT[T: Numeric] { def randomT(): T + def randomTLog(n: Int): T } abstract class Generator[T: Numeric] { @@ -46,7 +47,7 @@ object RandomRanges { randomBigInt0To(diff) + lower } - private def randomBigIntIn(lower: BigDecimal, upper: BigDecimal): BigDecimal = { + private def randomBigDecimalBetween(lower: BigDecimal, upper: BigDecimal): BigDecimal = { val zeroCenteredRnd: BigDecimal = BigDecimal(rnd.nextDouble() - 0.5) val range: BigDecimal = upper - lower val halfWay: BigDecimal = lower + range / 2 @@ -55,40 +56,67 @@ object RandomRanges { implicit object DoubleGenerator extends Generator[Double] { def apply(limits: Limits[Double]): RandomT[Double] = new RandomT[Double] { - override def randomT(): Double = { - import limits._ - randomBigIntIn(BigDecimal(math.min(x, y)), BigDecimal(math.max(x, y))).doubleValue() - } + import limits._ + val lower: Double = math.min(x, y) + val upper: Double = math.max(x, y) + + override def randomTLog(n: Int): Double = + RandomRanges.randomLog(lower, upper, n) + + override def randomT(): Double = + randomBigDecimalBetween(BigDecimal(lower), BigDecimal(upper)).doubleValue() } } implicit object FloatGenerator extends Generator[Float] { def apply(limits: Limits[Float]): RandomT[Float] = new RandomT[Float] { - override def randomT(): Float = { - import limits._ - randomBigIntIn(BigDecimal(math.min(x, y)), BigDecimal(math.max(x, y))).floatValue() - } + import limits._ + val lower: Float = math.min(x, y) + val upper: Float = math.max(x, y) + + override def randomTLog(n: Int): Float = + RandomRanges.randomLog(lower, upper, n).toFloat + + override def randomT(): Float = + randomBigDecimalBetween(BigDecimal(lower), BigDecimal(upper)).floatValue() } } implicit object IntGenerator extends Generator[Int] { def apply(limits: Limits[Int]): RandomT[Int] = new RandomT[Int] { - override def randomT(): Int = { - import limits._ - bigIntBetween(BigInt(math.min(x, y)), BigInt(math.max(x, y))).intValue() - } + import limits._ + val lower: Int = math.min(x, y) + val upper: Int = math.max(x, y) + + override def randomTLog(n: Int): Int = + RandomRanges.randomLog(lower, upper, n).toInt + + override def randomT(): Int = + bigIntBetween(BigInt(lower), BigInt(upper)).intValue() } } implicit object LongGenerator extends Generator[Long] { def apply(limits: Limits[Long]): RandomT[Long] = new RandomT[Long] { - override def randomT(): Long = { - import limits._ - bigIntBetween(BigInt(math.min(x, y)), BigInt(math.max(x, y))).longValue() - } + import limits._ + val lower: Long = math.min(x, y) + val upper: Long = math.max(x, y) + + override def randomTLog(n: Int): Long = + RandomRanges.randomLog(lower, upper, n).toLong + + override def randomT(): Long = + bigIntBetween(BigInt(lower), BigInt(upper)).longValue() } } + def randomLog(lower: Double, upper: Double, n: Int): Double = { + val logLimits: Limits[Double] = Limits(math.log10(lower), math.log10(upper)) + val rndLogged: 
RandomT[Double] = RandomRanges(logLimits) + val rndDouble: Double = math.pow(10, rndLogged.randomT()) // TODO use log n + rndDouble + } + def apply[T: Generator](lim: Limits[T])(implicit t: Generator[T]): RandomT[T] = t(lim) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala index e940e73ac167..b94a7d38b58a 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala @@ -5,7 +5,7 @@ import org.apache.spark.SparkFunSuite import org.scalacheck.Arbitrary._ import org.scalacheck.{Arbitrary, Gen} import org.scalacheck.Gen.Choose -import org.scalatest.Assertion +import org.scalatest.{Assertion, Succeeded} import org.scalatest.matchers.must.Matchers import org.scalatestplus.scalacheck.ScalaCheckDrivenPropertyChecks @@ -13,12 +13,29 @@ class RandomRangesSuite extends SparkFunSuite with ScalaCheckDrivenPropertyCheck import RandomRanges._ + test("random doubles in log space") { + val gen: Gen[(Double, Double)] = for { + x <- Gen.choose(0d, Double.MaxValue) + y <- Gen.choose(0d, Double.MaxValue) + } yield (x, y) + forAll(gen) { case (x, y) => + val lower = math.min(x, y) + val upper = math.max(x, y) + val result = randomLog(x, y, 10) + assert(result >= lower && result <= upper) + } + } + test("random BigInt generation does not go into infinite loop") { assert(randomBigInt0To(0) == BigInt(0)) } test("random ints") { - checkRange[Int] + checkRange(Linear[Int]) + } + + test("random log ints") { + checkRange(Log10[Int]) } test("random int distribution") { @@ -26,7 +43,11 @@ class RandomRangesSuite extends SparkFunSuite with ScalaCheckDrivenPropertyCheck } test("random longs") { - checkRange[Long] + checkRange(Linear[Long]) + } + + test("random log longs") { + checkRange(Log10[Long]) } test("random long distribution") { @@ -34,7 +55,11 @@ class RandomRangesSuite extends SparkFunSuite with ScalaCheckDrivenPropertyCheck } test("random doubles") { - checkRange[Double] + checkRange(Linear[Double]) + } + + test("random log doubles") { + checkRange(Log10[Double]) } test("random double distribution") { @@ -42,21 +67,41 @@ class RandomRangesSuite extends SparkFunSuite with ScalaCheckDrivenPropertyCheck } test("random floats") { - checkRange[Float] + checkRange(Linear[Float]) + } + + test("random log floats") { + checkRange(Log10[Float]) } test("random float distribution") { checkDistributionOf(1000f) } - def checkRange[T: Numeric: Generator: Choose: TypeTag: Arbitrary]: Assertion = { + abstract class RandomFn[T: Numeric: Generator] { + def apply(genRandom: RandomT[T]): T = genRandom.randomT() + def appropriate(t: T): Boolean + } + def Linear[T: Numeric: Generator]: RandomFn[T] = new RandomFn { + override def apply(genRandom: RandomT[T]): T = genRandom.randomT() + override def appropriate(t: T): Boolean = true + } + def Log10[T: Numeric: Generator]: RandomFn[T] = new RandomFn { + override def apply(genRandom: RandomT[T]): T = genRandom.randomTLog(10) + val ops: Numeric[T] = implicitly[Numeric[T]] + override def appropriate(t: T): Boolean = ops.gt(t, ops.zero) + } + + def checkRange[T: Numeric: Generator: Choose: TypeTag: Arbitrary](rand: RandomFn[T]): Assertion = { forAll { (x: T, y: T) => - val ops: Numeric[T] = implicitly[Numeric[T]] - val limit: Limits[T] = Limits(x, y) - val gen: RandomT[T] = RandomRanges(limit) - val result: T = gen.randomT() - val ordered = lowerUpper(x, y) - assert(ops.gteq(result, 
ordered._1) && ops.lteq(result, ordered._2)) + if (rand.appropriate(x) && rand.appropriate(y)) { + val ops: Numeric[T] = implicitly[Numeric[T]] + val limit: Limits[T] = Limits(x, y) + val gen: RandomT[T] = RandomRanges(limit) + val result: T = rand(gen) + val ordered = lowerUpper(x, y) + assert(ops.gteq(result, ordered._1) && ops.lteq(result, ordered._2)) + } else Succeeded } } From d0c8eaaf7d7e07df705055c87a07215296357e6e Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Sun, 7 Feb 2021 13:58:04 +0000 Subject: [PATCH 17/47] Log space. --- .../org/apache/spark/ml/tuning/ParamRandomBuilder.scala | 6 ++++-- .../org/apache/spark/ml/tuning/RandomRangesSuite.scala | 9 +++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala index bf1b111516c4..8f1607ea7939 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala @@ -111,9 +111,11 @@ object RandomRanges { } def randomLog(lower: Double, upper: Double, n: Int): Double = { - val logLimits: Limits[Double] = Limits(math.log10(lower), math.log10(upper)) + val logLower: Double = math.log10(lower) + val logUpper: Double = math.log10(upper) + val logLimits: Limits[Double] = Limits(logLower, logUpper) val rndLogged: RandomT[Double] = RandomRanges(logLimits) - val rndDouble: Double = math.pow(10, rndLogged.randomT()) // TODO use log n + val rndDouble: Double = math.pow(10, rndLogged.randomT()) rndDouble } diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala index b94a7d38b58a..e7f5decf0b44 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala @@ -14,14 +14,15 @@ class RandomRangesSuite extends SparkFunSuite with ScalaCheckDrivenPropertyCheck import RandomRanges._ test("random doubles in log space") { - val gen: Gen[(Double, Double)] = for { + val gen: Gen[(Double, Double, Int)] = for { x <- Gen.choose(0d, Double.MaxValue) y <- Gen.choose(0d, Double.MaxValue) - } yield (x, y) - forAll(gen) { case (x, y) => + n <- Gen.choose(0, Int.MaxValue) + } yield (x, y, n) + forAll(gen) { case (x, y, n) => val lower = math.min(x, y) val upper = math.max(x, y) - val result = randomLog(x, y, 10) + val result = randomLog(x, y, n) assert(result >= lower && result <= upper) } } From affb9e469404a984f5fa4292391814543b29b1d8 Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Sun, 7 Feb 2021 14:18:31 +0000 Subject: [PATCH 18/47] Logarithm for any base. Non deterministic bug...? 
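For the distribution itself, the choice of base cancels out: log_n(x) = ln(x) / ln(n) is a linear rescaling of ln(x), a uniform variable stays uniform under linear maps, and n^u = e^(u * ln(n)) maps the sample back, so

    pow(n, U(log_n(a), log_n(b)))   has the same law as   exp(U(ln(a), ln(b)))

for every base n > 1. The base parameter therefore only changes the floating-point rounding along the way, not which distribution is sampled.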
--- .../org/apache/spark/ml/tuning/ParamRandomBuilder.scala | 9 +++++---- .../org/apache/spark/ml/tuning/RandomRangesSuite.scala | 6 ++++++ 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala index 8f1607ea7939..385a9f6b0115 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala @@ -110,13 +110,14 @@ object RandomRanges { } } + def logN(x: Double, base: Int): Double = math.log(x) / math.log(base) + def randomLog(lower: Double, upper: Double, n: Int): Double = { - val logLower: Double = math.log10(lower) - val logUpper: Double = math.log10(upper) + val logLower: Double = logN(lower, n) + val logUpper: Double = logN(upper, n) val logLimits: Limits[Double] = Limits(logLower, logUpper) val rndLogged: RandomT[Double] = RandomRanges(logLimits) - val rndDouble: Double = math.pow(10, rndLogged.randomT()) - rndDouble + math.pow(n, rndLogged.randomT()) } def apply[T: Generator](lim: Limits[T])(implicit t: Generator[T]): RandomT[T] = t(lim) diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala index e7f5decf0b44..01666bc24820 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala @@ -13,6 +13,12 @@ class RandomRangesSuite extends SparkFunSuite with ScalaCheckDrivenPropertyCheck import RandomRanges._ + test("log of any base") { + assert(logN(16, 4) == 2d) + assert(logN(1000, 10) === (3d +- 0.000001)) + assert(logN(256, 2) == 8d) + } + test("random doubles in log space") { val gen: Gen[(Double, Double, Int)] = for { x <- Gen.choose(0d, Double.MaxValue) From 7808b7cf5151f09829a1caf2171702b7f4fa3747 Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Sun, 7 Feb 2021 14:31:47 +0000 Subject: [PATCH 19/47] Logarithm for any base. Non deterministic bug...? --- .../scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala index 01666bc24820..76faa2af77b5 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala @@ -107,6 +107,7 @@ class RandomRangesSuite extends SparkFunSuite with ScalaCheckDrivenPropertyCheck val gen: RandomT[T] = RandomRanges(limit) val result: T = rand(gen) val ordered = lowerUpper(x, y) + println(s"result = $result [${result.getClass}], ordered._1 = ${ordered._1} [${ordered._1.getClass}], , ordered._1 = ${ordered._2} [${ordered._2.getClass}]") assert(ops.gteq(result, ordered._1) && ops.lteq(result, ordered._2)) } else Succeeded } From b433ab1e49674ebce64f068df081c43d4f6bafd5 Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Mon, 8 Feb 2021 09:13:58 +0000 Subject: [PATCH 20/47] Still a problem with double conversion. 
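The log-space property tests still fail intermittently. While
investigating, the test guard is tightened: log sampling only makes
sense for strictly positive endpoints, and a degenerate range where
x == y leaves nothing to sample, so `appropriate` now filters on both
endpoints. A quick REPL check of why positivity matters:

    math.log10(0.0)   // -Infinity
    math.log10(-1.0)  // NaN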
--- .../spark/ml/tuning/RandomRangesSuite.scala | 23 +++++++++++++++---- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala index 76faa2af77b5..183b1edafe91 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala @@ -33,6 +33,17 @@ class RandomRangesSuite extends SparkFunSuite with ScalaCheckDrivenPropertyCheck } } + test("natural numbers to log range") { + checkCornerCase(Limits(Long.MaxValue - 1L, Long.MaxValue)) + checkCornerCase(Limits(Int.MaxValue - 1L, Int.MaxValue)) + } + + private def checkCornerCase(extreme: Limits[Long]): Assertion = { + val gen = RandomRanges(extreme) + val result = gen.randomTLog(10) + assert(result >= extreme.x && result <= extreme.y) + } + test("random BigInt generation does not go into infinite loop") { assert(randomBigInt0To(0) == BigInt(0)) } @@ -87,27 +98,29 @@ class RandomRangesSuite extends SparkFunSuite with ScalaCheckDrivenPropertyCheck abstract class RandomFn[T: Numeric: Generator] { def apply(genRandom: RandomT[T]): T = genRandom.randomT() - def appropriate(t: T): Boolean + def appropriate(x: T, y: T): Boolean } def Linear[T: Numeric: Generator]: RandomFn[T] = new RandomFn { override def apply(genRandom: RandomT[T]): T = genRandom.randomT() - override def appropriate(t: T): Boolean = true + override def appropriate(x: T, y: T): Boolean = true } def Log10[T: Numeric: Generator]: RandomFn[T] = new RandomFn { override def apply(genRandom: RandomT[T]): T = genRandom.randomTLog(10) val ops: Numeric[T] = implicitly[Numeric[T]] - override def appropriate(t: T): Boolean = ops.gt(t, ops.zero) + override def appropriate(x: T, y: T): Boolean = { + ops.gt(x, ops.zero) && ops.gt(y, ops.zero) && x != y + } } def checkRange[T: Numeric: Generator: Choose: TypeTag: Arbitrary](rand: RandomFn[T]): Assertion = { forAll { (x: T, y: T) => - if (rand.appropriate(x) && rand.appropriate(y)) { + if (rand.appropriate(x, y)) { val ops: Numeric[T] = implicitly[Numeric[T]] val limit: Limits[T] = Limits(x, y) val gen: RandomT[T] = RandomRanges(limit) val result: T = rand(gen) val ordered = lowerUpper(x, y) - println(s"result = $result [${result.getClass}], ordered._1 = ${ordered._1} [${ordered._1.getClass}], , ordered._1 = ${ordered._2} [${ordered._2.getClass}]") + println(s"result = $result, ordered = $ordered, x = $x, y = $y, ${x == y}") assert(ops.gteq(result, ordered._1) && ops.lteq(result, ordered._2)) } else Succeeded } From 844f706fd17c6a45eaee0e28a5d5c36f99a766be Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Mon, 8 Feb 2021 09:37:21 +0000 Subject: [PATCH 21/47] Extreme Long/Int ranges may cause trouble with being converted to a double for log-space. I guess this is just highly unlikely but not impossible. 
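This removes the corner-case tests added in the previous commit: they
fail not because the generator is broken but because Long values near
Long.MaxValue cannot survive the round trip through Double that
log-space sampling requires. A small demonstration of the underlying
precision loss (illustrative, not part of the patch):

    val x: Long = Long.MaxValue - 1L
    // Double has a 53-bit mantissa, so x.toDouble rounds to 2^63;
    // converting back to Long then saturates at Long.MaxValue.
    val roundTripped: Long = x.toDouble.toLong
    assert(roundTripped != x)              // precision lost
    assert(roundTripped == Long.MaxValue)  // saturated at the maximum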
--- .../apache/spark/ml/tuning/RandomRangesSuite.scala | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala index 183b1edafe91..5a6d8a77dd38 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala @@ -33,17 +33,6 @@ class RandomRangesSuite extends SparkFunSuite with ScalaCheckDrivenPropertyCheck } } - test("natural numbers to log range") { - checkCornerCase(Limits(Long.MaxValue - 1L, Long.MaxValue)) - checkCornerCase(Limits(Int.MaxValue - 1L, Int.MaxValue)) - } - - private def checkCornerCase(extreme: Limits[Long]): Assertion = { - val gen = RandomRanges(extreme) - val result = gen.randomTLog(10) - assert(result >= extreme.x && result <= extreme.y) - } - test("random BigInt generation does not go into infinite loop") { assert(randomBigInt0To(0) == BigInt(0)) } From 3d03565842bb62ae172b7308d4fc7f3eee32e710 Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Mon, 8 Feb 2021 09:38:37 +0000 Subject: [PATCH 22/47] Restored a tag that was making my IntelliJ upset. --- .../scala/org/apache/spark/ml/tuning/ParamGridBuilder.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamGridBuilder.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamGridBuilder.scala index 231f1b981a63..d369e7a61cdc 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamGridBuilder.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamGridBuilder.scala @@ -27,7 +27,7 @@ import org.apache.spark.ml.param._ * Builder for a param grid used in grid search-based model selection. */ @Since("1.2.0") -class ParamGridBuilder { +class ParamGridBuilder @Since("1.2.0") { private val paramGrid = mutable.Map.empty[Param[_], Iterable[_]] From 27b323eebcc147884a1b36b4d1c108cf1bfd2010 Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Mon, 8 Feb 2021 09:40:50 +0000 Subject: [PATCH 23/47] Removed test println. --- .../scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala index 5a6d8a77dd38..e559c60045c1 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala @@ -109,7 +109,6 @@ class RandomRangesSuite extends SparkFunSuite with ScalaCheckDrivenPropertyCheck val gen: RandomT[T] = RandomRanges(limit) val result: T = rand(gen) val ordered = lowerUpper(x, y) - println(s"result = $result, ordered = $ordered, x = $x, y = $y, ${x == y}") assert(ops.gteq(result, ordered._1) && ops.lteq(result, ordered._2)) } else Succeeded } From ef7bfd75a5dd2418bcbd05d86ecfe4cabac04b83 Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Tue, 9 Feb 2021 09:54:52 +0000 Subject: [PATCH 24/47] Commit re. Hyperopt and its ilk. 
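For context on the "60 random observations" figure quoted in the
scaladoc: each independent draw misses the top 5% of the search space
with probability 0.95, so the chance that all n draws miss it is
0.95^n. Requiring that to be at most 5% gives
n >= log(0.05) / log(0.95), roughly 59, hence "about 60". A one-liner
to check the arithmetic:

    val n = math.ceil(math.log(0.05) / math.log(0.95))  // 59.0
    val pAtLeastOneHit = 1 - math.pow(0.95, 60)         // ~0.954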
--- .../scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala index 385a9f6b0115..84a6d8aea039 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala @@ -129,6 +129,8 @@ object RandomRanges { * observations lies within the top 5% of the true maximum, with 95% probability" * - Evaluating Machine Learning Models by Alice Zheng * https://www.oreilly.com/library/view/evaluating-machine-learning/9781492048756/ch04.html + * + * Note: if you want more sophisticated hyperparameter tuning, consider Python libraries such as Hyperopt. */ class ParamRandomBuilder extends ParamGridBuilder { From 0de1ba5c6d20839f835fda90903c719b6ea0da2e Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Tue, 9 Feb 2021 10:21:43 +0000 Subject: [PATCH 25/47] @Since tags added. --- .../scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala index 84a6d8aea039..56eaddf3527d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala @@ -17,6 +17,7 @@ package org.apache.spark.ml.tuning +import org.apache.spark.annotation.Since import org.apache.spark.ml.param._ case class Limits[T: Numeric](x: T, y: T) @@ -132,8 +133,9 @@ object RandomRanges { * * Note: if you want more sophisticated hyperparameter tuning, consider Python libraries such as Hyperopt. */ +@Since("3.1.0") class ParamRandomBuilder extends ParamGridBuilder { - + @Since("3.1.0") def addRandom[T: Generator](param: Param[T], lim: Limits[T], n: Int): this.type = { val gen: RandomT[T] = RandomRanges(lim) addGrid(param, (1 to n).map { _: Int => gen.randomT() }) From 9e7b7bd08849b0feba5239e820a72d98adeeb027 Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Tue, 9 Feb 2021 10:51:25 +0000 Subject: [PATCH 26/47] Code style. --- .../scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala index 56eaddf3527d..981c502128f0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala @@ -131,7 +131,8 @@ object RandomRanges { * - Evaluating Machine Learning Models by Alice Zheng * https://www.oreilly.com/library/view/evaluating-machine-learning/9781492048756/ch04.html * - * Note: if you want more sophisticated hyperparameter tuning, consider Python libraries such as Hyperopt. + * Note: if you want more sophisticated hyperparameter tuning, consider Python libraries + * such as Hyperopt. */ @Since("3.1.0") class ParamRandomBuilder extends ParamGridBuilder { From fd8ac9f4ff2b7bfcb297b2072c4749eee9dbcfd0 Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Tue, 9 Feb 2021 11:27:56 +0000 Subject: [PATCH 27/47] Code style. 
--- .../ml/tuning/ParamRandomBuilderSuite.scala | 33 ++++++++-- .../spark/ml/tuning/RandomRangesSuite.scala | 66 ++++++++++++------- 2 files changed, 68 insertions(+), 31 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala index 97fbfb3af770..8a65d6aab7ec 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala @@ -1,11 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.spark.ml.tuning import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.{Param, ParamMap, ParamPair, TestParams} + import org.scalatest.matchers.must.Matchers import org.scalatestplus.scalacheck.ScalaCheckDrivenPropertyChecks -class ParamRandomBuilderSuite extends SparkFunSuite with ScalaCheckDrivenPropertyChecks with Matchers { +class ParamRandomBuilderSuite extends SparkFunSuite with ScalaCheckDrivenPropertyChecks + with Matchers { val solver = new TestParams() { val randomCol: Param[Double] = new Param[Double](this, "randomVal", "randomly generated value") @@ -14,12 +33,12 @@ class ParamRandomBuilderSuite extends SparkFunSuite with ScalaCheckDrivenPropert test("random params mixed with fixed values") { import RandomRanges._ - val maxIterations = 10 - val basedOn: Array[ParamPair[_]] = Array(maxIter -> maxIterations) - val inputCols: Array[String] = Array("input0", "input1") - val limit: Limits[Double] = Limits(0d, 100d) - val nRandoms = 5 - val paramMap: Array[ParamMap] = new ParamRandomBuilder() + val maxIterations: Int = 10 + val basedOn: Array[ParamPair[_]] = Array(maxIter -> maxIterations) + val inputCols: Array[String] = Array("input0", "input1") + val limit: Limits[Double] = Limits(0d, 100d) + val nRandoms: Int = 5 + val paramMap: Array[ParamMap] = new ParamRandomBuilder() .baseOn(basedOn: _*) .addGrid(inputCol, inputCols) .addRandom(randomCol, limit, nRandoms) diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala index e559c60045c1..f0f42476c641 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala @@ -1,14 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.spark.ml.tuning -import scala.reflect.runtime.universe.TypeTag import org.apache.spark.SparkFunSuite + import org.scalacheck.Arbitrary._ -import org.scalacheck.{Arbitrary, Gen} import org.scalacheck.Gen.Choose -import org.scalatest.{Assertion, Succeeded} +import org.scalacheck.{Arbitrary, Gen} import org.scalatest.matchers.must.Matchers +import org.scalatest.{Assertion, Succeeded} import org.scalatestplus.scalacheck.ScalaCheckDrivenPropertyChecks +import scala.reflect.runtime.universe.TypeTag + class RandomRangesSuite extends SparkFunSuite with ScalaCheckDrivenPropertyChecks with Matchers { import RandomRanges._ @@ -101,18 +120,17 @@ class RandomRangesSuite extends SparkFunSuite with ScalaCheckDrivenPropertyCheck } } - def checkRange[T: Numeric: Generator: Choose: TypeTag: Arbitrary](rand: RandomFn[T]): Assertion = { + def checkRange[T: Numeric: Generator: Choose: TypeTag: Arbitrary](rand: RandomFn[T]): Assertion = forAll { (x: T, y: T) => if (rand.appropriate(x, y)) { - val ops: Numeric[T] = implicitly[Numeric[T]] - val limit: Limits[T] = Limits(x, y) - val gen: RandomT[T] = RandomRanges(limit) - val result: T = rand(gen) - val ordered = lowerUpper(x, y) + val ops: Numeric[T] = implicitly[Numeric[T]] + val limit: Limits[T] = Limits(x, y) + val gen: RandomT[T] = RandomRanges(limit) + val result: T = rand(gen) + val ordered: (T, T) = lowerUpper(x, y) assert(ops.gteq(result, ordered._1) && ops.lteq(result, ordered._2)) } else Succeeded } - } def checkDistributionOf[T: Numeric: Generator: Choose](range: T): Unit = { val ops: Numeric[T] = implicitly[Numeric[T]] @@ -127,32 +145,32 @@ class RandomRangesSuite extends SparkFunSuite with ScalaCheckDrivenPropertyCheck } def meanAndStandardDeviation[T: Numeric](xs: Seq[T]): (Double, Double) = { - val ops: Numeric[T] = implicitly[Numeric[T]] - val n: Int = xs.length - val mean: Double = ops.toDouble(xs.sum) / n - val squaredDiff: Seq[Double] = xs.map { x: T => math.pow(ops.toDouble(x) - mean, 2) } - val stdDev: Double = math.pow(squaredDiff.sum / n - 1, 0.5) + val ops: Numeric[T] = implicitly[Numeric[T]] + val n: Int = xs.length + val mean: Double = ops.toDouble(xs.sum) / n + val squaredDiff: Seq[Double] = xs.map { x: T => math.pow(ops.toDouble(x) - mean, 2) } + val stdDev: Double = math.pow(squaredDiff.sum / n - 1, 0.5) (mean, stdDev) } def lowerUpper[T: Numeric](x: T, y: T): (T, T) = { - val ops: Numeric[T] = implicitly[Numeric[T]] + val ops: Numeric[T] = implicitly[Numeric[T]] (ops.min(x, y), ops.max(x, y)) } def midPointOf[T: Numeric : Generator](lim: Limits[T]): Double = { - val ordered: (T, T) = lowerUpper(lim.x, lim.y) - val ops: Numeric[T] = implicitly[Numeric[T]] - val range: T = ops.minus(ordered._2, ordered._1) + val ordered: (T, T) = lowerUpper(lim.x, lim.y) + val ops: Numeric[T] = implicitly[Numeric[T]] + val range: T = ops.minus(ordered._2, ordered._1) (ops.toDouble(range) / 2) + ops.toDouble(ordered._1) } def assertEvenDistribution[T: 
Numeric: Generator](n: Int, lim: Limits[T]): Assertion = { - val gen: RandomT[T] = RandomRanges(lim) - val xs: Seq[T] = (0 to n).map { _: Int => gen.randomT() } - val (mean, stdDev) = meanAndStandardDeviation(xs) - val tolerance: Double = 4 * stdDev - val halfWay: Double = midPointOf(lim) + val gen: RandomT[T] = RandomRanges(lim) + val xs: Seq[T] = (0 to n).map { _: Int => gen.randomT() } + val (mean, stdDev) = meanAndStandardDeviation(xs) + val tolerance: Double = 4 * stdDev + val halfWay: Double = midPointOf(lim) assert(mean > halfWay - tolerance && mean < halfWay + tolerance) } From e402df588e0d3fa0f940168025e8093b1098dd40 Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Tue, 9 Feb 2021 12:19:43 +0000 Subject: [PATCH 28/47] Code style. --- .../apache/spark/ml/tuning/ParamRandomBuilderSuite.scala | 6 +++--- .../org/apache/spark/ml/tuning/RandomRangesSuite.scala | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala index 8a65d6aab7ec..cb0cffde4b38 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala @@ -17,12 +17,12 @@ package org.apache.spark.ml.tuning -import org.apache.spark.SparkFunSuite -import org.apache.spark.ml.param.{Param, ParamMap, ParamPair, TestParams} - import org.scalatest.matchers.must.Matchers import org.scalatestplus.scalacheck.ScalaCheckDrivenPropertyChecks +import org.apache.spark.SparkFunSuite +import org.apache.spark.ml.param.{Param, ParamMap, ParamPair, TestParams} + class ParamRandomBuilderSuite extends SparkFunSuite with ScalaCheckDrivenPropertyChecks with Matchers { diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala index f0f42476c641..87df203c6f34 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.ml.tuning -import org.apache.spark.SparkFunSuite +import scala.reflect.runtime.universe.TypeTag import org.scalacheck.Arbitrary._ import org.scalacheck.Gen.Choose @@ -26,7 +26,7 @@ import org.scalatest.matchers.must.Matchers import org.scalatest.{Assertion, Succeeded} import org.scalatestplus.scalacheck.ScalaCheckDrivenPropertyChecks -import scala.reflect.runtime.universe.TypeTag +import org.apache.spark.SparkFunSuite class RandomRangesSuite extends SparkFunSuite with ScalaCheckDrivenPropertyChecks with Matchers { From e54a58ff84dbe464eb94c92270866b0502b00c26 Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Tue, 9 Feb 2021 12:47:57 +0000 Subject: [PATCH 29/47] Code style. 
--- .../scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala index 87df203c6f34..c0b922a54149 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala @@ -20,10 +20,10 @@ package org.apache.spark.ml.tuning import scala.reflect.runtime.universe.TypeTag import org.scalacheck.Arbitrary._ -import org.scalacheck.Gen.Choose import org.scalacheck.{Arbitrary, Gen} -import org.scalatest.matchers.must.Matchers +import org.scalacheck.Gen.Choose import org.scalatest.{Assertion, Succeeded} +import org.scalatest.matchers.must.Matchers import org.scalatestplus.scalacheck.ScalaCheckDrivenPropertyChecks import org.apache.spark.SparkFunSuite From b0455a18c4e8157f781cf92da488da0937eac3a3 Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Tue, 9 Feb 2021 13:14:15 +0000 Subject: [PATCH 30/47] Code style. --- .../scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala index c0b922a54149..85ffb4553d26 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala @@ -19,8 +19,8 @@ package org.apache.spark.ml.tuning import scala.reflect.runtime.universe.TypeTag -import org.scalacheck.Arbitrary._ import org.scalacheck.{Arbitrary, Gen} +import org.scalacheck.Arbitrary._ import org.scalacheck.Gen.Choose import org.scalatest.{Assertion, Succeeded} import org.scalatest.matchers.must.Matchers From f641d51abc0f6cd0eb076e78beebe5d6fac22b5b Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Tue, 9 Feb 2021 13:33:43 +0000 Subject: [PATCH 31/47] Superfluous parentheses. 
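Per the Scala style guide, arity-0 methods that are pure accessors are
invoked without parentheses (so bigDecimal.doubleValue rather than
doubleValue()); empty parentheses are conventionally reserved for
methods with side effects.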
--- .../org/apache/spark/ml/tuning/ParamRandomBuilder.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala index 981c502128f0..4772160a733f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala @@ -65,7 +65,7 @@ object RandomRanges { RandomRanges.randomLog(lower, upper, n) override def randomT(): Double = - randomBigDecimalBetween(BigDecimal(lower), BigDecimal(upper)).doubleValue() + randomBigDecimalBetween(BigDecimal(lower), BigDecimal(upper)).doubleValue } } @@ -79,7 +79,7 @@ object RandomRanges { RandomRanges.randomLog(lower, upper, n).toFloat override def randomT(): Float = - randomBigDecimalBetween(BigDecimal(lower), BigDecimal(upper)).floatValue() + randomBigDecimalBetween(BigDecimal(lower), BigDecimal(upper)).floatValue } } @@ -93,7 +93,7 @@ object RandomRanges { RandomRanges.randomLog(lower, upper, n).toInt override def randomT(): Int = - bigIntBetween(BigInt(lower), BigInt(upper)).intValue() + bigIntBetween(BigInt(lower), BigInt(upper)).intValue } } @@ -107,7 +107,7 @@ object RandomRanges { RandomRanges.randomLog(lower, upper, n).toLong override def randomT(): Long = - bigIntBetween(BigInt(lower), BigInt(upper)).longValue() + bigIntBetween(BigInt(lower), BigInt(upper)).longValue } } From 86a781df75b0604f80e6171888cd44cad2c0f200 Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Fri, 12 Feb 2021 09:01:43 +0000 Subject: [PATCH 32/47] [SPARK-34415][ML] Made private anything that wasn't and that was not public facing. --- .../spark/ml/tuning/ParamRandomBuilder.scala | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala index 4772160a733f..929a9e3e773d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala @@ -20,20 +20,20 @@ package org.apache.spark.ml.tuning import org.apache.spark.annotation.Since import org.apache.spark.ml.param._ -case class Limits[T: Numeric](x: T, y: T) +private[ml] case class Limits[T: Numeric](x: T, y: T) -abstract class RandomT[T: Numeric] { +private[ml] abstract class RandomT[T: Numeric] { def randomT(): T def randomTLog(n: Int): T } -abstract class Generator[T: Numeric] { +private[ml] abstract class Generator[T: Numeric] { def apply(lim: Limits[T]): RandomT[T] } -object RandomRanges { +private[ml] object RandomRanges { - val rnd = new scala.util.Random + private val rnd = new scala.util.Random private[tuning] def randomBigInt0To(x: BigInt): BigInt = { var randVal = BigInt(x.bitLength, rnd) @@ -43,7 +43,7 @@ object RandomRanges { randVal } - def bigIntBetween(lower: BigInt, upper: BigInt): BigInt = { + private[ml] def bigIntBetween(lower: BigInt, upper: BigInt): BigInt = { val diff: BigInt = upper - lower randomBigInt0To(diff) + lower } @@ -55,7 +55,7 @@ object RandomRanges { (zeroCenteredRnd * range) + halfWay } - implicit object DoubleGenerator extends Generator[Double] { + private[ml] implicit object DoubleGenerator extends Generator[Double] { def apply(limits: Limits[Double]): RandomT[Double] = new RandomT[Double] { import limits._ val lower: Double = math.min(x, y) @@ -69,7 +69,7 @@ 
object RandomRanges { } } - implicit object FloatGenerator extends Generator[Float] { + private[ml] implicit object FloatGenerator extends Generator[Float] { def apply(limits: Limits[Float]): RandomT[Float] = new RandomT[Float] { import limits._ val lower: Float = math.min(x, y) @@ -83,7 +83,7 @@ object RandomRanges { } } - implicit object IntGenerator extends Generator[Int] { + private[ml] implicit object IntGenerator extends Generator[Int] { def apply(limits: Limits[Int]): RandomT[Int] = new RandomT[Int] { import limits._ val lower: Int = math.min(x, y) @@ -97,7 +97,7 @@ object RandomRanges { } } - implicit object LongGenerator extends Generator[Long] { + private[ml] implicit object LongGenerator extends Generator[Long] { def apply(limits: Limits[Long]): RandomT[Long] = new RandomT[Long] { import limits._ val lower: Long = math.min(x, y) @@ -111,9 +111,9 @@ object RandomRanges { } } - def logN(x: Double, base: Int): Double = math.log(x) / math.log(base) + private[ml] def logN(x: Double, base: Int): Double = math.log(x) / math.log(base) - def randomLog(lower: Double, upper: Double, n: Int): Double = { + private[ml] def randomLog(lower: Double, upper: Double, n: Int): Double = { val logLower: Double = logN(lower, n) val logUpper: Double = logN(upper, n) val logLimits: Limits[Double] = Limits(logLower, logUpper) @@ -121,7 +121,7 @@ object RandomRanges { math.pow(n, rndLogged.randomT()) } - def apply[T: Generator](lim: Limits[T])(implicit t: Generator[T]): RandomT[T] = t(lim) + private[ml] def apply[T: Generator](lim: Limits[T])(implicit t: Generator[T]): RandomT[T] = t(lim) } @@ -134,9 +134,9 @@ object RandomRanges { * Note: if you want more sophisticated hyperparameter tuning, consider Python libraries * such as Hyperopt. */ -@Since("3.1.0") +@Since("3.2.0") class ParamRandomBuilder extends ParamGridBuilder { - @Since("3.1.0") + @Since("3.2.0") def addRandom[T: Generator](param: Param[T], lim: Limits[T], n: Int): this.type = { val gen: RandomT[T] = RandomRanges(lim) addGrid(param, (1 to n).map { _: Int => gen.randomT() }) From b805ea523bd77fcefb6b4a1d1b8473e06c1365b7 Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Fri, 12 Feb 2021 12:13:40 +0000 Subject: [PATCH 33/47] [SPARK-34415][ML] Oops. The user needs Limits and added log methods. 
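Limits, Generator and the implicits in RandomRanges are part of the
user-facing API: callers construct the ranges themselves, so none of
them can be private[ml]. A sketch of the intended call site (the
LinearRegression estimator here is purely illustrative):

    import org.apache.spark.ml.regression.LinearRegression
    import org.apache.spark.ml.tuning.{Limits, ParamRandomBuilder}
    import org.apache.spark.ml.tuning.RandomRanges._

    val lr = new LinearRegression()
    val paramMaps = new ParamRandomBuilder()
      .addRandom(lr.elasticNetParam, Limits(0.0, 1.0), 5)
      .addLog10Random(lr.regParam, Limits(1e-4, 1.0), 5)
      .build()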
--- .../spark/ml/tuning/ParamRandomBuilder.scala | 28 ++++++++++++++----- .../ml/tuning/ParamRandomBuilderSuite.scala | 25 +++++++++++++---- 2 files changed, 40 insertions(+), 13 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala index 929a9e3e773d..33c88ff88c5d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala @@ -20,18 +20,18 @@ package org.apache.spark.ml.tuning import org.apache.spark.annotation.Since import org.apache.spark.ml.param._ -private[ml] case class Limits[T: Numeric](x: T, y: T) +case class Limits[T: Numeric](x: T, y: T) private[ml] abstract class RandomT[T: Numeric] { def randomT(): T def randomTLog(n: Int): T } -private[ml] abstract class Generator[T: Numeric] { +abstract class Generator[T: Numeric] { def apply(lim: Limits[T]): RandomT[T] } -private[ml] object RandomRanges { +object RandomRanges { private val rnd = new scala.util.Random @@ -55,7 +55,7 @@ private[ml] object RandomRanges { (zeroCenteredRnd * range) + halfWay } - private[ml] implicit object DoubleGenerator extends Generator[Double] { + implicit object DoubleGenerator extends Generator[Double] { def apply(limits: Limits[Double]): RandomT[Double] = new RandomT[Double] { import limits._ val lower: Double = math.min(x, y) @@ -69,7 +69,7 @@ private[ml] object RandomRanges { } } - private[ml] implicit object FloatGenerator extends Generator[Float] { + implicit object FloatGenerator extends Generator[Float] { def apply(limits: Limits[Float]): RandomT[Float] = new RandomT[Float] { import limits._ val lower: Float = math.min(x, y) @@ -83,7 +83,7 @@ private[ml] object RandomRanges { } } - private[ml] implicit object IntGenerator extends Generator[Int] { + implicit object IntGenerator extends Generator[Int] { def apply(limits: Limits[Int]): RandomT[Int] = new RandomT[Int] { import limits._ val lower: Int = math.min(x, y) @@ -97,7 +97,7 @@ private[ml] object RandomRanges { } } - private[ml] implicit object LongGenerator extends Generator[Long] { + implicit object LongGenerator extends Generator[Long] { def apply(limits: Limits[Long]): RandomT[Long] = new RandomT[Long] { import limits._ val lower: Long = math.min(x, y) @@ -142,4 +142,18 @@ class ParamRandomBuilder extends ParamGridBuilder { addGrid(param, (1 to n).map { _: Int => gen.randomT() }) } + @Since("3.2.0") + def addLog10Random[T: Generator](param: Param[T], lim: Limits[T], n: Int): this.type = + addLogRandom(param, lim, n, 2) + + @Since("3.2.0") + def addLog2Random[T: Generator](param: Param[T], lim: Limits[T], n: Int): this.type = + addLogRandom(param, lim, n, 2) + + private def addLogRandom[T: Generator](param: Param[T], lim: Limits[T], + n: Int, base: Int): this.type = { + val gen: RandomT[T] = RandomRanges(lim) + addGrid(param, (1 to n).map { _: Int => gen.randomTLog(base) }) + } + } diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala index cb0cffde4b38..7ef335af80e2 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala @@ -31,18 +31,31 @@ class ParamRandomBuilderSuite extends SparkFunSuite with ScalaCheckDrivenPropert } import solver.{inputCol, maxIter, randomCol} - test("random params 
mixed with fixed values") { + val limit: Limits[Double] = Limits(1d, 100d) + val nRandoms: Int = 5 + + test("random linear params mixed with fixed values") { + import RandomRanges._ + checkRangeAndCardinality(_.addRandom(randomCol, limit, nRandoms)) + } + test("random log2 params mixed with fixed values") { + import RandomRanges._ + checkRangeAndCardinality(_.addLog2Random(randomCol, limit, nRandoms)) + } + test("random log10 params mixed with fixed values") { import RandomRanges._ + checkRangeAndCardinality(_.addLog10Random(randomCol, limit, nRandoms)) + } + + def checkRangeAndCardinality(addFn: ParamRandomBuilder => ParamRandomBuilder): Unit = { val maxIterations: Int = 10 val basedOn: Array[ParamPair[_]] = Array(maxIter -> maxIterations) val inputCols: Array[String] = Array("input0", "input1") - val limit: Limits[Double] = Limits(0d, 100d) - val nRandoms: Int = 5 - val paramMap: Array[ParamMap] = new ParamRandomBuilder() + + val builder: ParamRandomBuilder = new ParamRandomBuilder() .baseOn(basedOn: _*) .addGrid(inputCol, inputCols) - .addRandom(randomCol, limit, nRandoms) - .build() + val paramMap: Array[ParamMap] = addFn(builder).build() assert(paramMap.length == inputCols.length * nRandoms * basedOn.length) paramMap.foreach { m: ParamMap => assert(m(maxIter) == maxIterations) From b02f64ea7f07c2d23e37dfced5acffbfef3a766a Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Fri, 12 Feb 2021 12:15:29 +0000 Subject: [PATCH 34/47] [SPARK-34415][ML] Oops. The user needs Limits and added log methods. --- .../org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala index 7ef335af80e2..d750bd1dd90c 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala @@ -38,10 +38,12 @@ class ParamRandomBuilderSuite extends SparkFunSuite with ScalaCheckDrivenPropert import RandomRanges._ checkRangeAndCardinality(_.addRandom(randomCol, limit, nRandoms)) } + test("random log2 params mixed with fixed values") { import RandomRanges._ checkRangeAndCardinality(_.addLog2Random(randomCol, limit, nRandoms)) } + test("random log10 params mixed with fixed values") { import RandomRanges._ checkRangeAndCardinality(_.addLog10Random(randomCol, limit, nRandoms)) From 44061a47b646edf0f934d901e1c3bf9047db9da5 Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Fri, 12 Feb 2021 12:17:28 +0000 Subject: [PATCH 35/47] [SPARK-34415][ML] Oops. Base 10. 
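addLog10Random was delegating with base 2. Worth noting that the slip
was arguably benign: a uniform sample in log space is base-invariant
(if U is uniform on [log_n a, log_n b] then n^U has the same
log-uniform distribution on [a, b] whatever the base n), so this is
about the code matching its name rather than about skewed samples.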
--- .../scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala index 33c88ff88c5d..40fb16a9f631 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala @@ -144,7 +144,7 @@ class ParamRandomBuilder extends ParamGridBuilder { @Since("3.2.0") def addLog10Random[T: Generator](param: Param[T], lim: Limits[T], n: Int): this.type = - addLogRandom(param, lim, n, 2) + addLogRandom(param, lim, n, 10) @Since("3.2.0") def addLog2Random[T: Generator](param: Param[T], lim: Limits[T], n: Int): this.type = From 07a1e01499151db3f9cb272337a6dfdfd41b78c5 Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Fri, 12 Feb 2021 13:13:25 +0000 Subject: [PATCH 36/47] [SPARK-34415][ML] Added Java specific API and tests. --- .../spark/ml/tuning/ParamRandomBuilder.scala | 51 ++++++++ .../ml/tuning/ParamRandomBuilderSuite.scala | 123 ++++++++++++++++-- .../spark/ml/tuning/RandomRangesSuite.scala | 2 + 3 files changed, 167 insertions(+), 9 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala index 40fb16a9f631..5fbbe9121944 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala @@ -19,6 +19,7 @@ package org.apache.spark.ml.tuning import org.apache.spark.annotation.Since import org.apache.spark.ml.param._ +import org.apache.spark.ml.tuning.RandomRanges._ case class Limits[T: Numeric](x: T, y: T) @@ -156,4 +157,54 @@ class ParamRandomBuilder extends ParamGridBuilder { addGrid(param, (1 to n).map { _: Int => gen.randomTLog(base) }) } + // specialized versions for Java. 
+ + @Since("3.2.0") + def addRandom(param: DoubleParam, x: Double, y: Double, n: Int): this.type = + addRandom(param, Limits(x, y), n)(DoubleGenerator) + + @Since("3.2.0") + def addLog10Random(param: DoubleParam, x: Double, y: Double, n: Int): this.type = + addLogRandom(param, Limits(x, y), n, 10)(DoubleGenerator) + + @Since("3.2.0") + def addLog2Random(param: DoubleParam, x: Double, y: Double, n: Int): this.type = + addLogRandom(param, Limits(x, y), n, 2)(DoubleGenerator) + + @Since("3.2.0") + def addRandom(param: FloatParam, x: Float, y: Float, n: Int): this.type = + addRandom(param, Limits(x, y), n)(FloatGenerator) + + @Since("3.2.0") + def addLog10Random(param: FloatParam, x: Float, y: Float, n: Int): this.type = + addLogRandom(param, Limits(x, y), n, 10)(FloatGenerator) + + @Since("3.2.0") + def addLog2Random(param: FloatParam, x: Float, y: Float, n: Int): this.type = + addLogRandom(param, Limits(x, y), n, 2)(FloatGenerator) + + @Since("3.2.0") + def addRandom(param: IntParam, x: Int, y: Int, n: Int): this.type = + addRandom(param, Limits(x, y), n)(IntGenerator) + + @Since("3.2.0") + def addLog10Random(param: IntParam, x: Int, y: Int, n: Int): this.type = + addLogRandom(param, Limits(x, y), n, 10)(IntGenerator) + + @Since("3.2.0") + def addLog2Random(param: IntParam, x: Int, y: Int, n: Int): this.type = + addLogRandom(param, Limits(x, y), n, 2)(IntGenerator) + + @Since("3.2.0") + def addRandom(param: LongParam, x: Long, y: Long, n: Int): this.type = + addRandom(param, Limits(x, y), n)(LongGenerator) + + @Since("3.2.0") + def addLog10Random(param: LongParam, x: Long, y: Long, n: Int): this.type = + addLogRandom(param, Limits(x, y), n, 10)(LongGenerator) + + @Since("3.2.0") + def addLog2Random(param: LongParam, x: Long, y: Long, n: Int): this.type = + addLogRandom(param, Limits(x, y), n, 2)(LongGenerator) + } diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala index d750bd1dd90c..2be5d8ffab29 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala @@ -21,38 +21,142 @@ import org.scalatest.matchers.must.Matchers import org.scalatestplus.scalacheck.ScalaCheckDrivenPropertyChecks import org.apache.spark.SparkFunSuite -import org.apache.spark.ml.param.{Param, ParamMap, ParamPair, TestParams} +import org.apache.spark.ml.param._ class ParamRandomBuilderSuite extends SparkFunSuite with ScalaCheckDrivenPropertyChecks with Matchers { val solver = new TestParams() { - val randomCol: Param[Double] = new Param[Double](this, "randomVal", "randomly generated value") + private val randomColName = "randomVal" + val DummyDoubleParam = new DoubleParam(this, randomColName, "doc") + val DummyLongParam = new LongParam(this, randomColName, "doc") + val DummyFloatParam = new FloatParam(this, randomColName, "doc") + val DummyIntParam = new IntParam(this, randomColName, "doc") } - import solver.{inputCol, maxIter, randomCol} + import solver._ - val limit: Limits[Double] = Limits(1d, 100d) + val DoubleLimits: Limits[Double] = Limits(1d, 100d) + val FloatLimits: Limits[Float] = Limits(1f, 100f) + val IntLimits: Limits[Int] = Limits(1, 100) + val LongLimits: Limits[Long] = Limits(1L, 100L) val nRandoms: Int = 5 + // Java API + + test("Java API random Double linear params mixed with fixed values") { + checkRangeAndCardinality( + _.addRandom(DummyDoubleParam, DoubleLimits.x, 
DoubleLimits.y, nRandoms), + DoubleLimits, + DummyDoubleParam) + } + + test("Java API random Double log2 params mixed with fixed values") { + checkRangeAndCardinality( + _.addLog2Random(DummyDoubleParam, DoubleLimits.x, DoubleLimits.y, nRandoms), + DoubleLimits, + DummyDoubleParam) + } + + test("Java API random Double log10 params mixed with fixed values") { + checkRangeAndCardinality( + _.addLog10Random(DummyDoubleParam, DoubleLimits.x, DoubleLimits.y, nRandoms), + DoubleLimits, + DummyDoubleParam) + } + + test("Java API random Float linear params mixed with fixed values") { + checkRangeAndCardinality( + _.addRandom(DummyFloatParam, FloatLimits.x, FloatLimits.y, nRandoms), + FloatLimits, + DummyFloatParam) + } + + test("Java API random Float log2 params mixed with fixed values") { + checkRangeAndCardinality( + _.addLog2Random(DummyFloatParam, FloatLimits.x, FloatLimits.y, nRandoms), + FloatLimits, + DummyFloatParam) + } + + test("Java API random Float log10 params mixed with fixed values") { + checkRangeAndCardinality( + _.addLog10Random(DummyFloatParam, FloatLimits.x, FloatLimits.y, nRandoms), + FloatLimits, + DummyFloatParam) + } + + test("Java API random Long linear params mixed with fixed values") { + checkRangeAndCardinality( + _.addRandom(DummyLongParam, LongLimits.x, LongLimits.y, nRandoms), + LongLimits, + DummyLongParam) + } + + test("Java API random Long log2 params mixed with fixed values") { + checkRangeAndCardinality( + _.addLog2Random(DummyLongParam, LongLimits.x, LongLimits.y, nRandoms), + LongLimits, + DummyLongParam) + } + + test("Java API random Long log10 params mixed with fixed values") { + checkRangeAndCardinality( + _.addLog10Random(DummyLongParam, LongLimits.x, LongLimits.y, nRandoms), + LongLimits, + DummyLongParam) + } + + test("Java API random Int linear params mixed with fixed values") { + checkRangeAndCardinality( + _.addRandom(DummyIntParam, IntLimits.x, IntLimits.y, nRandoms), + IntLimits, + DummyIntParam) + } + + test("Java API random Int log2 params mixed with fixed values") { + checkRangeAndCardinality( + _.addLog2Random(DummyIntParam, IntLimits.x, IntLimits.y, nRandoms), + IntLimits, + DummyIntParam) + } + + test("Java API random Int log10 params mixed with fixed values") { + checkRangeAndCardinality( + _.addLog10Random(DummyIntParam, IntLimits.x, IntLimits.y, nRandoms), + IntLimits, + DummyIntParam) + } + + // Scala API + test("random linear params mixed with fixed values") { import RandomRanges._ - checkRangeAndCardinality(_.addRandom(randomCol, limit, nRandoms)) + checkRangeAndCardinality(_.addRandom(DummyDoubleParam, DoubleLimits, nRandoms), + DoubleLimits, + DummyDoubleParam) } test("random log2 params mixed with fixed values") { import RandomRanges._ - checkRangeAndCardinality(_.addLog2Random(randomCol, limit, nRandoms)) + checkRangeAndCardinality(_.addLog2Random(DummyDoubleParam, DoubleLimits, nRandoms), + DoubleLimits, + DummyDoubleParam) } test("random log10 params mixed with fixed values") { import RandomRanges._ - checkRangeAndCardinality(_.addLog10Random(randomCol, limit, nRandoms)) + checkRangeAndCardinality(_.addLog10Random(DummyDoubleParam, DoubleLimits, nRandoms), + DoubleLimits, + DummyDoubleParam) } - def checkRangeAndCardinality(addFn: ParamRandomBuilder => ParamRandomBuilder): Unit = { + def checkRangeAndCardinality[T: Numeric](addFn: ParamRandomBuilder => ParamRandomBuilder, + lim: Limits[T], + randomCol: Param[T]): Unit = { val maxIterations: Int = 10 val basedOn: Array[ParamPair[_]] = Array(maxIter -> maxIterations) val inputCols: 
Array[String] = Array("input0", "input1") + val ops: Numeric[T] = implicitly[Numeric[T]] val builder: ParamRandomBuilder = new ParamRandomBuilder() .baseOn(basedOn: _*) @@ -62,7 +166,8 @@ class ParamRandomBuilderSuite extends SparkFunSuite with ScalaCheckDrivenPropert paramMap.foreach { m: ParamMap => assert(m(maxIter) == maxIterations) assert(inputCols contains m(inputCol)) - assert(m(randomCol) >= limit.x && m(randomCol) <= limit.y) + assert(ops.gteq(m(randomCol), lim.x)) + assert(ops.lteq(m(randomCol), lim.y)) } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala index 85ffb4553d26..114f47c70b00 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala @@ -108,10 +108,12 @@ class RandomRangesSuite extends SparkFunSuite with ScalaCheckDrivenPropertyCheck def apply(genRandom: RandomT[T]): T = genRandom.randomT() def appropriate(x: T, y: T): Boolean } + def Linear[T: Numeric: Generator]: RandomFn[T] = new RandomFn { override def apply(genRandom: RandomT[T]): T = genRandom.randomT() override def appropriate(x: T, y: T): Boolean = true } + def Log10[T: Numeric: Generator]: RandomFn[T] = new RandomFn { override def apply(genRandom: RandomT[T]): T = genRandom.randomTLog(10) val ops: Numeric[T] = implicitly[Numeric[T]] From 23444954e859019239460a0c2324084a2884bbce Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Sat, 13 Feb 2021 11:02:31 +0000 Subject: [PATCH 37/47] [SPARK-34415][ML] Random Long generated removed as superfluous (per code review). Unnecessary @Since tags removed. --- .../spark/ml/tuning/ParamRandomBuilder.scala | 38 ------------------- .../ml/tuning/ParamRandomBuilderSuite.scala | 23 ----------- .../spark/ml/tuning/RandomRangesSuite.scala | 12 ------ 3 files changed, 73 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala index 5fbbe9121944..64be2b4deda0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala @@ -98,20 +98,6 @@ object RandomRanges { } } - implicit object LongGenerator extends Generator[Long] { - def apply(limits: Limits[Long]): RandomT[Long] = new RandomT[Long] { - import limits._ - val lower: Long = math.min(x, y) - val upper: Long = math.max(x, y) - - override def randomTLog(n: Int): Long = - RandomRanges.randomLog(lower, upper, n).toLong - - override def randomT(): Long = - bigIntBetween(BigInt(lower), BigInt(upper)).longValue - } - } - private[ml] def logN(x: Double, base: Int): Double = math.log(x) / math.log(base) private[ml] def randomLog(lower: Double, upper: Double, n: Int): Double = { @@ -137,17 +123,14 @@ object RandomRanges { */ @Since("3.2.0") class ParamRandomBuilder extends ParamGridBuilder { - @Since("3.2.0") def addRandom[T: Generator](param: Param[T], lim: Limits[T], n: Int): this.type = { val gen: RandomT[T] = RandomRanges(lim) addGrid(param, (1 to n).map { _: Int => gen.randomT() }) } - @Since("3.2.0") def addLog10Random[T: Generator](param: Param[T], lim: Limits[T], n: Int): this.type = addLogRandom(param, lim, n, 10) - @Since("3.2.0") def addLog2Random[T: Generator](param: Param[T], lim: Limits[T], n: Int): this.type = addLogRandom(param, lim, n, 2) @@ -159,52 +142,31 @@ class ParamRandomBuilder extends 
ParamGridBuilder { // specialized versions for Java. - @Since("3.2.0") def addRandom(param: DoubleParam, x: Double, y: Double, n: Int): this.type = addRandom(param, Limits(x, y), n)(DoubleGenerator) - @Since("3.2.0") def addLog10Random(param: DoubleParam, x: Double, y: Double, n: Int): this.type = addLogRandom(param, Limits(x, y), n, 10)(DoubleGenerator) - @Since("3.2.0") def addLog2Random(param: DoubleParam, x: Double, y: Double, n: Int): this.type = addLogRandom(param, Limits(x, y), n, 2)(DoubleGenerator) - @Since("3.2.0") def addRandom(param: FloatParam, x: Float, y: Float, n: Int): this.type = addRandom(param, Limits(x, y), n)(FloatGenerator) - @Since("3.2.0") def addLog10Random(param: FloatParam, x: Float, y: Float, n: Int): this.type = addLogRandom(param, Limits(x, y), n, 10)(FloatGenerator) - @Since("3.2.0") def addLog2Random(param: FloatParam, x: Float, y: Float, n: Int): this.type = addLogRandom(param, Limits(x, y), n, 2)(FloatGenerator) - @Since("3.2.0") def addRandom(param: IntParam, x: Int, y: Int, n: Int): this.type = addRandom(param, Limits(x, y), n)(IntGenerator) - @Since("3.2.0") def addLog10Random(param: IntParam, x: Int, y: Int, n: Int): this.type = addLogRandom(param, Limits(x, y), n, 10)(IntGenerator) - @Since("3.2.0") def addLog2Random(param: IntParam, x: Int, y: Int, n: Int): this.type = addLogRandom(param, Limits(x, y), n, 2)(IntGenerator) - @Since("3.2.0") - def addRandom(param: LongParam, x: Long, y: Long, n: Int): this.type = - addRandom(param, Limits(x, y), n)(LongGenerator) - - @Since("3.2.0") - def addLog10Random(param: LongParam, x: Long, y: Long, n: Int): this.type = - addLogRandom(param, Limits(x, y), n, 10)(LongGenerator) - - @Since("3.2.0") - def addLog2Random(param: LongParam, x: Long, y: Long, n: Int): this.type = - addLogRandom(param, Limits(x, y), n, 2)(LongGenerator) - } diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala index 2be5d8ffab29..02aef1d7b7da 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala @@ -29,7 +29,6 @@ class ParamRandomBuilderSuite extends SparkFunSuite with ScalaCheckDrivenPropert val solver = new TestParams() { private val randomColName = "randomVal" val DummyDoubleParam = new DoubleParam(this, randomColName, "doc") - val DummyLongParam = new LongParam(this, randomColName, "doc") val DummyFloatParam = new FloatParam(this, randomColName, "doc") val DummyIntParam = new IntParam(this, randomColName, "doc") } @@ -38,7 +37,6 @@ class ParamRandomBuilderSuite extends SparkFunSuite with ScalaCheckDrivenPropert val DoubleLimits: Limits[Double] = Limits(1d, 100d) val FloatLimits: Limits[Float] = Limits(1f, 100f) val IntLimits: Limits[Int] = Limits(1, 100) - val LongLimits: Limits[Long] = Limits(1L, 100L) val nRandoms: Int = 5 // Java API @@ -85,27 +83,6 @@ class ParamRandomBuilderSuite extends SparkFunSuite with ScalaCheckDrivenPropert DummyFloatParam) } - test("Java API random Long linear params mixed with fixed values") { - checkRangeAndCardinality( - _.addRandom(DummyLongParam, LongLimits.x, LongLimits.y, nRandoms), - LongLimits, - DummyLongParam) - } - - test("Java API random Long log2 params mixed with fixed values") { - checkRangeAndCardinality( - _.addLog2Random(DummyLongParam, LongLimits.x, LongLimits.y, nRandoms), - LongLimits, - DummyLongParam) - } - - test("Java API random Long log10 params 
mixed with fixed values") {
-    checkRangeAndCardinality(
-      _.addLog10Random(DummyLongParam, LongLimits.x, LongLimits.y, nRandoms),
-      LongLimits,
-      DummyLongParam)
-  }
-
   test("Java API random Int linear params mixed with fixed values") {
     checkRangeAndCardinality(
       _.addRandom(DummyIntParam, IntLimits.x, IntLimits.y, nRandoms),
diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala
index 114f47c70b00..45989a271e26 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala
@@ -68,18 +68,6 @@ class RandomRangesSuite extends SparkFunSuite with ScalaCheckDrivenPropertyCheck
     checkDistributionOf(1000)
   }
 
-  test("random longs") {
-    checkRange(Linear[Long])
-  }
-
-  test("random log longs") {
-    checkRange(Log10[Long])
-  }
-
-  test("random long distribution") {
-    checkDistributionOf(1000L)
-  }
-
   test("random doubles") {
     checkRange(Linear[Double])
   }

From 25737d78e9367af03fa427986662b56ad271dcc4 Mon Sep 17 00:00:00 2001
From: Phillip Henry
Date: Sat, 13 Feb 2021 14:35:55 +0000
Subject: [PATCH 38/47] [SPARK-34415][ML] Documentation and Scala example.

---
 docs/ml-tuning.md                             |  8 ++-
 ...ctionViaRandomHyperparametersExample.scala | 72 +++++++++++++++++++
 2 files changed, 79 insertions(+), 1 deletion(-)
 create mode 100644 examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaRandomHyperparametersExample.scala

diff --git a/docs/ml-tuning.md b/docs/ml-tuning.md
index 3ddd185d19ff..72226a09bf5a 100644
--- a/docs/ml-tuning.md
+++ b/docs/ml-tuning.md
@@ -71,10 +71,16 @@ for multiclass problems, a [`MultilabelClassificationEvaluator`](api/scala/org/a
 [`RankingEvaluator`](api/scala/org/apache/spark/ml/evaluation/RankingEvaluator.html)
 for ranking problems. The default metric used to choose the best `ParamMap` can be overridden by the `setMetricName`
 method in each of these evaluators.
 
-To help construct the parameter grid, users can use the [`ParamGridBuilder`](api/scala/org/apache/spark/ml/tuning/ParamGridBuilder.html) utility.
+To help construct the parameter grid, users can use the [`ParamGridBuilder`](api/scala/org/apache/spark/ml/tuning/ParamGridBuilder.html) utility (see the *Cross-Validation* section below for an example).
 By default, sets of parameters from the parameter grid are evaluated in serial. Parameter evaluation can be done in parallel by setting `parallelism` with a value of 2 or more (a value of 1 will be serial) before running model selection with `CrossValidator` or `TrainValidationSplit`.
 The value of `parallelism` should be chosen carefully to maximize parallelism without exceeding cluster resources, and larger values may not always lead to improved performance. Generally speaking, a value up to 10 should be sufficient for most clusters.
 
+Alternatively, users can use the [`ParamRandomBuilder`](api/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.html) utility.
+This has the same properties as `ParamGridBuilder` mentioned above, but hyperparameters are chosen at random within a user-defined range.
+The mathematical principle behind this is that given enough samples, the probability that *no* sample lands near the optimum tends to zero.
+Irrespective of the machine learning model, about 60 samples are needed to have, with 95% probability, at least one of them within 5% of the optimum.
+If this 5% volume lies between the points defined in a grid search, it will *never* be found by `ParamGridBuilder`.
+
 # Cross-Validation
 
 `CrossValidator` begins by splitting the dataset into a set of *folds* which are used as separate training and test datasets. E.g., with `$k=3$` folds, `CrossValidator` will generate 3 (training, test) dataset pairs, each of which uses 2/3 of the data for training and 1/3 for testing. To evaluate a particular `ParamMap`, `CrossValidator` computes the average evaluation metric for the 3 `Model`s produced by fitting the `Estimator` on the 3 different (training, test) dataset pairs.
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaRandomHyperparametersExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaRandomHyperparametersExample.scala
new file mode 100644
index 000000000000..46d35fc0cb6e
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaRandomHyperparametersExample.scala
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.evaluation.RegressionEvaluator
+import org.apache.spark.ml.regression.LinearRegression
+import org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel, ParamRandomBuilder}
+// $example off$
+import org.apache.spark.sql.SparkSession
+
+/**
+ * A simple example demonstrating model selection using ParamRandomBuilder.
+ *
+ * Run with
+ * {{{
+ * bin/run-example ml.ModelSelectionViaRandomHyperparametersExample
+ * }}}
+ */
+object ModelSelectionViaRandomHyperparametersExample {
+  def main(args: Array[String]): Unit = {
+    val spark = SparkSession
+      .builder
+      .appName("ModelSelectionViaRandomHyperparametersExample")
+      .getOrCreate()
+    // scalastyle:off println
+    // $example on$
+    // Prepare training and test data.
+    val data = spark.read.format("libsvm").load("data/mllib/sample_linear_regression_data.txt")
+
+    val lr = new LinearRegression().setMaxIter(10)
+
+    val paramGrid = new ParamRandomBuilder()
+      .addLog10Random(lr.regParam, 0.01, 1.0, 5)
+      .addGrid(lr.fitIntercept)
+      .addRandom(lr.elasticNetParam, 0.0, 1.0, 5)
+      .build()
+
+    val eval = new BinaryClassificationEvaluator
+    eval.setRawPredictionCol("prediction")
+    val cv: CrossValidator = new CrossValidator()
+      .setEstimator(lr)
+      .setEstimatorParamMaps(paramGrid)
+      .setEvaluator(eval)
+      .setNumFolds(3)
+    val cvModel: CrossValidatorModel = cv.fit(data)
+    val parent: LinearRegression = cvModel.bestModel.parent.asInstanceOf[LinearRegression]
+
+    println(s"Optimal value for ${lr.regParam}: ${parent.getRegParam}")
+    println(s"Optimal value for ${lr.elasticNetParam}: ${parent.getElasticNetParam}")
+    println(s"Optimal value for ${lr.fitIntercept}: ${parent.getFitIntercept}")
+    // $example off$
+
+    spark.stop()
+  }
+  // scalastyle:on println
+}
From 308f1c3ed3627b897d84974b910916cf0227828d Mon Sep 17 00:00:00 2001
From: Phillip Henry
Date: Mon, 15 Feb 2021 11:30:11 +0000
Subject: [PATCH 39/47] [SPARK-34415][ML] Documentation, Scala and Java examples.

---
 docs/ml-tuning.md                             | 24 ++++++
 ...ectionViaRandomHyperparametersExample.java | 83 +++++++++++++++++++
 ...ctionViaRandomHyperparametersExample.scala | 31 ++++---
 3 files changed, 126 insertions(+), 12 deletions(-)
 create mode 100644 examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaRandomHyperparametersExample.java

diff --git a/docs/ml-tuning.md b/docs/ml-tuning.md
index 72226a09bf5a..7bd32316ece9 100644
--- a/docs/ml-tuning.md
+++ b/docs/ml-tuning.md
@@ -81,6 +81,30 @@ The mathematical principle behind this is that, given enough samples, the proba
 Irrespective of the machine learning model, about 60 samples are needed for at least one of them to fall within 5% of the optimum with 95% probability.
 If this 5% volume lies between the parameters defined in a grid search, it will *never* be found by `ParamGridBuilder`.
 
+
+<div class="codetabs">
+ +
+ +Refer to the [`ParamRandomBuilder` Scala docs](api/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.html) for details on the API. + +{% include_example scala/org/apache/spark/examples/ml/ModelSelectionViaRandomHyperparametersExample.scala %} +
+ +
+ +Refer to the [`ParamRandomBuilder` Java docs](api/java/org/apache/spark/ml/tuning/ParamRandomBuilder.html) for details on the API. + +{% include_example java/org/apache/spark/examples/ml/JavaModelSelectionViaCrossValidationExample.java %} +
+ +
+ +Python users are recommended to look at Python libraries that are specifically for hyperparameter tuning such as Hyperopt. + +
+ +
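The "about 60" figure quoted in the paragraph above follows from a short calculation: a single random sample misses the best 5% of the search volume with probability 0.95, so n independent samples all miss it with probability 0.95^n. A minimal sketch of the arithmetic (plain Python, illustrative rather than part of the patch):

```python
import math

p_miss = 0.95  # chance that one random sample misses the top-5% region

# Smallest n with p_miss ** n <= 0.05, i.e. at least a 95% chance that
# some sample lands within the top 5% of the optimum.
n = math.ceil(math.log(0.05) / math.log(p_miss))
print(n)  # 59, hence "about 60"
```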
+
 
 # Cross-Validation
 
 `CrossValidator` begins by splitting the dataset into a set of *folds* which are used as separate training and test datasets. E.g., with `$k=3$` folds, `CrossValidator` will generate 3 (training, test) dataset pairs, each of which uses 2/3 of the data for training and 1/3 for testing. To evaluate a particular `ParamMap`, `CrossValidator` computes the average evaluation metric for the 3 `Model`s produced by fitting the `Estimator` on the 3 different (training, test) dataset pairs.
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaRandomHyperparametersExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaRandomHyperparametersExample.java
new file mode 100644
index 000000000000..086920f77536
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaRandomHyperparametersExample.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+// $example on$
+import org.apache.spark.ml.evaluation.RegressionEvaluator;
+import org.apache.spark.ml.param.ParamMap;
+import org.apache.spark.ml.regression.LinearRegression;
+import org.apache.spark.ml.tuning.*;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SparkSession;
+// $example off$
+
+/**
+ * A simple example demonstrating model selection using ParamRandomBuilder.
+ *
+ * Run with
+ * {{{
+ * bin/run-example ml.JavaModelSelectionViaRandomHyperparametersExample
+ * }}}
+ */
+public class JavaModelSelectionViaRandomHyperparametersExample {
+
+  public static void main(String[] args) {
+    SparkSession spark = SparkSession
+      .builder()
+      .appName("JavaModelSelectionViaRandomHyperparametersExample")
+      .getOrCreate();
+
+    // $example on$
+    Dataset<Row> data = spark.read().format("libsvm")
+      .load("data/mllib/sample_linear_regression_data.txt");
+
+    LinearRegression lr = new LinearRegression();
+
+    // We sample the regularization parameter logarithmically over the range [0.01, 1.0].
+    // This means that values around 0.01, 0.1 and 1.0 are roughly equally likely.
+    // Note that both limits must be greater than zero as otherwise we'll get an infinity.
+    // We sample the ElasticNet mixing parameter uniformly over the range [0, 1].
+    // Note that in real life, you'd choose more than the 5 samples we see below.
+    ParamMap[] hyperparameters = new ParamRandomBuilder()
+      .addLog10Random(lr.regParam(), 0.01, 1.0, 5)
+      .addRandom(lr.elasticNetParam(), 0.0, 1.0, 5)
+      .addGrid(lr.fitIntercept())
+      .build();
+
+    System.out.println("hyperparameters:");
+    for (ParamMap param : hyperparameters) {
+      System.out.println(param);
+    }
+
+    CrossValidator cv = new CrossValidator()
+      .setEstimator(lr)
+      .setEstimatorParamMaps(hyperparameters)
+      .setEvaluator(new RegressionEvaluator())
+      .setNumFolds(3);
+    CrossValidatorModel cvModel = cv.fit(data);
+    LinearRegression parent = (LinearRegression) cvModel.bestModel().parent();
+
+    System.out.println("Optimal model has\n" + lr.regParam() + " = " + parent.getRegParam()
+      + "\n" + lr.elasticNetParam() + " = " + parent.getElasticNetParam()
+      + "\n" + lr.fitIntercept() + " = " + parent.getFitIntercept());
+    // $example off$
+
+    spark.stop();
+  }
+}
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaRandomHyperparametersExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaRandomHyperparametersExample.scala
index 46d35fc0cb6e..9d2c58bbf9c7 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaRandomHyperparametersExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaRandomHyperparametersExample.scala
@@ -18,9 +18,10 @@
 package org.apache.spark.examples.ml
 
 // $example on$
-import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
+import org.apache.spark.ml.evaluation.RegressionEvaluator
 import org.apache.spark.ml.regression.LinearRegression
-import org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel, ParamRandomBuilder}
+import org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel, Limits, ParamRandomBuilder}
+import org.apache.spark.ml.tuning.RandomRanges._
 // $example off$
 import org.apache.spark.sql.SparkSession
 
@@ -45,25 +46,31 @@ object ModelSelectionViaRandomHyperparametersExample {
 
     val lr = new LinearRegression().setMaxIter(10)
 
-    val paramGrid = new ParamRandomBuilder()
-      .addLog10Random(lr.regParam, 0.01, 1.0, 5)
+    // We sample the regularization parameter logarithmically over the range [0.01, 1.0].
+    // This means that values around 0.01, 0.1 and 1.0 are roughly equally likely.
+    // Note that both limits must be greater than zero as otherwise we'll get an infinity.
+    // We sample the ElasticNet mixing parameter uniformly over the range [0, 1].
+    // Note that in real life, you'd choose more than the 5 samples we see below.
+ val hyperparameters = new ParamRandomBuilder() + .addLog10Random(lr.regParam, Limits(0.01, 1.0), 5) .addGrid(lr.fitIntercept) - .addRandom(lr.elasticNetParam, 0.0, 1.0, 5) + .addRandom(lr.elasticNetParam, Limits(0.0, 1.0), 5) .build() - val eval = new BinaryClassificationEvaluator - eval.setRawPredictionCol("prediction") + println(s"hyperparameters:\n${hyperparameters.mkString("\n")}") + val cv: CrossValidator = new CrossValidator() .setEstimator(lr) - .setEstimatorParamMaps(paramGrid) - .setEvaluator(eval) + .setEstimatorParamMaps(hyperparameters) + .setEvaluator(new RegressionEvaluator) .setNumFolds(3) val cvModel: CrossValidatorModel = cv.fit(data) val parent: LinearRegression = cvModel.bestModel.parent.asInstanceOf[LinearRegression] - println(s"Optimal value for ${lr.regParam}: ${parent.getRegParam}") - println(s"Optimal value for ${lr.elasticNetParam}: ${parent.getElasticNetParam}") - println(s"Optimal value for ${lr.fitIntercept}: ${parent.getFitIntercept}") + println(s"""Optimal model has: + |${lr.regParam} = ${parent.getRegParam} + |${lr.elasticNetParam} = ${parent.getElasticNetParam} + |${lr.fitIntercept} = ${parent.getFitIntercept}""".stripMargin) // $example off$ spark.stop() From e88f907585cef0c44eba27910274d2152811c6ea Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Tue, 16 Feb 2021 10:29:09 +0000 Subject: [PATCH 40/47] [SPARK-34415][ML] Removed random log2 space, fixed error in documentation. --- docs/ml-tuning.md | 2 +- .../spark/ml/tuning/ParamRandomBuilder.scala | 12 -------- .../ml/tuning/ParamRandomBuilderSuite.scala | 28 ------------------- 3 files changed, 1 insertion(+), 41 deletions(-) diff --git a/docs/ml-tuning.md b/docs/ml-tuning.md index 7bd32316ece9..40243e58d14e 100644 --- a/docs/ml-tuning.md +++ b/docs/ml-tuning.md @@ -94,7 +94,7 @@ Refer to the [`ParamRandomBuilder` Scala docs](api/scala/org/apache/spark/ml/tun Refer to the [`ParamRandomBuilder` Java docs](api/java/org/apache/spark/ml/tuning/ParamRandomBuilder.html) for details on the API. -{% include_example java/org/apache/spark/examples/ml/JavaModelSelectionViaCrossValidationExample.java %} +{% include_example java/org/apache/spark/examples/ml/JavaModelSelectionViaRandomHyperparametersExample.java %}
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala index 64be2b4deda0..9c296bbc9522 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/ParamRandomBuilder.scala @@ -131,9 +131,6 @@ class ParamRandomBuilder extends ParamGridBuilder { def addLog10Random[T: Generator](param: Param[T], lim: Limits[T], n: Int): this.type = addLogRandom(param, lim, n, 10) - def addLog2Random[T: Generator](param: Param[T], lim: Limits[T], n: Int): this.type = - addLogRandom(param, lim, n, 2) - private def addLogRandom[T: Generator](param: Param[T], lim: Limits[T], n: Int, base: Int): this.type = { val gen: RandomT[T] = RandomRanges(lim) @@ -148,25 +145,16 @@ class ParamRandomBuilder extends ParamGridBuilder { def addLog10Random(param: DoubleParam, x: Double, y: Double, n: Int): this.type = addLogRandom(param, Limits(x, y), n, 10)(DoubleGenerator) - def addLog2Random(param: DoubleParam, x: Double, y: Double, n: Int): this.type = - addLogRandom(param, Limits(x, y), n, 2)(DoubleGenerator) - def addRandom(param: FloatParam, x: Float, y: Float, n: Int): this.type = addRandom(param, Limits(x, y), n)(FloatGenerator) def addLog10Random(param: FloatParam, x: Float, y: Float, n: Int): this.type = addLogRandom(param, Limits(x, y), n, 10)(FloatGenerator) - def addLog2Random(param: FloatParam, x: Float, y: Float, n: Int): this.type = - addLogRandom(param, Limits(x, y), n, 2)(FloatGenerator) - def addRandom(param: IntParam, x: Int, y: Int, n: Int): this.type = addRandom(param, Limits(x, y), n)(IntGenerator) def addLog10Random(param: IntParam, x: Int, y: Int, n: Int): this.type = addLogRandom(param, Limits(x, y), n, 10)(IntGenerator) - def addLog2Random(param: IntParam, x: Int, y: Int, n: Int): this.type = - addLogRandom(param, Limits(x, y), n, 2)(IntGenerator) - } diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala index 02aef1d7b7da..e17c48e4d991 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/ParamRandomBuilderSuite.scala @@ -48,13 +48,6 @@ class ParamRandomBuilderSuite extends SparkFunSuite with ScalaCheckDrivenPropert DummyDoubleParam) } - test("Java API random Double log2 params mixed with fixed values") { - checkRangeAndCardinality( - _.addLog2Random(DummyDoubleParam, DoubleLimits.x, DoubleLimits.y, nRandoms), - DoubleLimits, - DummyDoubleParam) - } - test("Java API random Double log10 params mixed with fixed values") { checkRangeAndCardinality( _.addLog10Random(DummyDoubleParam, DoubleLimits.x, DoubleLimits.y, nRandoms), @@ -69,13 +62,6 @@ class ParamRandomBuilderSuite extends SparkFunSuite with ScalaCheckDrivenPropert DummyFloatParam) } - test("Java API random Float log2 params mixed with fixed values") { - checkRangeAndCardinality( - _.addLog2Random(DummyFloatParam, FloatLimits.x, FloatLimits.y, nRandoms), - FloatLimits, - DummyFloatParam) - } - test("Java API random Float log10 params mixed with fixed values") { checkRangeAndCardinality( _.addLog10Random(DummyFloatParam, FloatLimits.x, FloatLimits.y, nRandoms), @@ -90,13 +76,6 @@ class ParamRandomBuilderSuite extends SparkFunSuite with ScalaCheckDrivenPropert DummyIntParam) } - test("Java API random Int log2 params mixed with fixed values") { - 
checkRangeAndCardinality( - _.addLog2Random(DummyIntParam, IntLimits.x, IntLimits.y, nRandoms), - IntLimits, - DummyIntParam) - } - test("Java API random Int log10 params mixed with fixed values") { checkRangeAndCardinality( _.addLog10Random(DummyIntParam, IntLimits.x, IntLimits.y, nRandoms), @@ -113,13 +92,6 @@ class ParamRandomBuilderSuite extends SparkFunSuite with ScalaCheckDrivenPropert DummyDoubleParam) } - test("random log2 params mixed with fixed values") { - import RandomRanges._ - checkRangeAndCardinality(_.addLog2Random(DummyDoubleParam, DoubleLimits, nRandoms), - DoubleLimits, - DummyDoubleParam) - } - test("random log10 params mixed with fixed values") { import RandomRanges._ checkRangeAndCardinality(_.addLog10Random(DummyDoubleParam, DoubleLimits, nRandoms), From 4e48759979f7859aff3f81f43192038a5fbc0a3b Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Mon, 22 Feb 2021 12:04:23 +0000 Subject: [PATCH 41/47] [SPARK-34415][ML] Everything that can be made private as srowen recommended. --- .../spark/ml/tuning/RandomRangesSuite.scala | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala index 45989a271e26..50c46ebbc686 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala @@ -92,17 +92,17 @@ class RandomRangesSuite extends SparkFunSuite with ScalaCheckDrivenPropertyCheck checkDistributionOf(1000f) } - abstract class RandomFn[T: Numeric: Generator] { + private abstract class RandomFn[T: Numeric: Generator] { def apply(genRandom: RandomT[T]): T = genRandom.randomT() def appropriate(x: T, y: T): Boolean } - def Linear[T: Numeric: Generator]: RandomFn[T] = new RandomFn { + private def Linear[T: Numeric: Generator]: RandomFn[T] = new RandomFn { override def apply(genRandom: RandomT[T]): T = genRandom.randomT() override def appropriate(x: T, y: T): Boolean = true } - def Log10[T: Numeric: Generator]: RandomFn[T] = new RandomFn { + private def Log10[T: Numeric: Generator]: RandomFn[T] = new RandomFn { override def apply(genRandom: RandomT[T]): T = genRandom.randomTLog(10) val ops: Numeric[T] = implicitly[Numeric[T]] override def appropriate(x: T, y: T): Boolean = { @@ -110,7 +110,7 @@ class RandomRangesSuite extends SparkFunSuite with ScalaCheckDrivenPropertyCheck } } - def checkRange[T: Numeric: Generator: Choose: TypeTag: Arbitrary](rand: RandomFn[T]): Assertion = + private def checkRange[T: Numeric: Generator: Choose: TypeTag: Arbitrary](rand: RandomFn[T]): Assertion = forAll { (x: T, y: T) => if (rand.appropriate(x, y)) { val ops: Numeric[T] = implicitly[Numeric[T]] @@ -122,7 +122,7 @@ class RandomRangesSuite extends SparkFunSuite with ScalaCheckDrivenPropertyCheck } else Succeeded } - def checkDistributionOf[T: Numeric: Generator: Choose](range: T): Unit = { + private def checkDistributionOf[T: Numeric: Generator: Choose](range: T): Unit = { val ops: Numeric[T] = implicitly[Numeric[T]] import ops._ val gen: Gen[(T, T)] = for { @@ -134,7 +134,7 @@ class RandomRangesSuite extends SparkFunSuite with ScalaCheckDrivenPropertyCheck } } - def meanAndStandardDeviation[T: Numeric](xs: Seq[T]): (Double, Double) = { + private def meanAndStandardDeviation[T: Numeric](xs: Seq[T]): (Double, Double) = { val ops: Numeric[T] = implicitly[Numeric[T]] val n: Int = xs.length val mean: Double = ops.toDouble(xs.sum) / n @@ 
-143,19 +143,19 @@ class RandomRangesSuite extends SparkFunSuite with ScalaCheckDrivenPropertyCheck (mean, stdDev) } - def lowerUpper[T: Numeric](x: T, y: T): (T, T) = { + private def lowerUpper[T: Numeric](x: T, y: T): (T, T) = { val ops: Numeric[T] = implicitly[Numeric[T]] (ops.min(x, y), ops.max(x, y)) } - def midPointOf[T: Numeric : Generator](lim: Limits[T]): Double = { + private def midPointOf[T: Numeric : Generator](lim: Limits[T]): Double = { val ordered: (T, T) = lowerUpper(lim.x, lim.y) val ops: Numeric[T] = implicitly[Numeric[T]] val range: T = ops.minus(ordered._2, ordered._1) (ops.toDouble(range) / 2) + ops.toDouble(ordered._1) } - def assertEvenDistribution[T: Numeric: Generator](n: Int, lim: Limits[T]): Assertion = { + private def assertEvenDistribution[T: Numeric: Generator](n: Int, lim: Limits[T]): Assertion = { val gen: RandomT[T] = RandomRanges(lim) val xs: Seq[T] = (0 to n).map { _: Int => gen.randomT() } val (mean, stdDev) = meanAndStandardDeviation(xs) From 259edfe1a78d0bb63c7b2cff03dc3a46746ecef6 Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Mon, 22 Feb 2021 13:26:37 +0000 Subject: [PATCH 42/47] [SPARK-34415][ML] ScalaStyle violation. --- .../scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala index 50c46ebbc686..afcbc033956b 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/RandomRangesSuite.scala @@ -110,7 +110,8 @@ class RandomRangesSuite extends SparkFunSuite with ScalaCheckDrivenPropertyCheck } } - private def checkRange[T: Numeric: Generator: Choose: TypeTag: Arbitrary](rand: RandomFn[T]): Assertion = + private def checkRange[T: Numeric: Generator: Choose: TypeTag: Arbitrary] + (rand: RandomFn[T]): Assertion = forAll { (x: T, y: T) => if (rand.appropriate(x, y)) { val ops: Numeric[T] = implicitly[Numeric[T]] From a41c8f385c17779fcc67521f69de2f247fd0e206 Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Wed, 24 Feb 2021 16:56:03 +0000 Subject: [PATCH 43/47] [SPARK-34415][ML] Very hacky first draft of the Python version of ParamRandomBuilder. 
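The Python port introduced below reuses `ParamGridBuilder.addGrid` internally: `addRandom` draws its n values up front and registers them as just another grid axis, so random and fixed parameters combine by Cartesian product (behaviour that `test_param_cardinality` asserts in a later patch). A standalone sketch of that mechanic, independent of pyspark and with illustrative parameter names:

```python
import itertools
import random

# Seven pre-drawn random values for one parameter (what addRandom(..., n=7) registers),
random_axis = [random.uniform(0.0, 1.0) for _ in range(7)]
# crossed with a fixed three-value axis (what addGrid registers).
grid_axis = [1, 2, 3]

# build() takes the Cartesian product over all registered axes.
param_maps = [{"elasticNetParam": r, "maxIter": g}
              for r, g in itertools.product(random_axis, grid_axis)]
print(len(param_maps))  # 7 * 3 = 21
```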
--- python/pyspark/ml/tests/test_tuning.py | 28 +++++++++++++++++++++++++- python/pyspark/ml/tuning.py | 10 ++++++++- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/python/pyspark/ml/tests/test_tuning.py b/python/pyspark/ml/tests/test_tuning.py index 3cde34facbf9..6fc9a03dab6a 100644 --- a/python/pyspark/ml/tests/test_tuning.py +++ b/python/pyspark/ml/tests/test_tuning.py @@ -26,7 +26,7 @@ from pyspark.ml.linalg import Vectors from pyspark.ml.param import Param, Params from pyspark.ml.tuning import CrossValidator, CrossValidatorModel, ParamGridBuilder, \ - TrainValidationSplit, TrainValidationSplitModel + TrainValidationSplit, TrainValidationSplitModel, ParamRandomBuilder from pyspark.sql.functions import rand from pyspark.testing.mlutils import DummyEvaluator, DummyLogisticRegression, \ DummyLogisticRegressionModel, SparkSessionTestCase @@ -65,6 +65,32 @@ def _fit(self, dataset): return model +class DummyParams(Params): + + def __init__(self): + super(DummyParams, self).__init__() + self.test_param = Param(self, "test_param", "dummy parameter for testing") + + +class ParamRandomBuilderTests(unittest.TestCase): + + def __init__(self, uid): + super(ParamRandomBuilderTests, self).__init__(methodName=uid) + self.dummy_params = DummyParams() + + def test_add_random(self): + n = 100 + lowest = 100 + highest = 200 + to_test = ParamRandomBuilder() + params = to_test.addRandom(self.dummy_params.test_param, lowest, highest, n).build() + self.assertEqual(n, len(params)) + for param in params: + for v in param.values(): + self.assertGreaterEqual(v, lowest) + self.assertLessEqual(v, highest) + + class ParamGridBuilderTests(SparkSessionTestCase): def test_addGrid(self): diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py index 2bddfe822f29..45083e1b4c42 100644 --- a/python/pyspark/ml/tuning.py +++ b/python/pyspark/ml/tuning.py @@ -18,6 +18,7 @@ import os import sys import itertools +import random from multiprocessing.pool import ThreadPool import numpy as np @@ -35,7 +36,7 @@ from pyspark.sql.types import BooleanType __all__ = ['ParamGridBuilder', 'CrossValidator', 'CrossValidatorModel', 'TrainValidationSplit', - 'TrainValidationSplitModel'] + 'TrainValidationSplitModel', 'ParamRandomBuilder'] def _parallelFitTasks(est, train, eva, validation, epm, collectSubModel): @@ -152,6 +153,13 @@ def to_key_value_pairs(keys, values): return [dict(to_key_value_pairs(keys, prod)) for prod in itertools.product(*grid_values)] +class ParamRandomBuilder(ParamGridBuilder): + def addRandom(self, param, x, y, n): + values = map(lambda _: random.randrange(x, y), range(n)) + self.addGrid(param, values) + return self + + class _ValidatorParams(HasSeed): """ Common params for TrainValidationSplit and CrossValidator. From 73d077b934f4538f9350f0231a0ea68e27d98086 Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Thu, 25 Feb 2021 09:11:23 +0000 Subject: [PATCH 44/47] [SPARK-34415][ML] Added ParamRandomBuilder to the .pyi file. More tests. Behaviour for float and integer are different. Unknown type raises exception. 
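The int/float rule described in the commit message can be exercised in isolation: two integer bounds keep the domain integral via `random.randrange`, a float in either bound promotes the whole range to floats via `random.uniform`, and anything else raises. A minimal sketch of the dispatch (the helper name is illustrative; note the `%` operator takes a tuple of both types):

```python
import random

def pick(x, y):
    # Two ints -> a random int in [x, y); any float -> a random float in [x, y].
    if type(x) == int and type(y) == int:
        return random.randrange(x, y)
    elif type(x) == float or type(y) == float:
        return random.uniform(x, y)
    else:
        raise TypeError("unable to make range for types %s and %s" % (type(x), type(y)))

print(type(pick(100, 200)))    # <class 'int'>
print(type(pick(100, 200.0)))  # <class 'float'>
# pick(1, "wrong type")        # would raise TypeError
```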
---
 python/pyspark/ml/tests/test_tuning.py | 41 ++++++++++++++++++++------
 python/pyspark/ml/tuning.py            |  7 ++++-
 python/pyspark/ml/tuning.pyi           | 10 +++++++
 3 files changed, 48 insertions(+), 10 deletions(-)

diff --git a/python/pyspark/ml/tests/test_tuning.py b/python/pyspark/ml/tests/test_tuning.py
index 6fc9a03dab6a..ec4e04f99e81 100644
--- a/python/pyspark/ml/tests/test_tuning.py
+++ b/python/pyspark/ml/tests/test_tuning.py
@@ -74,21 +74,44 @@ def __init__(self):
 
 class ParamRandomBuilderTests(unittest.TestCase):
 
-    def __init__(self, uid):
-        super(ParamRandomBuilderTests, self).__init__(methodName=uid)
+    def __init__(self, methodName):
+        super(ParamRandomBuilderTests, self).__init__(methodName=methodName)
         self.dummy_params = DummyParams()
+        self.to_test = ParamRandomBuilder()
+        self.n = 100
 
-    def test_add_random(self):
-        n = 100
-        lowest = 100
-        highest = 200
-        to_test = ParamRandomBuilder()
-        params = to_test.addRandom(self.dummy_params.test_param, lowest, highest, n).build()
-        self.assertEqual(n, len(params))
+    def check_ranges(self, params, lowest, highest, expected_type):
+        self.assertEqual(self.n, len(params))
         for param in params:
             for v in param.values():
                 self.assertGreaterEqual(v, lowest)
                 self.assertLessEqual(v, highest)
+                self.assertEqual(type(v), expected_type)
+
+    def test_add_random_integer_range(self):
+        lowest = 100
+        highest = 200
+        params = self.to_test.addRandom(self.dummy_params.test_param, lowest, highest, self.n)\
+            .build()
+        self.check_ranges(params, lowest, highest, int)
+
+    def test_add_random_float_and_integer_yields_floats(self):
+        lowest = 100
+        highest = 200.
+        params = self.to_test.addRandom(self.dummy_params.test_param, lowest, highest, self.n)\
+            .build()
+        self.check_ranges(params, lowest, highest, float)
+
+    def test_add_random_float_range(self):
+        lowest = 100.
+        highest = 200.
+        params = self.to_test.addRandom(self.dummy_params.test_param, lowest, highest, self.n)\
+            .build()
+        self.check_ranges(params, lowest, highest, float)
+
+    def test_unexpected_type(self):
+        with self.assertRaises(TypeError):
+            self.to_test.addRandom(self.dummy_params.test_param, 1, "wrong type", 1).build()
 
 class ParamGridBuilderTests(SparkSessionTestCase):
 
     def test_addGrid(self):
diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py
index 45083e1b4c42..964f793e1c37 100644
--- a/python/pyspark/ml/tuning.py
+++ b/python/pyspark/ml/tuning.py
@@ -155,7 +155,12 @@ def to_key_value_pairs(keys, values):
 
 class ParamRandomBuilder(ParamGridBuilder):
     def addRandom(self, param, x, y, n):
-        values = map(lambda _: random.randrange(x, y), range(n))
+        if type(x) == int and type(y) == int:
+            values = map(lambda _: random.randrange(x, y), range(n))
+        elif type(x) == float or type(y) == float:
+            values = map(lambda _: random.uniform(x, y), range(n))
+        else:
+            raise TypeError("unable to make range for types %s and %s" % (type(x), type(y)))
         self.addGrid(param, values)
         return self
 
diff --git a/python/pyspark/ml/tuning.pyi b/python/pyspark/ml/tuning.pyi
index 912abd4d7124..62bd79dca1ac 100644
--- a/python/pyspark/ml/tuning.pyi
+++ b/python/pyspark/ml/tuning.pyi
@@ -35,6 +35,16 @@ class ParamGridBuilder:
     def baseOn(self, *args: Tuple[Param, Any]) -> ParamGridBuilder: ...
     def build(self) -> List[ParamMap]: ...
 
+class ParamRandomBuilder(ParamGridBuilder):
+    def __init__(self) -> None: ...
+    def addGrid(self, param: Param, values: List[Any]) -> ParamGridBuilder: ...
+    @overload
+    def baseOn(self, __args: ParamMap) -> ParamGridBuilder: ...
+ @overload + def baseOn(self, *args: Tuple[Param, Any]) -> ParamGridBuilder: ... + def build(self) -> List[ParamMap]: ... + def addRandom(self, param: Param, x: Any, y: Any, n: int) -> ParamRandomBuilder: ... + class _ValidatorParams(HasSeed): estimator: Param[Estimator] estimatorParamMaps: Param[List[ParamMap]] From 5d89774a8d19f4d93b04b8e8ff0ee01869ac5ad2 Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Thu, 25 Feb 2021 12:10:18 +0000 Subject: [PATCH 45/47] [SPARK-34415][ML] Python log10 space. --- python/pyspark/ml/tests/test_tuning.py | 37 +++++++++++++++----------- python/pyspark/ml/tuning.py | 32 ++++++++++++++++++++++ python/pyspark/ml/tuning.pyi | 1 + 3 files changed, 55 insertions(+), 15 deletions(-) diff --git a/python/pyspark/ml/tests/test_tuning.py b/python/pyspark/ml/tests/test_tuning.py index ec4e04f99e81..9ba007e2fbe9 100644 --- a/python/pyspark/ml/tests/test_tuning.py +++ b/python/pyspark/ml/tests/test_tuning.py @@ -16,8 +16,10 @@ # import tempfile +import math import unittest +import numpy as np from pyspark.ml.feature import HashingTF, Tokenizer from pyspark.ml import Estimator, Pipeline, Model from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel, OneVsRest @@ -88,26 +90,31 @@ def check_ranges(self, params, lowest, highest, expected_type): self.assertLessEqual(v, highest) self.assertEqual(type(v), expected_type) + def check_addRandom_ranges(self, x, y, expected_type): + params = self.to_test.addRandom(self.dummy_params.test_param, x, y, self.n).build() + self.check_ranges(params, x, y, expected_type) + + def check_addLog10Random_ranges(self, x, y, expected_type): + params = self.to_test.addLog10Random(self.dummy_params.test_param, x, y, self.n).build() + self.check_ranges(params, x, y, expected_type) + + def test_add_random_integer_logarithmic_range(self): + self.check_addLog10Random_ranges(100, 200, int) + + def test_add_logarithmic_random_float_and_integer_yields_floats(self): + self.check_addLog10Random_ranges(100, 200., float) + + def test_add_random_float_logarithmic_range(self): + self.check_addLog10Random_ranges(100., 200., float) + def test_add_random_integer_range(self): - lowest = 100 - highest = 200 - params = self.to_test.addRandom(self.dummy_params.test_param, lowest, highest, self.n)\ - .build() - self.check_ranges(params, lowest, highest, int) + self.check_addRandom_ranges(100, 200, int) def test_add_random_float_and_integer_yields_floats(self): - lowest = 100 - highest = 200. - params = self.to_test.addRandom(self.dummy_params.test_param, lowest, highest, self.n)\ - .build() - self.check_ranges(params, lowest, highest, float) + self.check_addRandom_ranges(100, 200., float) def test_add_random_float_range(self): - lowest = 100. - highest = 200. - params = self.to_test.addRandom(self.dummy_params.test_param, lowest, highest, self.n)\ - .build() - self.check_ranges(params, lowest, highest, float) + self.check_addRandom_ranges(100., 200., float) def test_unexpected_type(self): with self.assertRaises(TypeError): diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py index 964f793e1c37..61c7b2f91930 100644 --- a/python/pyspark/ml/tuning.py +++ b/python/pyspark/ml/tuning.py @@ -19,6 +19,7 @@ import sys import itertools import random +import math from multiprocessing.pool import ThreadPool import numpy as np @@ -154,7 +155,20 @@ def to_key_value_pairs(keys, values): class ParamRandomBuilder(ParamGridBuilder): + r""" + Builder for random value parameters used in search-based model selection. + + + .. 
versionadded:: 3.2.0 + """ + + @since("3.2.0") def addRandom(self, param, x, y, n): + """ + Adds n random values between x and y. + The arguments x and y can be integers, floats or a combination of the two. If either + x or y is a float, the domain of the random value will be float. + """ if type(x) == int and type(y) == int: values = map(lambda _: random.randrange(x, y), range(n)) elif type(x) == float or type(y) == float: @@ -164,6 +178,24 @@ def addRandom(self, param, x, y, n): self.addGrid(param, values) return self + @since("3.2.0") + def addLog10Random(self, param, x, y, n): + """ + Adds n random values scaled logarithmically (base 10) between x and y. + For instance, a distribution for x=1.0, y=10000.0 and n=5 might reasonably look like + [1.6, 65.3, 221.9, 1024.3, 8997.5] + """ + def logarithmic_random(): + value = math.log10(random.uniform(10 ** x, 10 ** y)) + if type(x) == int and type(y) == int: + value = int(value) + return value + + values = map(lambda _: logarithmic_random(), range(n)) + self.addGrid(param, values) + + return self + class _ValidatorParams(HasSeed): """ diff --git a/python/pyspark/ml/tuning.pyi b/python/pyspark/ml/tuning.pyi index 62bd79dca1ac..0a419d1c86c1 100644 --- a/python/pyspark/ml/tuning.pyi +++ b/python/pyspark/ml/tuning.pyi @@ -44,6 +44,7 @@ class ParamRandomBuilder(ParamGridBuilder): def baseOn(self, *args: Tuple[Param, Any]) -> ParamGridBuilder: ... def build(self) -> List[ParamMap]: ... def addRandom(self, param: Param, x: Any, y: Any, n: int) -> ParamRandomBuilder: ... + def addLog10Random(self, param: Param, x: Any, y: Any, n: int) -> ParamRandomBuilder: ... class _ValidatorParams(HasSeed): estimator: Param[Estimator] From 183c2cd5911d2f6d72020674f59fd74b5d983b38 Mon Sep 17 00:00:00 2001 From: Phillip Henry Date: Thu, 25 Feb 2021 13:23:04 +0000 Subject: [PATCH 46/47] [SPARK-34415][ML] More tests. 
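Among the changes below is a correction to `logarithmic_random`: drawing uniformly in log-space and exponentiating back (`10 ** uniform(log10(x), log10(y))`) makes every decade of [x, y] equally likely, which the earlier `log10(uniform(10 ** x, 10 ** y))` form did not. A quick empirical check of the corrected construction (standard library only, not part of the patch):

```python
import math
import random

def log_uniform(x, y):
    # Corrected form: uniform in log-space, then map back to [x, y].
    return 10 ** random.uniform(math.log10(x), math.log10(y))

samples = [log_uniform(1.0, 1e4) for _ in range(100000)]
decades = [0, 0, 0, 0]
for s in samples:
    decades[min(int(math.log10(s)), 3)] += 1
print(decades)  # roughly 25000 hits in each of the four decades
```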
--- python/pyspark/ml/tests/test_tuning.py | 48 ++++++++++++++++++++++++++ python/pyspark/ml/tuning.py | 3 +- python/pyspark/ml/tuning.pyi | 6 ---- 3 files changed, 50 insertions(+), 7 deletions(-) diff --git a/python/pyspark/ml/tests/test_tuning.py b/python/pyspark/ml/tests/test_tuning.py index 9ba007e2fbe9..9f6c8192e0ff 100644 --- a/python/pyspark/ml/tests/test_tuning.py +++ b/python/pyspark/ml/tests/test_tuning.py @@ -72,6 +72,7 @@ class DummyParams(Params): def __init__(self): super(DummyParams, self).__init__() self.test_param = Param(self, "test_param", "dummy parameter for testing") + self.another_test_param = Param(self, "another_test_param", "second parameter for testing") class ParamRandomBuilderTests(unittest.TestCase): @@ -98,6 +99,53 @@ def check_addLog10Random_ranges(self, x, y, expected_type): params = self.to_test.addLog10Random(self.dummy_params.test_param, x, y, self.n).build() self.check_ranges(params, x, y, expected_type) + @staticmethod + def counts(xs): + key_to_count = {} + for v in xs: + k = int(v) + if key_to_count.get(k) is None: + key_to_count[k] = 1 + else: + key_to_count[k] = key_to_count[k] + 1 + return key_to_count + + @staticmethod + def raw_values_of(params): + values = [] + for param in params: + for v in param.values(): + values.append(v) + return values + + def check_even_distribution(self, vs, bin_function): + binned = map(lambda x: bin_function(x), vs) + histogram = self.counts(binned) + values = list(histogram.values()) + sd = np.std(values) + mu = np.mean(values) + for k, v in histogram.items(): + self.assertLess(abs(v - mu), 5 * sd, "{} values for bucket {} is unlikely " + "when the mean is {} and standard deviation {}" + .format(v, k, mu, sd)) + + def test_distribution(self): + params = self.to_test.addRandom(self.dummy_params.test_param, 0, 20000, 10000).build() + values = self.raw_values_of(params) + self.check_even_distribution(values, lambda x: x // 1000) + + def test_logarithmic_distribution(self): + params = self.to_test.addLog10Random(self.dummy_params.test_param, 1, 1e10, 10000).build() + values = self.raw_values_of(params) + self.check_even_distribution(values, lambda x: math.log10(x)) + + def test_param_cardinality(self): + num_random_params = 7 + values = [1, 2, 3] + self.to_test.addRandom(self.dummy_params.test_param, 1, 10, num_random_params) + self.to_test.addGrid(self.dummy_params.another_test_param, values) + self.assertEqual(len(self.to_test.build()), num_random_params * len(values)) + def test_add_random_integer_logarithmic_range(self): self.check_addLog10Random_ranges(100, 200, int) diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py index 61c7b2f91930..85174c8cd02f 100644 --- a/python/pyspark/ml/tuning.py +++ b/python/pyspark/ml/tuning.py @@ -186,7 +186,8 @@ def addLog10Random(self, param, x, y, n): [1.6, 65.3, 221.9, 1024.3, 8997.5] """ def logarithmic_random(): - value = math.log10(random.uniform(10 ** x, 10 ** y)) + rand = random.uniform(math.log10(x), math.log10(y)) + value = 10 ** rand if type(x) == int and type(y) == int: value = int(value) return value diff --git a/python/pyspark/ml/tuning.pyi b/python/pyspark/ml/tuning.pyi index 0a419d1c86c1..028cebdccac9 100644 --- a/python/pyspark/ml/tuning.pyi +++ b/python/pyspark/ml/tuning.pyi @@ -37,12 +37,6 @@ class ParamGridBuilder: class ParamRandomBuilder(ParamGridBuilder): def __init__(self) -> None: ... - def addGrid(self, param: Param, values: List[Any]) -> ParamGridBuilder: ... - @overload - def baseOn(self, __args: ParamMap) -> ParamGridBuilder: ... 
-    @overload
-    def baseOn(self, *args: Tuple[Param, Any]) -> ParamGridBuilder: ...
-    def build(self) -> List[ParamMap]: ...
     def addRandom(self, param: Param, x: Any, y: Any, n: int) -> ParamRandomBuilder: ...
     def addLog10Random(self, param: Param, x: Any, y: Any, n: int) -> ParamRandomBuilder: ...
 
 class _ValidatorParams(HasSeed):
     estimator: Param[Estimator]
From ddfe4a9ba7c7d100f5a0d3287a9001cd7fb4e325 Mon Sep 17 00:00:00 2001
From: Phillip Henry
Date: Fri, 26 Feb 2021 13:34:44 +0000
Subject: [PATCH 47/47] [SPARK-34415][ML] Python example.

---
 docs/ml-tuning.md                           | 4 ++++
 python/docs/source/reference/pyspark.ml.rst | 1 +
 2 files changed, 5 insertions(+)

diff --git a/docs/ml-tuning.md b/docs/ml-tuning.md
index 40243e58d14e..e7940a349368 100644
--- a/docs/ml-tuning.md
+++ b/docs/ml-tuning.md
@@ -101,6 +101,10 @@ Refer to the [`ParamRandomBuilder` Java docs](api/java/org/apache/spark/ml/tunin
 
 Python users are recommended to look at Python libraries that are specifically for hyperparameter tuning such as Hyperopt.
 
+Refer to the [`ParamRandomBuilder` Python docs](api/python/reference/api/pyspark.ml.tuning.ParamRandomBuilder.html) for details on the API.
+
+{% include_example python/ml/model_selection_random_hyperparameters_example.py %}
+
diff --git a/python/docs/source/reference/pyspark.ml.rst b/python/docs/source/reference/pyspark.ml.rst index 7837d609ecb9..fc6060c979d1 100644 --- a/python/docs/source/reference/pyspark.ml.rst +++ b/python/docs/source/reference/pyspark.ml.rst @@ -288,6 +288,7 @@ Tuning :toctree: api/ ParamGridBuilder + ParamRandomBuilder CrossValidator CrossValidatorModel TrainValidationSplit