From 49a7efb1aca16491931a65e195ea403de290e280 Mon Sep 17 00:00:00 2001
From: Ievgen Prokhorenko
Date: Thu, 18 Jul 2019 09:30:19 +0200
Subject: [PATCH] [MLLIB] Use TestingUtils to compare floating point values

---
 .../evaluation/MultilabelMetricsSuite.scala   | 37 ++++++++++---------
 .../mllib/fpm/AssociationRulesSuite.scala     |  5 ++-
 .../spark/mllib/fpm/FPGrowthSuite.scala       |  3 +-
 .../distributed/IndexedRowMatrixSuite.scala   |  3 +-
 .../random/RandomDataGeneratorSuite.scala     |  6 +--
 .../spark/mllib/random/RandomRDDsSuite.scala  | 11 +++---
 .../spark/mllib/stat/CorrelationSuite.scala   | 15 ++++----
 .../spark/mllib/stat/KernelDensitySuite.scala | 13 ++++---
 .../spark/mllib/tree/EnsembleTestHelper.scala |  5 ++-
 9 files changed, 52 insertions(+), 46 deletions(-)

diff --git a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/MultilabelMetricsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/MultilabelMetricsSuite.scala
index a660492c7ae5..03afd29e4750 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/MultilabelMetricsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/MultilabelMetricsSuite.scala
@@ -19,6 +19,7 @@ package org.apache.spark.mllib.evaluation
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.mllib.util.TestingUtils._
 import org.apache.spark.rdd.RDD
 
 class MultilabelMetricsSuite extends SparkFunSuite with MLlibTestSparkContext {
@@ -79,24 +80,24 @@ class MultilabelMetricsSuite extends SparkFunSuite with MLlibTestSparkContext {
     val hammingLoss = (1.0 / (7 * 3)) * (2 + 2 + 1 + 0 + 0 + 1 + 1)
     val strictAccuracy = 2.0 / 7
     val accuracy = 1.0 / 7 * (1.0 / 3 + 1.0 /3 + 0 + 1.0 / 1 + 2.0 / 2 + 2.0 / 3 + 1.0 / 2)
-    assert(math.abs(metrics.precision(0.0) - precision0) < delta)
-    assert(math.abs(metrics.precision(1.0) - precision1) < delta)
-    assert(math.abs(metrics.precision(2.0) - precision2) < delta)
-    assert(math.abs(metrics.recall(0.0) - recall0) < delta)
-    assert(math.abs(metrics.recall(1.0) - recall1) < delta)
-    assert(math.abs(metrics.recall(2.0) - recall2) < delta)
-    assert(math.abs(metrics.f1Measure(0.0) - f1measure0) < delta)
-    assert(math.abs(metrics.f1Measure(1.0) - f1measure1) < delta)
-    assert(math.abs(metrics.f1Measure(2.0) - f1measure2) < delta)
-    assert(math.abs(metrics.microPrecision - microPrecisionClass) < delta)
-    assert(math.abs(metrics.microRecall - microRecallClass) < delta)
-    assert(math.abs(metrics.microF1Measure - microF1MeasureClass) < delta)
-    assert(math.abs(metrics.precision - macroPrecisionDoc) < delta)
-    assert(math.abs(metrics.recall - macroRecallDoc) < delta)
-    assert(math.abs(metrics.f1Measure - macroF1MeasureDoc) < delta)
-    assert(math.abs(metrics.hammingLoss - hammingLoss) < delta)
-    assert(math.abs(metrics.subsetAccuracy - strictAccuracy) < delta)
-    assert(math.abs(metrics.accuracy - accuracy) < delta)
+    assert(metrics.precision(0.0) ~== precision0 absTol delta)
+    assert(metrics.precision(1.0) ~== precision1 absTol delta)
+    assert(metrics.precision(2.0) ~== precision2 absTol delta)
+    assert(metrics.recall(0.0) ~== recall0 absTol delta)
+    assert(metrics.recall(1.0) ~== recall1 absTol delta)
+    assert(metrics.recall(2.0) ~== recall2 absTol delta)
+    assert(metrics.f1Measure(0.0) ~== f1measure0 absTol delta)
+    assert(metrics.f1Measure(1.0) ~== f1measure1 absTol delta)
+    assert(metrics.f1Measure(2.0) ~== f1measure2 absTol delta)
+    assert(metrics.microPrecision ~== microPrecisionClass absTol delta)
+    assert(metrics.microRecall ~== microRecallClass absTol delta)
+    assert(metrics.microF1Measure ~== microF1MeasureClass absTol delta)
+    assert(metrics.precision ~== macroPrecisionDoc absTol delta)
+    assert(metrics.recall ~== macroRecallDoc absTol delta)
+    assert(metrics.f1Measure ~== macroF1MeasureDoc absTol delta)
+    assert(metrics.hammingLoss ~== hammingLoss absTol delta)
+    assert(metrics.subsetAccuracy ~== strictAccuracy absTol delta)
+    assert(metrics.accuracy ~== accuracy absTol delta)
     assert(metrics.labels.sameElements(Array(0.0, 1.0, 2.0)))
   }
 }
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/fpm/AssociationRulesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/fpm/AssociationRulesSuite.scala
index dcb1f398b04b..26a75699248d 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/fpm/AssociationRulesSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/fpm/AssociationRulesSuite.scala
@@ -18,6 +18,7 @@ package org.apache.spark.mllib.fpm
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.mllib.util.TestingUtils._
 
 class AssociationRulesSuite extends SparkFunSuite with MLlibTestSparkContext {
 
@@ -63,7 +64,7 @@ class AssociationRulesSuite extends SparkFunSuite with MLlibTestSparkContext {
        [1] 23
      */
     assert(results1.size === 23)
-    assert(results1.count(rule => math.abs(rule.confidence - 1.0D) < 1e-6) == 23)
+    assert(results1.count(rule => rule.confidence ~= 1.0D absTol 1e-6) == 23)
 
     val results2 = ar
       .setMinConfidence(0)
@@ -84,7 +85,7 @@ class AssociationRulesSuite extends SparkFunSuite with MLlibTestSparkContext {
        [1] 23
      */
     assert(results2.size === 30)
-    assert(results2.count(rule => math.abs(rule.confidence - 1.0D) < 1e-6) == 23)
+    assert(results2.count(rule => rule.confidence ~= 1.0D absTol 1e-6) == 23)
   }
 }
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala
index 20bd2e5e0dc1..fa8f03be089c 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala
@@ -18,6 +18,7 @@ package org.apache.spark.mllib.fpm
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.mllib.util.TestingUtils._
 import org.apache.spark.util.Utils
 
 class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
@@ -172,7 +173,7 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
       .collect()
 
     assert(rules.size === 23)
-    assert(rules.count(rule => math.abs(rule.confidence - 1.0D) < 1e-6) == 23)
+    assert(rules.count(rule => rule.confidence ~= 1.0D absTol 1e-6) == 23)
   }
 
   test("FP-Growth using Int type") {
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala
index 566ce95be084..cca4eb4e4260 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala
@@ -22,6 +22,7 @@ import breeze.linalg.{diag => brzDiag, DenseMatrix => BDM, DenseVector => BDV}
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg._
 import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.mllib.util.TestingUtils._
 import org.apache.spark.rdd.RDD
 
 class IndexedRowMatrixSuite extends SparkFunSuite with MLlibTestSparkContext {
@@ -238,7 +239,7 @@ class IndexedRowMatrixSuite extends SparkFunSuite with MLlibTestSparkContext {
 
     for (i <- 0 until n; j <- i + 1 until n) {
       val trueResult = gram(i, j) / scala.math.sqrt(gram(i, i) * gram(j, j))
-      assert(math.abs(G(i, j) - trueResult) < 1e-6)
+      assert(G(i, j) ~== trueResult absTol 1e-6)
     }
   }
 
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/random/RandomDataGeneratorSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/random/RandomDataGeneratorSuite.scala
index e30ad159676f..8011026e6fa6 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/random/RandomDataGeneratorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/random/RandomDataGeneratorSuite.scala
@@ -20,9 +20,9 @@ package org.apache.spark.mllib.random
 import org.apache.commons.math3.special.Gamma
 
 import org.apache.spark.SparkFunSuite
+import org.apache.spark.mllib.util.TestingUtils._
 import org.apache.spark.util.StatCounter
 
-// TODO update tests to use TestingUtils for floating point comparison after PR 1367 is merged
 class RandomDataGeneratorSuite extends SparkFunSuite {
 
   def apiChecks(gen: RandomDataGenerator[Double]) {
@@ -61,8 +61,8 @@ class RandomDataGeneratorSuite extends SparkFunSuite {
       gen.setSeed(seed.toLong)
       val sample = (0 until 100000).map { _ => gen.nextValue()}
       val stats = new StatCounter(sample)
-      assert(math.abs(stats.mean - mean) < epsilon)
-      assert(math.abs(stats.stdev - stddev) < epsilon)
+      assert(stats.mean ~== mean absTol epsilon)
+      assert(stats.stdev ~== stddev absTol epsilon)
     }
   }
 
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/random/RandomRDDsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/random/RandomRDDsSuite.scala
index f464d25c3fbd..9b4dc29d326a 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/random/RandomRDDsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/random/RandomRDDsSuite.scala
@@ -23,14 +23,13 @@ import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.linalg.Vector
 import org.apache.spark.mllib.rdd.{RandomRDD, RandomRDDPartition}
 import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.mllib.util.TestingUtils._
 import org.apache.spark.rdd.RDD
 import org.apache.spark.util.StatCounter
 
 /*
  * Note: avoid including APIs that do not set the seed for the RNG in unit tests
  * in order to guarantee deterministic behavior.
- *
- * TODO update tests to use TestingUtils for floating point comparison after PR 1367 is merged
  */
 class RandomRDDsSuite extends SparkFunSuite with MLlibTestSparkContext with Serializable {
 
@@ -43,8 +42,8 @@ class RandomRDDsSuite extends SparkFunSuite with MLlibTestSparkContext with Seri
     val stats = rdd.stats()
     assert(expectedSize === stats.count)
     assert(expectedNumPartitions === rdd.partitions.size)
-    assert(math.abs(stats.mean - expectedMean) < epsilon)
-    assert(math.abs(stats.stdev - expectedStddev) < epsilon)
+    assert(stats.mean ~== expectedMean absTol epsilon)
+    assert(stats.stdev ~== expectedStddev absTol epsilon)
   }
 
   // assume test RDDs are small
@@ -63,8 +62,8 @@ class RandomRDDsSuite extends SparkFunSuite with MLlibTestSparkContext with Seri
     }}
     assert(expectedRows === values.size / expectedColumns)
     val stats = new StatCounter(values)
-    assert(math.abs(stats.mean - expectedMean) < epsilon)
-    assert(math.abs(stats.stdev - expectedStddev) < epsilon)
+    assert(stats.mean ~== expectedMean absTol epsilon)
+    assert(stats.stdev ~== expectedStddev absTol epsilon)
   }
 
   test("RandomRDD sizes") {
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/stat/CorrelationSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/stat/CorrelationSuite.scala
index e32767edb17a..4613f7fb6f40 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/stat/CorrelationSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/stat/CorrelationSuite.scala
@@ -26,6 +26,7 @@ import org.apache.spark.mllib.random.RandomRDDs
 import org.apache.spark.mllib.stat.correlation.{Correlations, PearsonCorrelation,
   SpearmanCorrelation}
 import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.mllib.util.TestingUtils._
 
 class CorrelationSuite extends SparkFunSuite with MLlibTestSparkContext with Logging {
 
@@ -57,15 +58,15 @@ class CorrelationSuite extends SparkFunSuite with MLlibTestSparkContext with Log
     val expected = 0.6546537
     val default = Statistics.corr(x, y)
     val p1 = Statistics.corr(x, y, "pearson")
-    assert(approxEqual(expected, default))
-    assert(approxEqual(expected, p1))
+    assert(expected ~== default absTol 1e-6)
+    assert(expected ~== p1 absTol 1e-6)
 
     // numPartitions >= size for input RDDs
     for (numParts <- List(xData.size, xData.size * 2)) {
       val x1 = sc.parallelize(xData, numParts)
       val y1 = sc.parallelize(yData, numParts)
       val p2 = Statistics.corr(x1, y1)
-      assert(approxEqual(expected, p2))
+      assert(expected ~== p2 absTol 1e-6)
     }
 
     // RDD of zero variance
@@ -78,14 +79,14 @@ class CorrelationSuite extends SparkFunSuite with MLlibTestSparkContext with Log
     val y = sc.parallelize(yData)
     val expected = 0.5
     val s1 = Statistics.corr(x, y, "spearman")
-    assert(approxEqual(expected, s1))
+    assert(expected ~== s1 absTol 1e-6)
 
     // numPartitions >= size for input RDDs
     for (numParts <- List(xData.size, xData.size * 2)) {
       val x1 = sc.parallelize(xData, numParts)
       val y1 = sc.parallelize(yData, numParts)
       val s2 = Statistics.corr(x1, y1, "spearman")
-      assert(approxEqual(expected, s2))
+      assert(expected ~== s2 absTol 1e-6)
     }
 
     // RDD of zero variance => zero variance in ranks
@@ -141,14 +142,14 @@ class CorrelationSuite extends SparkFunSuite with MLlibTestSparkContext with Log
     val a = RandomRDDs.normalRDD(sc, 100000, 10).map(_ + 1000000000.0)
     val b = RandomRDDs.normalRDD(sc, 100000, 10).map(_ + 1000000000.0)
     val p = Statistics.corr(a, b, method = "pearson")
-    assert(approxEqual(p, 0.0, 0.01))
+    assert(p ~== 0.0 absTol 0.01)
   }
 
   def approxEqual(v1: Double, v2: Double, threshold: Double = 1e-6): Boolean = {
     if (v1.isNaN) {
       v2.isNaN
     } else {
-      math.abs(v1 - v2) <= threshold
+      v1 ~== v2 absTol threshold
     }
   }
 
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/stat/KernelDensitySuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/stat/KernelDensitySuite.scala
index 5feccdf33681..9cbb3d0024da 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/stat/KernelDensitySuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/stat/KernelDensitySuite.scala
@@ -21,6 +21,7 @@ import org.apache.commons.math3.distribution.NormalDistribution
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.mllib.util.TestingUtils._
 
 class KernelDensitySuite extends SparkFunSuite with MLlibTestSparkContext {
   test("kernel density single sample") {
@@ -29,8 +30,8 @@ class KernelDensitySuite extends SparkFunSuite with MLlibTestSparkContext {
     val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
     val normal = new NormalDistribution(5.0, 3.0)
     val acceptableErr = 1e-6
-    assert(math.abs(densities(0) - normal.density(5.0)) < acceptableErr)
-    assert(math.abs(densities(1) - normal.density(6.0)) < acceptableErr)
+    assert(densities(0) ~== normal.density(5.0) absTol acceptableErr)
+    assert(densities(1) ~== normal.density(6.0) absTol acceptableErr)
   }
 
   test("kernel density multiple samples") {
@@ -40,9 +41,9 @@ class KernelDensitySuite extends SparkFunSuite with MLlibTestSparkContext {
     val normal1 = new NormalDistribution(5.0, 3.0)
     val normal2 = new NormalDistribution(10.0, 3.0)
     val acceptableErr = 1e-6
-    assert(math.abs(
-      densities(0) - (normal1.density(5.0) + normal2.density(5.0)) / 2) < acceptableErr)
-    assert(math.abs(
-      densities(1) - (normal1.density(6.0) + normal2.density(6.0)) / 2) < acceptableErr)
+    assert(
+      densities(0) ~== ((normal1.density(5.0) + normal2.density(5.0)) / 2) absTol acceptableErr)
+    assert(
+      densities(1) ~== ((normal1.density(6.0) + normal2.density(6.0)) / 2) absTol acceptableErr)
   }
 }
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/EnsembleTestHelper.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/EnsembleTestHelper.scala
index 1cc8f342021a..d43e62bb6553 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/tree/EnsembleTestHelper.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/tree/EnsembleTestHelper.scala
@@ -22,6 +22,7 @@ import scala.collection.mutable
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.tree.model.TreeEnsembleModel
+import org.apache.spark.mllib.util.TestingUtils._
 import org.apache.spark.util.StatCounter
 
 object EnsembleTestHelper {
@@ -43,8 +44,8 @@ object EnsembleTestHelper {
       values ++= row
     }
     val stats = new StatCounter(values)
-    assert(math.abs(stats.mean - expectedMean) < epsilon)
-    assert(math.abs(stats.stdev - expectedStddev) < epsilon)
+    assert(stats.mean ~== expectedMean absTol epsilon)
+    assert(stats.stdev ~== expectedStddev absTol epsilon)
   }
 
   def validateClassifier(
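
Note for reviewers (illustration only, not part of the commit): the rewritten
asserts lean on the `~==` and `~=` operators imported from
org.apache.spark.mllib.util.TestingUtils. Below is a minimal sketch of the
difference between the two, assuming the comparison implicits behave as in
Spark's TestingUtils; `delta`, `measured`, and `confidences` are made-up
values for illustration and do not appear in the patch:

    import org.apache.spark.mllib.util.TestingUtils._

    val delta = 1e-6
    val measured = 1.0 + 1e-8

    // `~==` is intended for use inside assert(...): when the two values
    // differ by more than the tolerance it fails the test with a message
    // naming both values and the tolerance, rather than a bare `false`.
    assert(measured ~== 1.0 absTol delta)

    // `~=` returns a plain Boolean, so it composes with count(...); this is
    // why the rule-confidence predicates above use `~=` instead of `~==`.
    val confidences = Seq(1.0, 1.0 - 1e-9, 0.5)
    assert(confidences.count(c => c ~= 1.0 absTol delta) == 2)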