mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala
@@ -19,17 +19,18 @@ package org.apache.spark.ml.clustering

import scala.language.existentials

import org.apache.spark.{SparkException, SparkFunSuite}
import org.apache.spark.SparkException
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils}
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.clustering.DistanceMeasure
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.Dataset

class BisectingKMeansSuite
extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

class BisectingKMeansSuite extends MLTest with DefaultReadWriteTest {

import testImplicits._

final val k = 5
@transient var dataset: Dataset[_] = _
@@ -68,10 +69,13 @@ class BisectingKMeansSuite

// Verify fit does not fail on very sparse data
val model = bkm.fit(sparseDataset)
val result = model.transform(sparseDataset)
val numClusters = result.select("prediction").distinct().collect().length
// Verify we hit the edge case
assert(numClusters < k && numClusters > 1)

testTransformerByGlobalCheckFunc[Tuple1[Vector]](sparseDataset.toDF(), model, "prediction") {
rows =>
val numClusters = rows.distinct.length
// Verify we hit the edge case
assert(numClusters < k && numClusters > 1)
}
}

test("setter/getter") {
@@ -104,19 +108,16 @@
val bkm = new BisectingKMeans().setK(k).setPredictionCol(predictionColName).setSeed(1)
val model = bkm.fit(dataset)
assert(model.clusterCenters.length === k)

val transformed = model.transform(dataset)
val expectedColumns = Array("features", predictionColName)
expectedColumns.foreach { column =>
assert(transformed.columns.contains(column))
}
val clusters =
transformed.select(predictionColName).rdd.map(_.getInt(0)).distinct().collect().toSet
assert(clusters.size === k)
assert(clusters === Set(0, 1, 2, 3, 4))
assert(model.computeCost(dataset) < 0.1)
assert(model.hasParent)

testTransformerByGlobalCheckFunc[Tuple1[Vector]](dataset.toDF(), model,
"features", predictionColName) { rows =>
val clusters = rows.map(_.getAs[Int](predictionColName)).toSet
assert(clusters.size === k)
assert(clusters === Set(0, 1, 2, 3, 4))
}

// Check validity of model summary
val numRows = dataset.count()
assert(model.hasSummary)
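Note on the pattern introduced above: `testTransformerByGlobalCheckFunc`, provided by the `MLTest` trait these suites now extend, collects every output row of the transformed dataset and hands them to the check function at once, so cross-row properties such as the set of distinct cluster assignments can be asserted. A minimal, hypothetical sketch of the pattern (the suite name, toy data, and k = 2 are illustrative only and not taken from this PR; the helper's signature is inferred from the calls above):

```scala
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.util.MLTest

// Hypothetical suite illustrating the global-check pattern.
class ExampleClusteringSuite extends MLTest {

  import testImplicits._

  test("every cluster index is used") {
    // Toy dataset: two well-separated groups of points.
    val df = Seq(
      Tuple1(Vectors.dense(0.0, 0.0)),
      Tuple1(Vectors.dense(0.1, 0.1)),
      Tuple1(Vectors.dense(9.0, 9.0)),
      Tuple1(Vectors.dense(9.1, 9.1))
    ).toDF("features")

    val model = new KMeans().setK(2).setSeed(1).fit(df)

    // The check function sees all output rows (here only the "prediction" column),
    // so assertions can span the whole result rather than a single row.
    testTransformerByGlobalCheckFunc[Tuple1[Vector]](df, model, "prediction") { rows =>
      val clusters = rows.map(_.getInt(0)).toSet
      assert(clusters === Set(0, 1))
    }
  }
}
```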
mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
@@ -23,16 +23,15 @@ import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.{DenseMatrix, Matrices, Vector, Vectors}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.stat.distribution.MultivariateGaussian
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils}
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.apache.spark.sql.{Dataset, Row}

class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
with DefaultReadWriteTest {

import testImplicits._
class GaussianMixtureSuite extends MLTest with DefaultReadWriteTest {

import GaussianMixtureSuite._
import testImplicits._

final val k = 5
private val seed = 538009335
@@ -119,15 +118,10 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
assert(model.weights.length === k)
assert(model.gaussians.length === k)

val transformed = model.transform(dataset)
val expectedColumns = Array("features", predictionColName, probabilityColName)
expectedColumns.foreach { column =>
assert(transformed.columns.contains(column))
}

// Check prediction matches the highest probability, and probabilities sum to one.
transformed.select(predictionColName, probabilityColName).collect().foreach {
case Row(pred: Int, prob: Vector) =>
testTransformer[Tuple1[Vector]](dataset.toDF(), model,
"features", predictionColName, probabilityColName) {
case Row(_, pred: Int, prob: Vector) =>
val probArray = prob.toArray
val predFromProb = probArray.zipWithIndex.maxBy(_._1)._2
assert(pred === predFromProb)
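Note on `testTransformer`, used above: it invokes the check once per output row, and the `Row` passed to the partial function carries the requested columns in the order they are listed, which is why the patterns here skip the leading "features" value with `_`. A hypothetical sketch of the pattern (toy data and suite name are illustrative only):

```scala
import org.apache.spark.ml.clustering.GaussianMixture
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.util.MLTest
import org.apache.spark.sql.Row

// Hypothetical suite illustrating the per-row check pattern.
class ExampleGaussianMixtureSuite extends MLTest {

  import testImplicits._

  test("prediction matches the arg-max of the probability vector") {
    val df = Seq(-1.0, 0.0, 1.0, 10.0, 11.0, 12.0)
      .map(x => Tuple1(Vectors.dense(x)))
      .toDF("features")

    val model = new GaussianMixture().setK(2).setSeed(1).fit(df)

    // One invocation per output row; the columns appear in the order requested below,
    // so "features" is matched with `_` and only prediction/probability are checked.
    testTransformer[Tuple1[Vector]](df, model, "features", "prediction", "probability") {
      case Row(_, pred: Int, prob: Vector) =>
        // The hard assignment should be the index of the largest soft probability.
        assert(pred === prob.toArray.zipWithIndex.maxBy(_._1)._2)
    }
  }
}
```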
mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
@@ -22,20 +22,21 @@ import scala.util.Random

import org.dmg.pmml.{ClusteringModel, PMML}

import org.apache.spark.{SparkException, SparkFunSuite}
import org.apache.spark.SparkException
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util._
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils, PMMLReadWriteTest}
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.clustering.{DistanceMeasure, KMeans => MLlibKMeans, KMeansModel => MLlibKMeansModel}
import org.apache.spark.mllib.clustering.{DistanceMeasure, KMeans => MLlibKMeans,
KMeansModel => MLlibKMeansModel}
import org.apache.spark.mllib.linalg.{Vectors => MLlibVectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

private[clustering] case class TestRow(features: Vector)

class KMeansSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest
with PMMLReadWriteTest {
class KMeansSuite extends MLTest with DefaultReadWriteTest with PMMLReadWriteTest {

import testImplicits._

final val k = 5
@transient var dataset: Dataset[_] = _
@@ -109,15 +110,13 @@ class KMeansSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultR
val model = kmeans.fit(dataset)
assert(model.clusterCenters.length === k)

val transformed = model.transform(dataset)
val expectedColumns = Array("features", predictionColName)
expectedColumns.foreach { column =>
assert(transformed.columns.contains(column))
testTransformerByGlobalCheckFunc[Tuple1[Vector]](dataset.toDF(), model,
"features", predictionColName) { rows =>
val clusters = rows.map(_.getAs[Int](predictionColName)).toSet
assert(clusters.size === k)
assert(clusters === Set(0, 1, 2, 3, 4))
}
val clusters =
transformed.select(predictionColName).rdd.map(_.getInt(0)).distinct().collect().toSet
assert(clusters.size === k)
assert(clusters === Set(0, 1, 2, 3, 4))

assert(model.computeCost(dataset) < 0.1)
assert(model.hasParent)

@@ -149,9 +148,7 @@ class KMeansSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultR
model.setFeaturesCol(featuresColName).setPredictionCol(predictionColName)

val transformed = model.transform(dataset.withColumnRenamed("features", featuresColName))
Seq(featuresColName, predictionColName).foreach { column =>
assert(transformed.columns.contains(column))
}
assert(transformed.schema.fieldNames.toSet === Set(featuresColName, predictionColName))
assert(model.getFeaturesCol == featuresColName)
assert(model.getPredictionCol == predictionColName)
}
mllib/src/test/scala/org/apache/spark/ml/clustering/LDASuite.scala (7 additions, 14 deletions)
@@ -21,11 +21,9 @@ import scala.language.existentials

import org.apache.hadoop.fs.Path

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils}
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql._

object LDASuite {
@@ -61,7 +59,7 @@ object LDASuite {
}


class LDASuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {
class LDASuite extends MLTest with DefaultReadWriteTest {

import testImplicits._

@@ -186,16 +184,11 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext with DefaultRead
assert(model.topicsMatrix.numCols === k)
assert(!model.isDistributed)

// transform()
val transformed = model.transform(dataset)
val expectedColumns = Array("features", lda.getTopicDistributionCol)
expectedColumns.foreach { column =>
assert(transformed.columns.contains(column))
}
transformed.select(lda.getTopicDistributionCol).collect().foreach { r =>
val topicDistribution = r.getAs[Vector](0)
assert(topicDistribution.size === k)
assert(topicDistribution.toArray.forall(w => w >= 0.0 && w <= 1.0))
testTransformer[Tuple1[Vector]](dataset.toDF(), model,
"features", lda.getTopicDistributionCol) {
case Row(_, topicDistribution: Vector) =>
assert(topicDistribution.size === k)
assert(topicDistribution.toArray.forall(w => w >= 0.0 && w <= 1.0))
}

// logLikelihood, logPerplexity
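The LDA hunk above applies the same per-row helper to the topic distribution column. Since a topic distribution is a probability vector, one could additionally check that its entries sum to (approximately) one. The following is a sketch only, not part of this PR; it reuses the suite's `dataset`, `model`, `lda`, and `k` from the surrounding test:

```scala
// Sketch: extra sum-to-one assertion on top of the checks already made in the suite.
testTransformer[Tuple1[Vector]](dataset.toDF(), model,
    "features", lda.getTopicDistributionCol) {
  case Row(_, topicDistribution: Vector) =>
    assert(topicDistribution.size === k)
    assert(topicDistribution.toArray.forall(w => w >= 0.0 && w <= 1.0))
    // Entries of a probability distribution should sum to (approximately) one.
    assert(math.abs(topicDistribution.toArray.sum - 1.0) < 1e-6)
}
```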