mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala
@@ -20,13 +20,14 @@ package org.apache.spark.ml.clustering
import org.apache.spark.{SparkException, SparkFunSuite}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils}
import org.apache.spark.mllib.clustering.DistanceMeasure
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.Dataset

class BisectingKMeansSuite
extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {
class BisectingKMeansSuite extends MLTest with DefaultReadWriteTest {

import Encoders._
Review comment (Member): import testImplicits._ instead

final val k = 5
@transient var dataset: Dataset[_] = _
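
A minimal sketch of the change the comment above asks for (assuming, as the later review comments suggest, that MLTest exposes the same testImplicits helper as MLlibTestSparkContext): the suite drops the hand-rolled Encoders helper and pulls encoders from the test SparkSession instead.

    // Hypothetical replacement for `import Encoders._`: spark.implicits can derive
    // an Encoder for a Product type such as Tuple1[Vector], so no custom
    // ExpressionEncoder[Vector] is needed.
    import testImplicits._
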
@@ -65,10 +66,12 @@ class BisectingKMeansSuite

// Verify fit does not fail on very sparse data
val model = bkm.fit(sparseDataset)
val result = model.transform(sparseDataset)
val numClusters = result.select("prediction").distinct().collect().length
// Verify we hit the edge case
assert(numClusters < k && numClusters > 1)

testTransformerByGlobalCheckFunc[Vector](sparseDataset.toDF(), model, "prediction") { rows =>
Review comment (Member): Use Tuple1[Vector] instead of Vector

val numClusters = rows.distinct.length
// Verify we hit the edge case
assert(numClusters < k && numClusters > 1)
}
}
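
For illustration, a sketch of the Tuple1 form the reviewer suggests for the call above (a hypothetical revision; it assumes testImplicits._ is in scope so the Encoder for Tuple1[Vector] is derived implicitly rather than taken from the custom Encoders object):

    testTransformerByGlobalCheckFunc[Tuple1[Vector]](sparseDataset.toDF(), model,
        "prediction") { rows =>
      // rows holds only the "prediction" column, one value per input vector
      val numClusters = rows.distinct.length
      // Verify we hit the edge case
      assert(numClusters < k && numClusters > 1)
    }
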

test("setter/getter") {
@@ -102,17 +105,14 @@ class BisectingKMeansSuite
val model = bkm.fit(dataset)
assert(model.clusterCenters.length === k)

val transformed = model.transform(dataset)
val expectedColumns = Array("features", predictionColName)
expectedColumns.foreach { column =>
assert(transformed.columns.contains(column))
testTransformerByGlobalCheckFunc[Vector](dataset.toDF(), model,
"features", predictionColName) { rows =>
val clusters = rows.map(_.getAs[Int](predictionColName)).toSet
assert(clusters.size === k)
assert(clusters === Set(0, 1, 2, 3, 4))
assert(model.computeCost(dataset) < 0.1)
Review comment (Member): These checks which do not use "rows" should go outside of testTransformerByGlobalCheckFunc

assert(model.hasParent)
}
val clusters =
transformed.select(predictionColName).rdd.map(_.getInt(0)).distinct().collect().toSet
assert(clusters.size === k)
assert(clusters === Set(0, 1, 2, 3, 4))
assert(model.computeCost(dataset) < 0.1)
assert(model.hasParent)

// Check validity of model summary
val numRows = dataset.count()
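
A hypothetical rearrangement following the comment above: only the assertions that actually read rows stay inside testTransformerByGlobalCheckFunc, while the model-level checks move out.

    testTransformerByGlobalCheckFunc[Tuple1[Vector]](dataset.toDF(), model,
        "features", predictionColName) { rows =>
      val clusters = rows.map(_.getAs[Int](predictionColName)).toSet
      assert(clusters.size === k)
      assert(clusters === Set(0, 1, 2, 3, 4))
    }
    // These checks do not depend on the transformed rows, so they sit outside.
    assert(model.computeCost(dataset) < 0.1)
    assert(model.hasParent)
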
25 changes: 25 additions & 0 deletions mllib/src/test/scala/org/apache/spark/ml/clustering/Encoders.scala
@@ -0,0 +1,25 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.ml.clustering

import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder

private[clustering] object Encoders {
implicit val vectorEncoder = ExpressionEncoder[Vector]()
Review comment (Contributor Author, @smurakozi, Jan 18, 2018): Is there a better solution to provide an implicit Encoder[Vector] for testTransformer? Is it ok here, or is there a better place for it? e.g. org.apache.spark.mllib.util.MLlibTestSparkContext.testImplicits

Review comment (Member): Thanks for asking; you shouldn't need to do this. I'll comment on BisectingKMeansSuite.scala about using testImplicits instead. You basically just need to import testImplicits._ and use Tuple1 for the type param for testTransformer.

}
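
A sketch of the alternative described in the reply above (assumptions: the suite mixes in MLTest, imports testImplicits._, and uses the column names from the BisectingKMeans test): with Tuple1 as the type parameter, spark.implicits supplies the Encoder and this Encoders object becomes unnecessary.

    import testImplicits._

    // Each emitted Row carries the selected columns in order: (features, prediction).
    // Row is assumed to be imported from org.apache.spark.sql, as in the suites above.
    testTransformer[Tuple1[Vector]](dataset.toDF(), model, "features", "prediction") {
      case Row(_, prediction: Int) =>
        assert(prediction >= 0 && prediction < k)
    }
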
mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
@@ -21,16 +21,15 @@ import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.{DenseMatrix, Matrices, Vector, Vectors}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.stat.distribution.MultivariateGaussian
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils}
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Dataset, Row}


class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
with DefaultReadWriteTest {
class GaussianMixtureSuite extends MLTest with DefaultReadWriteTest {

import testImplicits._
import Encoders._
import GaussianMixtureSuite._

final val k = 5
@@ -118,15 +117,10 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
assert(model.weights.length === k)
assert(model.gaussians.length === k)

val transformed = model.transform(dataset)
val expectedColumns = Array("features", predictionColName, probabilityColName)
expectedColumns.foreach { column =>
assert(transformed.columns.contains(column))
}

// Check prediction matches the highest probability, and probabilities sum to one.
transformed.select(predictionColName, probabilityColName).collect().foreach {
case Row(pred: Int, prob: Vector) =>
testTransformer[Vector](dataset.toDF(), model,
"features", predictionColName, probabilityColName) {
case Row(_, pred: Int, prob: Vector) =>
val probArray = prob.toArray
val predFromProb = probArray.zipWithIndex.maxBy(_._1)._2
assert(pred === predFromProb)
mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
@@ -22,14 +22,15 @@ import scala.util.Random
import org.apache.spark.{SparkException, SparkFunSuite}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils}
import org.apache.spark.mllib.clustering.{DistanceMeasure, KMeans => MLlibKMeans}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

private[clustering] case class TestRow(features: Vector)

class KMeansSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {
class KMeansSuite extends MLTest with DefaultReadWriteTest {

import Encoders._

final val k = 5
@transient var dataset: Dataset[_] = _
@@ -103,15 +104,13 @@ class KMeansSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultR
val model = kmeans.fit(dataset)
assert(model.clusterCenters.length === k)

val transformed = model.transform(dataset)
val expectedColumns = Array("features", predictionColName)
expectedColumns.foreach { column =>
assert(transformed.columns.contains(column))
testTransformerByGlobalCheckFunc[Vector](dataset.toDF(), model,
"features", predictionColName) { rows =>
val clusters = rows.map(_.getAs[Int](predictionColName)).toSet
assert(clusters.size === k)
assert(clusters === Set(0, 1, 2, 3, 4))
}
val clusters =
transformed.select(predictionColName).rdd.map(_.getInt(0)).distinct().collect().toSet
assert(clusters.size === k)
assert(clusters === Set(0, 1, 2, 3, 4))

assert(model.computeCost(dataset) < 0.1)
assert(model.hasParent)

@@ -143,9 +142,7 @@ class KMeansSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultR
model.setFeaturesCol(featuresColName).setPredictionCol(predictionColName)

val transformed = model.transform(dataset.withColumnRenamed("features", featuresColName))
Seq(featuresColName, predictionColName).foreach { column =>
assert(transformed.columns.contains(column))
}
assert(transformed.schema.fieldNames.toSet === Set(featuresColName, predictionColName))
assert(model.getFeaturesCol == featuresColName)
assert(model.getPredictionCol == predictionColName)
}
23 changes: 8 additions & 15 deletions mllib/src/test/scala/org/apache/spark/ml/clustering/LDASuite.scala
@@ -19,14 +19,11 @@ package org.apache.spark.ml.clustering

import org.apache.hadoop.fs.Path

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils}
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql._


object LDASuite {
def generateLDAData(
spark: SparkSession,
Expand Down Expand Up @@ -60,9 +57,10 @@ object LDASuite {
}


class LDASuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {
class LDASuite extends MLTest with DefaultReadWriteTest {

import testImplicits._
import Encoders._

val k: Int = 5
val vocabSize: Int = 30
@@ -185,16 +183,11 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext with DefaultRead
assert(model.topicsMatrix.numCols === k)
assert(!model.isDistributed)

// transform()
val transformed = model.transform(dataset)
val expectedColumns = Array("features", lda.getTopicDistributionCol)
expectedColumns.foreach { column =>
assert(transformed.columns.contains(column))
}
transformed.select(lda.getTopicDistributionCol).collect().foreach { r =>
val topicDistribution = r.getAs[Vector](0)
assert(topicDistribution.size === k)
assert(topicDistribution.toArray.forall(w => w >= 0.0 && w <= 1.0))
testTransformer[Vector](dataset.toDF(), model,
"features", lda.getTopicDistributionCol) {
case Row(_, topicDistribution: Vector) =>
assert(topicDistribution.size === k)
assert(topicDistribution.toArray.forall(w => w >= 0.0 && w <= 1.0))
}

// logLikelihood, logPerplexity