From 97c96b6ac7f65762cf9f125965e8d2a3cba72f60 Mon Sep 17 00:00:00 2001 From: Sandor Murakozi Date: Thu, 18 Jan 2018 20:03:33 +0100 Subject: [PATCH 1/3] Converted all clustering tests to check streaming --- .../ml/clustering/BisectingKMeansSuite.scala | 35 +++++++++---------- .../apache/spark/ml/clustering/Encoders.scala | 25 +++++++++++++ .../ml/clustering/GaussianMixtureSuite.scala | 17 ++++----- .../spark/ml/clustering/KMeansSuite.scala | 26 ++++++-------- .../apache/spark/ml/clustering/LDASuite.scala | 24 ++++++------- 5 files changed, 69 insertions(+), 58 deletions(-) create mode 100644 mllib/src/test/scala/org/apache/spark/ml/clustering/Encoders.scala diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala index fa7471fa2d658..1830a98ff2655 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala @@ -17,14 +17,15 @@ package org.apache.spark.ml.clustering -import org.apache.spark.SparkFunSuite +import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.param.ParamMap -import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} -import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} import org.apache.spark.sql.Dataset class BisectingKMeansSuite - extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { + extends MLTest with DefaultReadWriteTest { + + import Encoders._ final val k = 5 @transient var dataset: Dataset[_] = _ @@ -63,10 +64,12 @@ class BisectingKMeansSuite // Verify fit does not fail on very sparse data val model = bkm.fit(sparseDataset) - val result = model.transform(sparseDataset) - val numClusters = result.select("prediction").distinct().collect().length - // Verify we hit the edge case - assert(numClusters < k && numClusters > 1) + + testTransformerByGlobalCheckFunc[Vector](sparseDataset.toDF(), model, "prediction") { rows => + val numClusters = rows.distinct.length + // Verify we hit the edge case + assert(numClusters < k && numClusters > 1) + } } test("setter/getter") { @@ -100,17 +103,13 @@ class BisectingKMeansSuite val model = bkm.fit(dataset) assert(model.clusterCenters.length === k) - val transformed = model.transform(dataset) - val expectedColumns = Array("features", predictionColName) - expectedColumns.foreach { column => - assert(transformed.columns.contains(column)) + testTransformerByGlobalCheckFunc[Vector](dataset.toDF(), model, + "features", predictionColName) { rows => + val clusters = rows.map(_.getAs[Int](predictionColName)).toSet + assert(clusters === Set(0, 1, 2, 3, 4)) + assert(model.computeCost(dataset) < 0.1) + assert(model.hasParent) } - val clusters = - transformed.select(predictionColName).rdd.map(_.getInt(0)).distinct().collect().toSet - assert(clusters.size === k) - assert(clusters === Set(0, 1, 2, 3, 4)) - assert(model.computeCost(dataset) < 0.1) - assert(model.hasParent) // Check validity of model summary val numRows = dataset.count() diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/Encoders.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/Encoders.scala new file mode 100644 index 0000000000000..afa72171638e1 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/Encoders.scala @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation 
(ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.clustering + +import org.apache.spark.ml.linalg.Vector +import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder + +private[clustering] object Encoders { + implicit val vectorEncoder = ExpressionEncoder[Vector]() +} diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala index 08b800b7e4183..def269b02963c 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala @@ -21,16 +21,16 @@ import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{DenseMatrix, Matrices, Vector, Vectors} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.stat.distribution.MultivariateGaussian -import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ -import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Dataset, Row} -class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext +class GaussianMixtureSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ + import Encoders._ import GaussianMixtureSuite._ final val k = 5 @@ -118,15 +118,10 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext assert(model.weights.length === k) assert(model.gaussians.length === k) - val transformed = model.transform(dataset) - val expectedColumns = Array("features", predictionColName, probabilityColName) - expectedColumns.foreach { column => - assert(transformed.columns.contains(column)) - } - // Check prediction matches the highest probability, and probabilities sum to one. 
- transformed.select(predictionColName, probabilityColName).collect().foreach { - case Row(pred: Int, prob: Vector) => + testTransformer[Vector](dataset.toDF(), model, + "features", predictionColName, probabilityColName) { + case Row(_, pred: Int, prob: Vector) => val probArray = prob.toArray val predFromProb = probArray.zipWithIndex.maxBy(_._1)._2 assert(pred === predFromProb) diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala index 119fe1dead9a9..60a99daf74d45 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala @@ -19,17 +19,17 @@ package org.apache.spark.ml.clustering import scala.util.Random -import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamMap -import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} import org.apache.spark.mllib.clustering.{KMeans => MLlibKMeans} -import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} private[clustering] case class TestRow(features: Vector) -class KMeansSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { +class KMeansSuite extends MLTest with DefaultReadWriteTest { + + import Encoders._ final val k = 5 @transient var dataset: Dataset[_] = _ @@ -97,15 +97,13 @@ class KMeansSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultR val model = kmeans.fit(dataset) assert(model.clusterCenters.length === k) - val transformed = model.transform(dataset) - val expectedColumns = Array("features", predictionColName) - expectedColumns.foreach { column => - assert(transformed.columns.contains(column)) + testTransformerByGlobalCheckFunc[Vector](dataset.toDF(), model, + "features", predictionColName) { rows => + val clusters = rows.map(_.getAs[Int](predictionColName)).toSet + assert(clusters.size === k) + assert(clusters === Set(0, 1, 2, 3, 4)) } - val clusters = - transformed.select(predictionColName).rdd.map(_.getInt(0)).distinct().collect().toSet - assert(clusters.size === k) - assert(clusters === Set(0, 1, 2, 3, 4)) + assert(model.computeCost(dataset) < 0.1) assert(model.hasParent) @@ -137,9 +135,7 @@ class KMeansSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultR model.setFeaturesCol(featuresColName).setPredictionCol(predictionColName) val transformed = model.transform(dataset.withColumnRenamed("features", featuresColName)) - Seq(featuresColName, predictionColName).foreach { column => - assert(transformed.columns.contains(column)) - } + assert(transformed.schema.fieldNames.toSet === Set(featuresColName, predictionColName)) assert(model.getFeaturesCol == featuresColName) assert(model.getPredictionCol == predictionColName) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/LDASuite.scala index e73bbc18d76bd..d3f627123c09e 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/clustering/LDASuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/LDASuite.scala @@ -19,12 +19,11 @@ package org.apache.spark.ml.clustering import org.apache.hadoop.fs.Path -import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} -import 
org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ -import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder object LDASuite { @@ -60,10 +59,12 @@ object LDASuite { } -class LDASuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { +class LDASuite extends MLTest with DefaultReadWriteTest { import testImplicits._ + implicit val vectorEncoder = ExpressionEncoder[Vector]() + val k: Int = 5 val vocabSize: Int = 30 @transient var dataset: Dataset[_] = _ @@ -185,16 +186,11 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext with DefaultRead assert(model.topicsMatrix.numCols === k) assert(!model.isDistributed) - // transform() - val transformed = model.transform(dataset) - val expectedColumns = Array("features", lda.getTopicDistributionCol) - expectedColumns.foreach { column => - assert(transformed.columns.contains(column)) - } - transformed.select(lda.getTopicDistributionCol).collect().foreach { r => - val topicDistribution = r.getAs[Vector](0) - assert(topicDistribution.size === k) - assert(topicDistribution.toArray.forall(w => w >= 0.0 && w <= 1.0)) + testTransformer[Vector](dataset.toDF(), model, + "features", lda.getTopicDistributionCol) { + case Row(_, topicDistribution: Vector) => + assert(topicDistribution.size === k) + assert(topicDistribution.toArray.forall(w => w >= 0.0 && w <= 1.0)) } // logLikelihood, logPerplexity From b6e06e8e280f97560a342e287072f0b49e85bb79 Mon Sep 17 00:00:00 2001 From: Sandor Murakozi Date: Thu, 18 Jan 2018 21:12:46 +0100 Subject: [PATCH 2/3] formatting, nits --- .../apache/spark/ml/clustering/BisectingKMeansSuite.scala | 4 ++-- .../apache/spark/ml/clustering/GaussianMixtureSuite.scala | 3 +-- .../test/scala/org/apache/spark/ml/clustering/LDASuite.scala | 5 +---- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala index 1830a98ff2655..dde60ef2a2fa0 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala @@ -22,8 +22,7 @@ import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} import org.apache.spark.sql.Dataset -class BisectingKMeansSuite - extends MLTest with DefaultReadWriteTest { +class BisectingKMeansSuite extends MLTest with DefaultReadWriteTest { import Encoders._ @@ -106,6 +105,7 @@ class BisectingKMeansSuite testTransformerByGlobalCheckFunc[Vector](dataset.toDF(), model, "features", predictionColName) { rows => val clusters = rows.map(_.getAs[Int](predictionColName)).toSet + assert(clusters.size === k) assert(clusters === Set(0, 1, 2, 3, 4)) assert(model.computeCost(dataset) < 0.1) assert(model.hasParent) diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala index def269b02963c..dfe6fa9b60c8d 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala @@ -26,8 +26,7 @@ import 
org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.sql.{Dataset, Row} -class GaussianMixtureSuite extends MLTest - with DefaultReadWriteTest { +class GaussianMixtureSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ import Encoders._ diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/LDASuite.scala index d3f627123c09e..49cf500736f34 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/clustering/LDASuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/LDASuite.scala @@ -23,8 +23,6 @@ import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder - object LDASuite { def generateLDAData( @@ -62,8 +60,7 @@ object LDASuite { class LDASuite extends MLTest with DefaultReadWriteTest { import testImplicits._ - - implicit val vectorEncoder = ExpressionEncoder[Vector]() + import Encoders._ val k: Int = 5 val vocabSize: Int = 30 From b2aa3c98808f734eaeb68014605f9d42089edc3f Mon Sep 17 00:00:00 2001 From: Sandor Murakozi Date: Mon, 9 Apr 2018 16:25:44 +0200 Subject: [PATCH 3/3] Removed duplicated import line --- .../org/apache/spark/ml/clustering/BisectingKMeansSuite.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala index e2a06e8a5b1a4..4de24d74b11f5 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala @@ -17,7 +17,6 @@ package org.apache.spark.ml.clustering -import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamMap
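
As a usage sketch of the testing pattern this series adopts: the MLTest helpers run a
fitted transformer over the input both as a batch DataFrame and as a streaming source,
so each converted suite exercises streaming as well as batch. The suite below is
hypothetical (invented suite name, toy data, k = 2), but testTransformer,
testTransformerByGlobalCheckFunc, the shared Encoders object, and the package-private
TestRow case class are the ones introduced or used in the patches above.

    package org.apache.spark.ml.clustering

    import org.apache.spark.ml.linalg.{Vector, Vectors}
    import org.apache.spark.ml.util.MLTest
    import org.apache.spark.sql.Row

    // Hypothetical suite illustrating the pattern; not part of the patch series.
    class ExampleClusteringSuite extends MLTest {

      import Encoders._       // implicit ExpressionEncoder[Vector] from the new helper object
      import testImplicits._  // toDF() on Seq of case classes, provided via MLTest

      test("predictions are valid cluster indices") {
        // TestRow(features: Vector) is the case class defined in KMeansSuite.scala.
        val df = Seq(
          TestRow(Vectors.dense(0.0, 0.0)),
          TestRow(Vectors.dense(0.1, 0.1)),
          TestRow(Vectors.dense(9.0, 9.0)),
          TestRow(Vectors.dense(9.1, 9.1))
        ).toDF()

        val model = new KMeans().setK(2).setSeed(1L).fit(df)

        // Per-row check: the helper transforms the input as a batch DataFrame and
        // as a streaming query, applying this function to every output row. The
        // Row's fields arrive in the order the columns are requested.
        testTransformer[Vector](df, model, "features", "prediction") {
          case Row(_, prediction: Int) =>
            assert(prediction >= 0 && prediction < 2)
        }

        // Global check: all output rows (batch and streaming) are collected and
        // verified together, which suits assertions over the whole result set.
        testTransformerByGlobalCheckFunc[Vector](df, model, "prediction") { rows =>
          assert(rows.map(_.getInt(0)).toSet.subsetOf(Set(0, 1)))
        }
      }
    }

testTransformer suits row-local assertions (as in the GaussianMixture and LDA changes),
while testTransformerByGlobalCheckFunc collects the full output first, which is why the
distinct-cluster assertions in the KMeans and BisectingKMeans suites moved into a global
check function.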