Commit dce7055

add save/load for k-means for SPARK-5986

1 parent 55b1b32

File tree

2 files changed: +109 −4 lines

mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala

Lines changed: 59 additions & 3 deletions
@@ -17,15 +17,22 @@
 
 package org.apache.spark.mllib.clustering
 
+import org.json4s._
+import org.json4s.JsonDSL._
+import org.json4s.jackson.JsonMethods._
+
+import org.apache.spark.mllib.linalg._
+import org.apache.spark.mllib.util.{Loader, Saveable}
+import org.apache.spark.mllib.util.Loader._
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.SparkContext
 import org.apache.spark.api.java.JavaRDD
 import org.apache.spark.rdd.RDD
-import org.apache.spark.SparkContext._
-import org.apache.spark.mllib.linalg.Vector
 
 /**
  * A clustering model for K-means. Each point belongs to the cluster with the closest center.
  */
-class KMeansModel (val clusterCenters: Array[Vector]) extends Serializable {
+class KMeansModel (val clusterCenters: Array[Vector]) extends Saveable with Serializable {
 
   /** Total number of clusters. */
   def k: Int = clusterCenters.length
@@ -58,4 +65,53 @@ class KMeansModel (val clusterCenters: Array[Vector]) extends Serializable {
 
   private def clusterCentersWithNorm: Iterable[VectorWithNorm] =
     clusterCenters.map(new VectorWithNorm(_))
+
+  override def save(sc: SparkContext, path: String): Unit = {
+    KMeansModel.SaveLoadV1_0.save(sc, this, path)
+  }
+
+  override protected def formatVersion: String = "1.0"
+}
+
+object KMeansModel extends Loader[KMeansModel] {
+  override def load(sc: SparkContext, path: String): KMeansModel = {
+    KMeansModel.SaveLoadV1_0.load(sc, path)
+  }
+
+  private[clustering]
+  object SaveLoadV1_0 {
+
+    private val thisFormatVersion = "1.0"
+
+    private[clustering]
+    val thisClassName = "org.apache.spark.mllib.clustering.KMeansModel"
+
+    /**
+     * Saves a [[KMeansModel]]: metadata (class name, format version, and k) is saved
+     * as JSON under `metadata/`, and the cluster centers are saved as Parquet under `data/`.
+     */
+    def save(sc: SparkContext, model: KMeansModel, path: String): Unit = {
+      val sqlContext = new SQLContext(sc)
+      val wrapper = new VectorUDT()
+      val metadata = compact(render(
+        ("class" -> thisClassName) ~ ("version" -> thisFormatVersion) ~ ("k" -> model.k)))
+      sc.parallelize(Seq(metadata), 1).saveAsTextFile(Loader.metadataPath(path))
+      val dataRDD = sc.parallelize(model.clusterCenters).map(wrapper.serialize)
+      sqlContext.createDataFrame(dataRDD, wrapper.sqlType).saveAsParquetFile(Loader.dataPath(path))
+    }
+
+    def load(sc: SparkContext, path: String): KMeansModel = {
+      implicit val formats = DefaultFormats
+      val sqlContext = new SQLContext(sc)
+      val wrapper = new VectorUDT()
+      val (className, formatVersion, metadata) = loadMetadata(sc, path)
+      assert(className == thisClassName)
+      assert(formatVersion == thisFormatVersion)
+      val k = (metadata \ "k").extract[Int]
+      val centroids = sqlContext.parquetFile(dataPath(path))
+      val localCentroids = centroids.collect()
+      assert(k == localCentroids.size)
+      new KMeansModel(localCentroids.map(wrapper.deserialize))
+    }
+  }
 }
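Not part of the diff: a minimal round-trip sketch of the API this commit adds, for context. It assumes a running SparkContext `sc`; the output path and the tiny dataset are illustrative.

    import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
    import org.apache.spark.mllib.linalg.Vectors

    // Train a small model, persist it, and read it back.
    val points = sc.parallelize(Seq(
      Vectors.dense(0.0, 0.0), Vectors.dense(1.0, 1.0), Vectors.dense(9.0, 9.0)))
    val model = KMeans.train(points, 2, 10)       // k = 2, maxIterations = 10

    model.save(sc, "/tmp/kmeans-model")           // writes metadata/ (JSON) and data/ (Parquet)
    val sameModel = KMeansModel.load(sc, "/tmp/kmeans-model")
    assert(model.k == sameModel.k)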

mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala

Lines changed: 50 additions & 1 deletion
@@ -21,7 +21,8 @@ import scala.util.Random
 
 import org.scalatest.FunSuite
 
-import org.apache.spark.mllib.linalg.{Vector, Vectors}
+import org.apache.spark.util.Utils
+import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
 import org.apache.spark.mllib.util.{LocalClusterSparkContext, MLlibTestSparkContext}
 import org.apache.spark.mllib.util.TestingUtils._
 
@@ -257,6 +258,54 @@ class KMeansSuite extends FunSuite with MLlibTestSparkContext {
       assert(predicts(0) != predicts(3))
     }
   }
+
+  test("model save/load") {
+    val tempDir = Utils.createTempDir()
+    val path = tempDir.toURI.toString
+
+    Array(true, false).foreach { selector =>
+      val model = KMeansSuite.createModel(10, 3, selector)
+      // Save model, load it back, and compare.
+      try {
+        model.save(sc, path)
+        val sameModel = KMeansModel.load(sc, path)
+        KMeansSuite.checkEqual(model, sameModel, selector)
+      } finally {
+        Utils.deleteRecursively(tempDir)
+      }
+    }
+  }
+}
+
+object KMeansSuite extends FunSuite {
+  def createModel(dim: Int, k: Int, isSparse: Boolean): KMeansModel = {
+    val singlePoint = isSparse match {
+      case true =>
+        Vectors.sparse(dim, Array.empty[Int], Array.empty[Double])
+      case _ =>
+        Vectors.dense(Array.fill[Double](dim)(0.0))
+    }
+    new KMeansModel(Array.fill[Vector](k)(singlePoint))
+  }
+
+  def checkEqual(a: KMeansModel, b: KMeansModel, isSparse: Boolean): Unit = {
+    assert(a.k === b.k)
+    isSparse match {
+      case true =>
+        a.clusterCenters.zip(b.clusterCenters).foreach { case (pointA, pointB) =>
+          assert(pointA.asInstanceOf[SparseVector].size === pointB.asInstanceOf[SparseVector].size)
+          assert(
+            pointA.asInstanceOf[SparseVector].indices === pointB.asInstanceOf[SparseVector].indices)
+          assert(
+            pointA.asInstanceOf[SparseVector].values === pointB.asInstanceOf[SparseVector].values)
+        }
+      case _ =>
+        a.clusterCenters.zip(b.clusterCenters).foreach { case (pointA, pointB) =>
+          assert(
+            pointA.asInstanceOf[DenseVector].toArray === pointB.asInstanceOf[DenseVector].toArray)
+        }
+    }
+  }
 }
 
 class KMeansClusterSuite extends FunSuite with LocalClusterSparkContext {
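For reference (not in the commit), the json4s DSL used in `SaveLoadV1_0.save` renders the model metadata as a single compact JSON line. A standalone sketch of what ends up under `metadata/`, assuming k = 3:

    import org.json4s.JsonDSL._
    import org.json4s.jackson.JsonMethods._

    // Same construction as in save(); prints:
    // {"class":"org.apache.spark.mllib.clustering.KMeansModel","version":"1.0","k":3}
    val metadata = compact(render(
      ("class" -> "org.apache.spark.mllib.clustering.KMeansModel") ~
        ("version" -> "1.0") ~ ("k" -> 3)))
    println(metadata)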
