
Commit 3ea051a

add read/write to CountVectorizer
1 parent 7b62e6c commit 3ea051a
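Summary (from the diff below): this commit wires the ml persistence API into CountVectorizer. The estimator gets a stock DefaultParamsWriter/DefaultParamsReader pair, while CountVectorizerModel gets a custom writer/reader that also persists the fitted vocabulary. It additionally fixes a copy-pasted class name in IDFModelReader, drops trailing blank lines in MinMaxScaler, and adds read/write and params tests.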

File tree: 5 files changed, +97 -10 lines changed

mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala

Lines changed: 66 additions & 6 deletions
@@ -16,17 +16,19 @@
  */
 package org.apache.spark.ml.feature
 
-import org.apache.spark.annotation.Experimental
+import org.apache.hadoop.fs.Path
+
+import org.apache.spark.annotation.{Experimental, Since}
 import org.apache.spark.broadcast.Broadcast
+import org.apache.spark.ml.{Estimator, Model}
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
-import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
-import org.apache.spark.ml.{Estimator, Model}
+import org.apache.spark.ml.util._
 import org.apache.spark.mllib.linalg.{VectorUDT, Vectors}
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.DataFrame
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types._
-import org.apache.spark.sql.DataFrame
 import org.apache.spark.util.collection.OpenHashMap
 
 /**
@@ -105,7 +107,7 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
  */
 @Experimental
 class CountVectorizer(override val uid: String)
-  extends Estimator[CountVectorizerModel] with CountVectorizerParams {
+  extends Estimator[CountVectorizerModel] with CountVectorizerParams with Writable {
 
   def this() = this(Identifiable.randomUID("cntVec"))
 
@@ -169,6 +171,19 @@ class CountVectorizer(override val uid: String)
   }
 
   override def copy(extra: ParamMap): CountVectorizer = defaultCopy(extra)
+
+  @Since("1.6.0")
+  override def write: Writer = new DefaultParamsWriter(this)
+}
+
+@Since("1.6.0")
+object CountVectorizer extends Readable[CountVectorizer] {
+
+  @Since("1.6.0")
+  override def read: Reader[CountVectorizer] = new DefaultParamsReader
+
+  @Since("1.6.0")
+  override def load(path: String): CountVectorizer = super.load(path)
 }
 
 /**
@@ -178,7 +193,9 @@ class CountVectorizer(override val uid: String)
  */
 @Experimental
 class CountVectorizerModel(override val uid: String, val vocabulary: Array[String])
-  extends Model[CountVectorizerModel] with CountVectorizerParams {
+  extends Model[CountVectorizerModel] with CountVectorizerParams with Writable {
+
+  import CountVectorizerModel._
 
   def this(vocabulary: Array[String]) = {
     this(Identifiable.randomUID("cntVecModel"), vocabulary)
@@ -232,4 +249,47 @@ class CountVectorizerModel(override val uid: String, val vocabulary: Array[Strin
     val copied = new CountVectorizerModel(uid, vocabulary).setParent(parent)
     copyValues(copied, extra)
   }
+
+  @Since("1.6.0")
+  override def write: Writer = new CountVectorizerModelWriter(this)
+}
+
+@Since("1.6.0")
+object CountVectorizerModel extends Readable[CountVectorizerModel] {
+
+  private[CountVectorizerModel]
+  class CountVectorizerModelWriter(instance: CountVectorizerModel) extends Writer {
+
+    private case class Data(vocabulary: Seq[String])
+
+    override protected def saveImpl(path: String): Unit = {
+      DefaultParamsWriter.saveMetadata(instance, path, sc)
+      val data = Data(instance.vocabulary)
+      val dataPath = new Path(path, "data").toString
+      sqlContext.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath)
+    }
+  }
+
+  private class CountVectorizerModelReader extends Reader[CountVectorizerModel] {
+
+    private val className = "org.apache.spark.ml.feature.CountVectorizerModel"
+
+    override def load(path: String): CountVectorizerModel = {
+      val metadata = DefaultParamsReader.loadMetadata(path, sc, className)
+      val dataPath = new Path(path, "data").toString
+      val data = sqlContext.read.parquet(dataPath)
+        .select("vocabulary")
+        .head()
+      val vocabulary = data.getAs[Seq[String]](0).toArray
+      val model = new CountVectorizerModel(metadata.uid, vocabulary)
+      DefaultParamsReader.getAndSetParams(model, metadata)
+      model
+    }
+  }
+
+  @Since("1.6.0")
+  override def read: Reader[CountVectorizerModel] = new CountVectorizerModelReader
+
+  @Since("1.6.0")
+  override def load(path: String): CountVectorizerModel = super.load(path)
 }
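On disk this follows the shared ml persistence layout: DefaultParamsWriter.saveMetadata records the params as JSON at the root of the save path, and the vocabulary goes into a single-row Parquet file under data/. A minimal usage sketch of the new API (a sketch only: the toy data, column names, and /tmp path are hypothetical, and a spark-shell-style sqlContext is assumed in scope):

import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}

// Toy corpus: each row carries a bag of tokens.
val df = sqlContext.createDataFrame(Seq(
  (0, Array("a", "b", "c")),
  (1, Array("a", "b", "b", "c", "a"))
)).toDF("id", "words")

val model: CountVectorizerModel = new CountVectorizer()
  .setInputCol("words")
  .setOutputCol("features")
  .setVocabSize(3)
  .fit(df)

// Round trip through the new Writable/Readable API.
model.write.overwrite().save("/tmp/cvModel")   // overwrite() assumed from ml.util.Writer
val loaded = CountVectorizerModel.load("/tmp/cvModel")
assert(loaded.vocabulary.sameElements(model.vocabulary))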

mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala

Lines changed: 1 addition & 1 deletion
@@ -160,7 +160,7 @@ object IDFModel extends Readable[IDFModel] {
 
   private class IDFModelReader extends Reader[IDFModel] {
 
-    private val className = "org.apache.spark.ml.feature.StringIndexerModel"
+    private val className = "org.apache.spark.ml.feature.IDFModel"
 
     override def load(path: String): IDFModel = {
       val metadata = DefaultParamsReader.loadMetadata(path, sc, className)
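Worth noting: this one-liner fixes a copy-paste bug from the earlier persistence patch. DefaultParamsReader.loadMetadata validates the class name recorded in the saved metadata against the expected one, so with the old "StringIndexerModel" string, IDFModel.load would have rejected every model that IDFModel itself had saved.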

mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala

Lines changed: 0 additions & 2 deletions
@@ -236,5 +236,3 @@ object MinMaxScalerModel extends Readable[MinMaxScalerModel] {
   @Since("1.6.0")
   override def load(path: String): MinMaxScalerModel = super.load(path)
 }
-
-

mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala

Lines changed: 23 additions & 1 deletion
@@ -18,14 +18,17 @@ package org.apache.spark.ml.feature
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.ml.param.ParamsSuite
+import org.apache.spark.ml.util.DefaultReadWriteTest
 import org.apache.spark.mllib.linalg.{Vector, Vectors}
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
 import org.apache.spark.sql.Row
 
-class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext {
+class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext
+  with DefaultReadWriteTest {
 
   test("params") {
+    ParamsSuite.checkParams(new CountVectorizer)
     ParamsSuite.checkParams(new CountVectorizerModel(Array("empty")))
   }
 
@@ -164,4 +167,23 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext {
       assert(features ~== expected absTol 1e-14)
     }
   }
+
+  test("CountVectorizer read/write") {
+    val t = new CountVectorizer()
+      .setInputCol("myInputCol")
+      .setOutputCol("myOutputCol")
+      .setMinDF(0.5)
+      .setMinTF(3.0)
+      .setVocabSize(10)
+    testDefaultReadWrite(t)
+  }
+
+  test("CountVectorizerModel read/write") {
+    val instance = new CountVectorizerModel("myCountVectorizerModel", Array("a", "b", "c"))
+      .setInputCol("myInputCol")
+      .setOutputCol("myOutputCol")
+      .setMinTF(3.0)
+    val newInstance = testDefaultReadWrite(instance)
+    assert(newInstance.vocabulary === instance.vocabulary)
+  }
 }
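For intuition, roughly what testDefaultReadWrite exercises in the model case, hand-rolled (a sketch under assumptions: the real helper from DefaultReadWriteTest also checks param round-tripping and overwrite semantics against a managed temp directory, not the hypothetical /tmp path used here):

val instance = new CountVectorizerModel("myCountVectorizerModel", Array("a", "b", "c"))
  .setInputCol("myInputCol")
  .setOutputCol("myOutputCol")
  .setMinTF(3.0)

val path = "/tmp/cvModelRoundTrip"   // hypothetical; the helper uses a temp dir
instance.write.overwrite().save(path)
val loaded = CountVectorizerModel.load(path)

assert(loaded.uid === instance.uid)
assert(loaded.getMinTF === instance.getMinTF)
assert(loaded.vocabulary === instance.vocabulary)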

mllib/src/test/scala/org/apache/spark/ml/feature/StandardScalerSuite.scala

Lines changed: 7 additions & 0 deletions
@@ -19,6 +19,7 @@ package org.apache.spark.ml.feature
 
 
 import org.apache.spark.SparkFunSuite
+import org.apache.spark.ml.param.ParamsSuite
 import org.apache.spark.ml.util.DefaultReadWriteTest
 import org.apache.spark.mllib.feature
 import org.apache.spark.mllib.linalg.{Vector, Vectors}
@@ -67,6 +68,12 @@ class StandardScalerSuite extends SparkFunSuite with MLlibTestSparkContext
     }
   }
 
+  test("params") {
+    ParamsSuite.checkParams(new StandardScaler)
+    val oldModel = new feature.StandardScalerModel(Vectors.dense(1.0), Vectors.dense(2.0))
+    ParamsSuite.checkParams(new StandardScalerModel("empty", oldModel))
+  }
+
   test("Standardization with default parameter") {
     val df0 = sqlContext.createDataFrame(data.zip(resWithStd)).toDF("features", "expected")
 