
Commit 6e3cf05

Modify describeTopics to take advantage of DataFrame serialization

1 parent f10574e · commit 6e3cf05

File tree

2 files changed: +19 −18 lines changed

mllib/src/main/scala/org/apache/spark/mllib/api/python/LDAModelWrapper.scala

Lines changed: 11 additions & 8 deletions

```diff
@@ -16,11 +16,11 @@
  */
 package org.apache.spark.mllib.api.python
 
-import scala.collection.JavaConverters
-
 import org.apache.spark.SparkContext
+import org.apache.spark.api.java.JavaSparkContext
 import org.apache.spark.mllib.clustering.LDAModel
 import org.apache.spark.mllib.linalg.Matrix
+import org.apache.spark.sql.{DataFrame, SQLContext}
 
 /**
  * Wrapper around LDAModel to provide helper methods in Python
@@ -31,14 +31,17 @@ private[python] class LDAModelWrapper(model: LDAModel) {
 
   def vocabSize(): Int = model.vocabSize
 
-  def describeTopics(): java.util.List[Array[Any]] = describeTopics(this.model.vocabSize)
+  def describeTopics(jsc: JavaSparkContext): DataFrame = describeTopics(this.model.vocabSize, jsc)
 
-  def describeTopics(maxTermsPerTopic: Int): java.util.List[Array[Any]] = {
+  def describeTopics(maxTermsPerTopic: Int, jsc: JavaSparkContext): DataFrame = {
+    val sqlContext = new SQLContext(jsc.sc)
+    import sqlContext.implicits._
 
-    val seq = model.describeTopics(maxTermsPerTopic).map { case (terms, termWeights) =>
-      Array.empty[Any] ++ terms ++ termWeights
-    }.toSeq
-    JavaConverters.seqAsJavaListConverter(seq).asJava
+    // Since the return value of `describeTopics` is a little complicated,
+    // it is converted to `Row`s to take advantage of DataFrame serialization.
+    val topics = model.describeTopics(maxTermsPerTopic)
+    val rdd = jsc.sc.parallelize(topics)
+    rdd.toDF("terms", "termWeights")
   }
 
   def save(sc: SparkContext, path: String): Unit = model.save(sc, path)
```
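Conceptually, the wrapper now ships each `(terms, termWeights)` pair to Python as a named DataFrame row instead of a flattened Java list. A minimal PySpark sketch of the same round trip, using hypothetical topic data and a local SparkContext (none of these values come from the commit):

```python
from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext("local[2]", "describeTopics-sketch")
sqlContext = SQLContext(sc)

# Hypothetical output of LDAModel.describeTopics: one (terms, termWeights)
# pair per topic, with term indices and their weights kept together.
topics = [([0, 1, 2], [0.5, 0.3, 0.2]),
          ([2, 0, 1], [0.6, 0.25, 0.15])]

# Mirror of the Scala side's rdd.toDF("terms", "termWeights"): name the two
# columns and let DataFrame serialization carry the nested arrays to Python.
df = sqlContext.createDataFrame(topics, ["terms", "termWeights"])
for row in df.collect():
    print(row["terms"], row["termWeights"])

sc.stop()
```

Keeping terms and weights in separate named columns avoids the old split-a-flat-array-in-half decoding on the Python side.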

python/pyspark/mllib/clustering.py

Lines changed: 8 additions & 10 deletions

```diff
@@ -741,20 +741,18 @@ def describeTopics(self, maxTermsPerTopic=None):
         WARNING: If vocabSize and k are large, this can return a large object!
         """
         if maxTermsPerTopic is None:
-            topics = self.call("describeTopics")
+            df = self.call("describeTopics", self._sc)
         else:
-            topics = self.call("describeTopics", maxTermsPerTopic)
+            df = self.call("describeTopics", maxTermsPerTopic, self._sc)
 
-        # Converts the result to make the format similar to Scala.
-        # The returned value is mixed up with topics and topic weights.
         converted = []
-        for elms in [list(elms) for elms in topics]:
-            half_len = int(len(elms) / 2)
-            topics = elms[:half_len]
-            topicWeights = elms[(-1 * half_len):]
-            if len(topics) != len(topicWeights):
+        rows = df.collect()
+        for row in rows:
+            terms = row["terms"]
+            termWeights = row["termWeights"]
+            if len(terms) != len(termWeights):
                 raise TypeError("Something wrong with a return value: %s" % (topics))
-            converted.append((topics, topicWeights))
+            converted.append((terms, termWeights))
         return converted
 
     @classmethod
```
