
Commit 0bc114e

Revert "Modify describeTopics to take advantage of DataFrame serialization"
This reverts commit 6e3cf05.
1 parent 89cbd77 commit 0bc114e

File tree

2 files changed: 18 additions (+), 19 deletions (−)


mllib/src/main/scala/org/apache/spark/mllib/api/python/LDAModelWrapper.scala

Lines changed: 8 additions & 11 deletions
@@ -16,11 +16,11 @@
  */
 package org.apache.spark.mllib.api.python
 
+import scala.collection.JavaConverters
+
 import org.apache.spark.SparkContext
-import org.apache.spark.api.java.JavaSparkContext
 import org.apache.spark.mllib.clustering.LDAModel
 import org.apache.spark.mllib.linalg.Matrix
-import org.apache.spark.sql.{DataFrame, SQLContext}
 
 /**
  * Wrapper around LDAModel to provide helper methods in Python
@@ -31,17 +31,14 @@ private[python] class LDAModelWrapper(model: LDAModel) {
 
   def vocabSize(): Int = model.vocabSize
 
-  def describeTopics(jsc: JavaSparkContext): DataFrame = describeTopics(this.model.vocabSize, jsc)
+  def describeTopics(): java.util.List[Array[Any]] = describeTopics(this.model.vocabSize)
 
-  def describeTopics(maxTermsPerTopic: Int, jsc: JavaSparkContext): DataFrame = {
-    val sqlContext = new SQLContext(jsc.sc)
-    import sqlContext.implicits._
+  def describeTopics(maxTermsPerTopic: Int): java.util.List[Array[Any]] = {
 
-    // Since the return value of `describeTopics` is a little complicated,
-    // the return value is converted to `Row` to take advantage of DataFrame serialization.
-    val topics = model.describeTopics(maxTermsPerTopic)
-    val rdd = jsc.sc.parallelize(topics)
-    rdd.toDF("terms", "termWeights")
+    val seq = model.describeTopics(maxTermsPerTopic).map { case (terms, termWeights) =>
+      Array.empty[Any] ++ terms ++ termWeights
+    }.toSeq
+    JavaConverters.seqAsJavaListConverter(seq).asJava
   }
 
   def save(sc: SparkContext, path: String): Unit = model.save(sc, path)
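For context: after this revert, the Scala wrapper serializes each (terms, termWeights) tuple by flattening it into a single Array[Any] — term indices first, weights second — so the result can cross the Py4J boundary as a plain java.util.List instead of going through a SQLContext and DataFrame. A minimal Python sketch of that flattened layout (illustrative only; the actual flattening happens in the Scala code above):

# Hypothetical illustration of the per-topic wire format produced by the
# Scala wrapper: one flat list per topic, term indices first, weights second.
terms = [1, 5, 2]              # top term indices for one topic
termWeights = [0.4, 0.3, 0.1]  # the matching weights
flattened = list(terms) + list(termWeights)
assert flattened == [1, 5, 2, 0.4, 0.3, 0.1]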

python/pyspark/mllib/clustering.py

Lines changed: 10 additions & 8 deletions
@@ -741,18 +741,20 @@ def describeTopics(self, maxTermsPerTopic=None):
         WARNING: If vocabSize and k are large, this can return a large object!
         """
         if maxTermsPerTopic is None:
-            df = self.call("describeTopics", self._sc)
+            topics = self.call("describeTopics")
         else:
-            df = self.call("describeTopics", maxTermsPerTopic, self._sc)
+            topics = self.call("describeTopics", maxTermsPerTopic)
 
+        # Converts the result to make the format similar to Scala.
+        # The returned value is mixed up with topics and topic weights.
         converted = []
-        rows = df.collect()
-        for row in df.collect():
-            terms = row["terms"]
-            termWeights = row["termWeights"]
-            if len(terms) != len(termWeights):
+        for elms in [list(elms) for elms in topics]:
+            half_len = int(len(elms) / 2)
+            topics = elms[:half_len]
+            topicWeights = elms[(-1 * half_len):]
+            if len(topics) != len(topicWeights):
                 raise TypeError("Something wrong with a return value: %s" % (topics))
-            converted.append((terms, termWeights))
+            converted.append((topics, topicWeights))
         return converted
 
     @classmethod
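On the Python side, the revert restores the mirror-image conversion: each flat list coming back over Py4J is split at its midpoint to recover the (topics, topicWeights) pair. A small self-contained sketch of that split, assuming — as the Scala wrapper guarantees — that every list holds equally many term indices and weights:

def split_topic(elms):
    # The flattened list is [term_1, ..., term_n, weight_1, ..., weight_n],
    # so the midpoint separates term indices from weights.
    half_len = len(elms) // 2
    terms, weights = elms[:half_len], elms[half_len:]
    if len(terms) != len(weights):  # can only happen for odd-length input
        raise TypeError("Something wrong with a return value: %s" % elms)
    return terms, weights

print(split_topic([1, 5, 2, 0.4, 0.3, 0.1]))  # ([1, 5, 2], [0.4, 0.3, 0.1])

Note that for even-length input, the slice elms[(-1 * half_len):] used in clustering.py and elms[half_len:] above select the same elements. The trade made by this revert is a simpler dependency surface — no SQLContext is needed inside the wrapper — at the cost of this manual flatten-and-split step.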
