
Commit 27b3877

Author: Feynman Liang (committed)

Code review fixes

1 parent: 6bfb87c

File tree: 2 files changed (+31, -22 lines)


mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala

Lines changed: 22 additions & 15 deletions
@@ -186,7 +186,6 @@ abstract class LDAModel private[clustering] extends Saveable {
  * This model stores only the inferred topics.
  * It may be used for computing topics for new documents, but it may give less accurate answers
  * than the [[DistributedLDAModel]].
- *
  * @param topics Inferred topics (vocabSize x k matrix).
  */
 @Experimental
@@ -233,6 +232,7 @@ class LocalLDAModel private[clustering] (
       .sum()
     val batchVariationalBound = bound(documents, docConcentration,
       topicConcentration, topicsMatrix.toBreeze.toDenseMatrix, gammaShape, k, vocabSize)
+    println(s"bound: $batchVariationalBound")
     val perWordBound = batchVariationalBound / corpusWords
 
     perWordBound
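
Note: the per-word figure computed just below the added println is simply the corpus-level bound divided by the total token count. A minimal arithmetic sketch with illustrative values (none of these numbers come from the diff):

// Illustrative values only; in the real method they come from bound(...) and the corpus token count.
val batchVariationalBound = -15000.0  // corpus-level variational lower bound, in nats
val corpusWords = 5000.0              // total number of tokens across all documents
val perWordBound = batchVariationalBound / corpusWords  // -3.0 nats per token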
@@ -274,7 +274,7 @@ class LocalLDAModel private[clustering] (
 
       // E[log p(doc | theta, beta)]
       termCounts.foreachActive { case (idx, count) =>
-        docScore += LDAUtils.logSumExp(Elogthetad + Elogbeta(idx, ::).t)
+        docScore += count * LDAUtils.logSumExp(Elogthetad + Elogbeta(idx, ::).t)
       }
       // E[log p(theta | alpha) - log q(theta | gamma)]; assumes alpha is a vector
       docScore += sum((brzAlpha - gammad) :* Elogthetad)
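
Note: the small change above matters because a term that appears count times in a document must contribute count copies of its log-likelihood term to E[log p(doc | theta, beta)], not one. Below is a self-contained Breeze sketch of the corrected computation; logSumExp, docLogLikelihood, and the parameter names are stand-ins written from the diff alone, not the actual LDAUtils API.

import breeze.linalg.{max, DenseMatrix => BDM, DenseVector => BDV}

object WeightedDocScoreSketch {
  // Numerically stable log(sum(exp(x))), mirroring what LDAUtils.logSumExp is assumed to do.
  def logSumExp(x: BDV[Double]): Double = {
    val m = max(x)
    m + math.log(x.toArray.map(v => math.exp(v - m)).sum)
  }

  // E[log p(doc | theta, beta)]: each active term contributes count * logSumExp(...),
  // which is exactly the weighting restored by this commit.
  def docLogLikelihood(
      termCounts: Seq[(Int, Double)],    // active (term index, count) pairs of one document
      elogTheta: BDV[Double],            // E[log theta_d], length k
      elogBeta: BDM[Double]): Double = { // E[log beta], vocabSize x k
    termCounts.map { case (idx, count) =>
      count * logSumExp(elogTheta + elogBeta(idx, ::).t)
    }.sum
  }
}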
@@ -295,29 +295,36 @@ class LocalLDAModel private[clustering] (
   }
 
   /**
-   * Predicts the topic mixture distribution ("gamma") for a document. Returns a vector of zeros for
-   * an empty document.
+   * Predicts the topic mixture distribution for each document (often called "theta" in the
+   * literature). Returns a vector of zeros for an empty document.
+   *
+   * This uses a variational approximation following Hoffman et al. (2010), where the approximate
+   * distribution is called "gamma." Technically, this method returns this approximation "gamma"
+   * for each document.
    * @param documents documents to predict topic mixture distributions for
-   * @return topic mixture distributions for each document
+   * @return An RDD of (document ID, topic mixture distribution for document)
    */
   // TODO: declare in LDAModel and override once implemented in DistributedLDAModel
   def topicDistributions(documents: RDD[(Long, Vector)]): RDD[(Long, Vector)] = {
     // Double transpose because dirichletExpectation normalizes by row and we need to normalize
     // by topic (columns of lambda)
     val expElogbeta = exp(LDAUtils.dirichletExpectation(topicsMatrix.toBreeze.toDenseMatrix.t).t)
-    val topicConcentrationBrz = this.docConcentration.toBreeze
+    val docConcentrationBrz = this.docConcentration.toBreeze
     val gammaShape = this.gammaShape
     val k = this.k
 
-    documents.map { doc =>
-      if (doc._2.numNonzeros == 0) (doc._1, Vectors.zeros(k))
-      val (gamma, _) = OnlineLDAOptimizer.variationalTopicInference(
-        doc._2,
-        expElogbeta,
-        topicConcentrationBrz,
-        gammaShape,
-        k)
-      (doc._1, Vectors.dense(normalize(gamma, 1.0).toArray))
+    documents.map { case (id: Long, termCounts: Vector) =>
+      if (termCounts.numNonzeros == 0) {
+        (id, Vectors.zeros(k))
+      } else {
+        val (gamma, _) = OnlineLDAOptimizer.variationalTopicInference(
+          termCounts,
+          expElogbeta,
+          docConcentrationBrz,
+          gammaShape,
+          k)
+        (id, Vectors.dense(normalize(gamma, 1.0).toArray))
+      }
     }
   }
 
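
Note: a usage sketch for the refactored method, assuming an existing SparkContext sc and a trained LocalLDAModel named ldaModel (neither appears in the diff). Empty documents come back as the zero vector; non-empty ones as the normalized variational gamma.

import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD

val vocabSize = ldaModel.vocabSize  // assumed to be at least 4 for the indices below

// (document ID, term-count vector) pairs; the second document is deliberately empty.
val docs: RDD[(Long, Vector)] = sc.parallelize(Seq(
  (0L, Vectors.sparse(vocabSize, Array(0, 3), Array(2.0, 1.0))),
  (1L, Vectors.sparse(vocabSize, Array.empty[Int], Array.empty[Double]))))

val topicDists: RDD[(Long, Vector)] = ldaModel.topicDistributions(docs)
topicDists.collect().foreach { case (id, theta) =>
  // theta sums to 1 for non-empty documents and is all zeros for the empty one
  println(s"doc $id -> ${theta.toArray.mkString(", ")}")
}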

mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala

Lines changed: 9 additions & 7 deletions
@@ -331,15 +331,17 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
       (0, 0.99504), (1, 0.99504),
       (1, 0.99504), (1, 0.99504))
 
-    expectedPredictions.zip(
-      ldaModel.topicDistributions(docs).map { case (_, topics) =>
+    val actualPredictions = ldaModel.topicDistributions(docs).map { case (id, topics) =>
       // convert results to expectedPredictions format, which only has highest probability topic
       val topicsBz = topics.toBreeze.toDenseVector
-      (argmax(topicsBz), max(topicsBz))
-    }.collect())
-      .forall { case (expected, actual) =>
-        expected._1 === actual._1 && (expected._2 ~== actual._2 relTol 1E-3D)
-      }
+      (id, (argmax(topicsBz), max(topicsBz)))
+    }.sortByKey()
+      .values
+      .collect()
+
+    expectedPredictions.zip(actualPredictions).forall { case (expected, actual) =>
+      expected._1 === actual._1 && (expected._2 ~== actual._2 relTol 1E-3D)
+    }
   }
 
   test("OnlineLDAOptimizer with asymmetric prior") {
