@@ -186,7 +186,6 @@ abstract class LDAModel private[clustering] extends Saveable {
  * This model stores only the inferred topics.
  * It may be used for computing topics for new documents, but it may give less accurate answers
  * than the [[DistributedLDAModel]].
- *
  * @param topics Inferred topics (vocabSize x k matrix).
  */
 @Experimental
@@ -233,6 +232,7 @@ class LocalLDAModel private[clustering] (
       .sum()
     val batchVariationalBound = bound(documents, docConcentration,
       topicConcentration, topicsMatrix.toBreeze.toDenseMatrix, gammaShape, k, vocabSize)
+    println(s"bound: $batchVariationalBound")
     val perWordBound = batchVariationalBound / corpusWords

     perWordBound
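For reference, the per-word bound computed above is just the corpus-level variational bound normalized by the total token count. Writing $n_{dw}$ for the count of word $w$ in document $d$, and $\mathcal{L}$ for the lower bound returned by `bound(...)`:

$$\text{perWordBound} = \frac{\mathcal{L}(\text{corpus})}{\sum_d \sum_w n_{dw}}$$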
@@ -274,7 +274,7 @@ class LocalLDAModel private[clustering] (

       // E[log p(doc | theta, beta)]
       termCounts.foreachActive { case (idx, count) =>
-        docScore += LDAUtils.logSumExp(Elogthetad + Elogbeta(idx, ::).t)
+        docScore += count * LDAUtils.logSumExp(Elogthetad + Elogbeta(idx, ::).t)
       }
       // E[log p(theta | alpha) - log q(theta | gamma)]; assumes alpha is a vector
       docScore += sum((brzAlpha - gammad) :* Elogthetad)
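The added `count` factor matters because this term of the bound sums over tokens, not distinct word types. In the notation of Hoffman et al. (2010), with $n_{dw}$ the count of word $w$ in document $d$, the quantity being accumulated is

$$\mathbb{E}_q[\log p(w_d \mid \theta_d, \beta)] = \sum_{w} n_{dw} \, \log \sum_{k=1}^{K} \exp\big(\mathbb{E}_q[\log \theta_{dk}] + \mathbb{E}_q[\log \beta_{kw}]\big),$$

where the inner log-sum-exp is what `LDAUtils.logSumExp` computes. Without the `count` weight, a word appearing ten times in a document contributed the same as a word appearing once.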
@@ -295,29 +295,36 @@ class LocalLDAModel private[clustering] (
   }

   /**
-   * Predicts the topic mixture distribution ("gamma") for a document. Returns a vector of zeros for
-   * an empty document.
+   * Predicts the topic mixture distribution for each document (often called "theta" in the
+   * literature). Returns a vector of zeros for an empty document.
+   *
+   * This uses a variational approximation following Hoffman et al. (2010), where the approximate
+   * distribution is called "gamma." Technically, this method returns this approximation "gamma"
+   * for each document.
    * @param documents documents to predict topic mixture distributions for
-   * @return topic mixture distributions for each document
+   * @return An RDD of (document ID, topic mixture distribution for document)
    */
   // TODO: declare in LDAModel and override once implemented in DistributedLDAModel
   def topicDistributions(documents: RDD[(Long, Vector)]): RDD[(Long, Vector)] = {
     // Double transpose because dirichletExpectation normalizes by row and we need to normalize
     // by topic (columns of lambda)
     val expElogbeta = exp(LDAUtils.dirichletExpectation(topicsMatrix.toBreeze.toDenseMatrix.t).t)
-    val topicConcentrationBrz = this.docConcentration.toBreeze
+    val docConcentrationBrz = this.docConcentration.toBreeze
     val gammaShape = this.gammaShape
     val k = this.k

-    documents.map { doc =>
-      if (doc._2.numNonzeros == 0) (doc._1, Vectors.zeros(k))
-      val (gamma, _) = OnlineLDAOptimizer.variationalTopicInference(
-        doc._2,
-        expElogbeta,
-        topicConcentrationBrz,
-        gammaShape,
-        k)
-      (doc._1, Vectors.dense(normalize(gamma, 1.0).toArray))
+    documents.map { case (id: Long, termCounts: Vector) =>
+      if (termCounts.numNonzeros == 0) {
+        (id, Vectors.zeros(k))
+      } else {
+        val (gamma, _) = OnlineLDAOptimizer.variationalTopicInference(
+          termCounts,
+          expElogbeta,
+          docConcentrationBrz,
+          gammaShape,
+          k)
+        (id, Vectors.dense(normalize(gamma, 1.0).toArray))
+      }
     }
   }

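For illustration, a minimal sketch of calling the reworked topicDistributions. The corpus, document IDs, and model value here are hypothetical; it assumes an already-trained LocalLDAModel named `model` and a live SparkContext `sc`:

import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD

// Hypothetical corpus over a 5-term vocabulary: (document ID, term-count vector).
val docs: RDD[(Long, Vector)] = sc.parallelize(Seq(
  (0L, Vectors.sparse(5, Seq((0, 2.0), (3, 1.0)))), // ordinary document
  (1L, Vectors.sparse(5, Seq()))                    // empty document
))

val theta: RDD[(Long, Vector)] = model.topicDistributions(docs)
theta.collect().foreach { case (id, dist) =>
  // dist sums to 1.0 for doc 0; the empty doc 1 comes back as a vector of zeros
  println(s"doc $id -> ${dist.toArray.mkString(", ")}")
}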