
Commit 27b3877

Author: Feynman Liang (committed)

Code review fixes

1 parent: 6bfb87c

File tree: 2 files changed (+31, -22 lines)


mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala

Lines changed: 22 additions & 15 deletions
@@ -186,7 +186,6 @@ abstract class LDAModel private[clustering] extends Saveable {
  * This model stores only the inferred topics.
  * It may be used for computing topics for new documents, but it may give less accurate answers
  * than the [[DistributedLDAModel]].
- *
  * @param topics Inferred topics (vocabSize x k matrix).
  */
 @Experimental
@@ -233,6 +232,7 @@ class LocalLDAModel private[clustering] (
       .sum()
     val batchVariationalBound = bound(documents, docConcentration,
       topicConcentration, topicsMatrix.toBreeze.toDenseMatrix, gammaShape, k, vocabSize)
+    println(s"bound: $batchVariationalBound")
     val perWordBound = batchVariationalBound / corpusWords
 
     perWordBound
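
Note: the per-word figure computed just below the added println is simply the corpus-level bound divided by the total token count. A minimal arithmetic sketch with illustrative values (none of these numbers come from the diff):

// Illustrative values only; in the real method they come from bound(...) and the corpus token count.
val batchVariationalBound = -15000.0  // corpus-level variational lower bound, in nats
val corpusWords = 5000.0              // total number of tokens across all documents
val perWordBound = batchVariationalBound / corpusWords  // -3.0 nats per token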
@@ -274,7 +274,7 @@ class LocalLDAModel private[clustering] (
 
       // E[log p(doc | theta, beta)]
       termCounts.foreachActive { case (idx, count) =>
-        docScore += LDAUtils.logSumExp(Elogthetad + Elogbeta(idx, ::).t)
+        docScore += count * LDAUtils.logSumExp(Elogthetad + Elogbeta(idx, ::).t)
       }
       // E[log p(theta | alpha) - log q(theta | gamma)]; assumes alpha is a vector
       docScore += sum((brzAlpha - gammad) :* Elogthetad)
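
Note: the small change above matters because a term that appears count times in a document must contribute count copies of its log-likelihood term to E[log p(doc | theta, beta)], not one. Below is a self-contained Breeze sketch of the corrected computation; logSumExp, docLogLikelihood, and the parameter names are stand-ins written from the diff alone, not the actual LDAUtils API.

import breeze.linalg.{max, DenseMatrix => BDM, DenseVector => BDV}

object WeightedDocScoreSketch {
  // Numerically stable log(sum(exp(x))), mirroring what LDAUtils.logSumExp is assumed to do.
  def logSumExp(x: BDV[Double]): Double = {
    val m = max(x)
    m + math.log(x.toArray.map(v => math.exp(v - m)).sum)
  }

  // E[log p(doc | theta, beta)]: each active term contributes count * logSumExp(...),
  // which is exactly the weighting restored by this commit.
  def docLogLikelihood(
      termCounts: Seq[(Int, Double)],    // active (term index, count) pairs of one document
      elogTheta: BDV[Double],            // E[log theta_d], length k
      elogBeta: BDM[Double]): Double = { // E[log beta], vocabSize x k
    termCounts.map { case (idx, count) =>
      count * logSumExp(elogTheta + elogBeta(idx, ::).t)
    }.sum
  }
}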
@@ -295,29 +295,36 @@ class LocalLDAModel private[clustering] (
   }
 
   /**
-   * Predicts the topic mixture distribution ("gamma") for a document. Returns a vector of zeros for
-   * an empty document.
+   * Predicts the topic mixture distribution for each document (often called "theta" in the
+   * literature). Returns a vector of zeros for an empty document.
+   *
+   * This uses a variational approximation following Hoffman et al. (2010), where the approximate
+   * distribution is called "gamma." Technically, this method returns this approximation "gamma"
+   * for each document.
    * @param documents documents to predict topic mixture distributions for
-   * @return topic mixture distributions for each document
+   * @return An RDD of (document ID, topic mixture distribution for document)
    */
   // TODO: declare in LDAModel and override once implemented in DistributedLDAModel
   def topicDistributions(documents: RDD[(Long, Vector)]): RDD[(Long, Vector)] = {
     // Double transpose because dirichletExpectation normalizes by row and we need to normalize
     // by topic (columns of lambda)
     val expElogbeta = exp(LDAUtils.dirichletExpectation(topicsMatrix.toBreeze.toDenseMatrix.t).t)
-    val topicConcentrationBrz = this.docConcentration.toBreeze
+    val docConcentrationBrz = this.docConcentration.toBreeze
     val gammaShape = this.gammaShape
     val k = this.k
 
-    documents.map { doc =>
-      if (doc._2.numNonzeros == 0) (doc._1, Vectors.zeros(k))
-      val (gamma, _) = OnlineLDAOptimizer.variationalTopicInference(
-        doc._2,
-        expElogbeta,
-        topicConcentrationBrz,
-        gammaShape,
-        k)
-      (doc._1, Vectors.dense(normalize(gamma, 1.0).toArray))
+    documents.map { case (id: Long, termCounts: Vector) =>
+      if (termCounts.numNonzeros == 0) {
+        (id, Vectors.zeros(k))
+      } else {
+        val (gamma, _) = OnlineLDAOptimizer.variationalTopicInference(
+          termCounts,
+          expElogbeta,
+          docConcentrationBrz,
+          gammaShape,
+          k)
+        (id, Vectors.dense(normalize(gamma, 1.0).toArray))
+      }
     }
   }
 
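
Note: a usage sketch for the refactored method, assuming an existing SparkContext sc and a trained LocalLDAModel named ldaModel (neither appears in the diff). Empty documents come back as the zero vector; non-empty ones as the normalized variational gamma.

import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD

val vocabSize = ldaModel.vocabSize  // assumed to be at least 4 for the indices below

// (document ID, term-count vector) pairs; the second document is deliberately empty.
val docs: RDD[(Long, Vector)] = sc.parallelize(Seq(
  (0L, Vectors.sparse(vocabSize, Array(0, 3), Array(2.0, 1.0))),
  (1L, Vectors.sparse(vocabSize, Array.empty[Int], Array.empty[Double]))))

val topicDists: RDD[(Long, Vector)] = ldaModel.topicDistributions(docs)
topicDists.collect().foreach { case (id, theta) =>
  // theta sums to 1 for non-empty documents and is all zeros for the empty one
  println(s"doc $id -> ${theta.toArray.mkString(", ")}")
}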

mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala

Lines changed: 9 additions & 7 deletions
@@ -331,15 +331,17 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
       (0, 0.99504), (1, 0.99504),
       (1, 0.99504), (1, 0.99504))
 
-    expectedPredictions.zip(
-      ldaModel.topicDistributions(docs).map { case (_, topics) =>
+    val actualPredictions = ldaModel.topicDistributions(docs).map { case (id, topics) =>
       // convert results to expectedPredictions format, which only has highest probability topic
       val topicsBz = topics.toBreeze.toDenseVector
-      (argmax(topicsBz), max(topicsBz))
-    }.collect())
-      .forall { case (expected, actual) =>
-        expected._1 === actual._1 && (expected._2 ~== actual._2 relTol 1E-3D)
-      }
+      (id, (argmax(topicsBz), max(topicsBz)))
+    }.sortByKey()
+      .values
+      .collect()
+
+    expectedPredictions.zip(actualPredictions).forall { case (expected, actual) =>
+      expected._1 === actual._1 && (expected._2 ~== actual._2 relTol 1E-3D)
+    }
   }
 
   test("OnlineLDAOptimizer with asymmetric prior") {
