
Commit a8340fa

Feynman Liang authored and jkbradley committed

[SPARK-9481] Add logLikelihood to LocalLDAModel

Exposes `bound` (the variational log likelihood bound) through the public API as `logLikelihood`. Also adds unit tests, DRYs up `LDASuite`, and includes the unit tests mentioned in #7760.

Author: Feynman Liang <[email protected]>

Closes #7801 from feynmanliang/SPARK-9481-logLikelihood and squashes the following commits:

6d1b2c9 [Feynman Liang] Negate perplexity definition
5f62b20 [Feynman Liang] Add logLikelihood

1 parent d046347 · commit a8340fa
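For orientation, here is a minimal sketch of how the newly exposed API can be called once a trained LocalLDAModel is in hand. The helper name `reportBounds` and the surrounding setup are illustrative, not part of this commit; the two method signatures come from the diff below.

    import org.apache.spark.mllib.clustering.LocalLDAModel
    import org.apache.spark.mllib.linalg.Vector
    import org.apache.spark.rdd.RDD

    // Hypothetical helper: evaluate a trained model on a held-out corpus of
    // (docId, termCounts) pairs using the two bounds this commit exposes.
    def reportBounds(model: LocalLDAModel, corpus: RDD[(Long, Vector)]): Unit = {
      val ll = model.logLikelihood(corpus)  // variational lower bound on corpus log likelihood
      val lp = model.logPerplexity(corpus)  // variational upper bound on per-word log perplexity
      println(s"logLikelihood = $ll, logPerplexity = $lp")
    }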

File tree

2 files changed: +78 −71 lines


mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala

Lines changed: 13 additions & 7 deletions
@@ -217,22 +217,28 @@ class LocalLDAModel private[clustering] (
     LocalLDAModel.SaveLoadV1_0.save(sc, path, topicsMatrix, docConcentration, topicConcentration,
       gammaShape)
   }
-  // TODO
-  // override def logLikelihood(documents: RDD[(Long, Vector)]): Double = ???
+
+  // TODO: declare in LDAModel and override once implemented in DistributedLDAModel
+  /**
+   * Calculates a lower bound on the log likelihood of the entire corpus.
+   * @param documents test corpus to use for calculating log likelihood
+   * @return variational lower bound on the log likelihood of the entire corpus
+   */
+  def logLikelihood(documents: RDD[(Long, Vector)]): Double = bound(documents,
+    docConcentration, topicConcentration, topicsMatrix.toBreeze.toDenseMatrix, gammaShape, k,
+    vocabSize)
 
   /**
-   * Calculate the log variational bound on perplexity. See Equation (16) in original Online
+   * Calculate an upper bound on perplexity. See Equation (16) in original Online
    * LDA paper.
    * @param documents test corpus to use for calculating perplexity
-   * @return the log perplexity per word
+   * @return variational upper bound on log perplexity per word
    */
   def logPerplexity(documents: RDD[(Long, Vector)]): Double = {
     val corpusWords = documents
       .map { case (_, termCounts) => termCounts.toArray.sum }
       .sum()
-    val batchVariationalBound = bound(documents, docConcentration,
-      topicConcentration, topicsMatrix.toBreeze.toDenseMatrix, gammaShape, k, vocabSize)
-    val perWordBound = batchVariationalBound / corpusWords
+    val perWordBound = -logLikelihood(documents) / corpusWords
 
     perWordBound
   }
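The refactored logPerplexity is now defined directly in terms of logLikelihood: negate the corpus log likelihood bound and normalize by the total word count. A small illustrative check of the sign convention follows; the numbers are hypothetical, chosen only to be consistent with the toy corpus used in the test suite below.

    // Illustrative only: 6 toy documents x 2 words = 12 corpus words, and a
    // hypothetical corpus log likelihood bound near -44.3.
    val logLikelihoodBound = -44.29
    val corpusWords = 12.0
    val perWordBound = -logLikelihoodBound / corpusWords  // ~3.69, non-negative
    assert(perWordBound > 0)  // matches the positive 3.690D expected in LDASuite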

mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala

Lines changed: 65 additions & 64 deletions
@@ -210,16 +210,7 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
   }
 
   test("OnlineLDAOptimizer with toy data") {
-    def toydata: Array[(Long, Vector)] = Array(
-      Vectors.sparse(6, Array(0, 1), Array(1, 1)),
-      Vectors.sparse(6, Array(1, 2), Array(1, 1)),
-      Vectors.sparse(6, Array(0, 2), Array(1, 1)),
-      Vectors.sparse(6, Array(3, 4), Array(1, 1)),
-      Vectors.sparse(6, Array(3, 5), Array(1, 1)),
-      Vectors.sparse(6, Array(4, 5), Array(1, 1))
-    ).zipWithIndex.map { case (wordCounts, docId) => (docId.toLong, wordCounts) }
-
-    val docs = sc.parallelize(toydata)
+    val docs = sc.parallelize(toyData)
     val op = new OnlineLDAOptimizer().setMiniBatchFraction(1).setTau0(1024).setKappa(0.51)
       .setGammaShape(1e10)
     val lda = new LDA().setK(2)
@@ -242,30 +233,45 @@
     }
   }
 
-  test("LocalLDAModel logPerplexity") {
-    val k = 2
-    val vocabSize = 6
-    val alpha = 0.01
-    val eta = 0.01
-    val gammaShape = 100
-    // obtained from LDA model trained in gensim, see below
-    val topics = new DenseMatrix(numRows = vocabSize, numCols = k, values = Array(
-      1.86738052, 1.94056535, 1.89981687, 0.0833265, 0.07405918, 0.07940597,
-      0.15081551, 0.08637973, 0.12428538, 1.9474897, 1.94615165, 1.95204124))
+  test("LocalLDAModel logLikelihood") {
+    val ldaModel: LocalLDAModel = toyModel
 
-    def toydata: Array[(Long, Vector)] = Array(
-      Vectors.sparse(6, Array(0, 1), Array(1, 1)),
-      Vectors.sparse(6, Array(1, 2), Array(1, 1)),
-      Vectors.sparse(6, Array(0, 2), Array(1, 1)),
-      Vectors.sparse(6, Array(3, 4), Array(1, 1)),
-      Vectors.sparse(6, Array(3, 5), Array(1, 1)),
-      Vectors.sparse(6, Array(4, 5), Array(1, 1))
-    ).zipWithIndex.map { case (wordCounts, docId) => (docId.toLong, wordCounts) }
-    val docs = sc.parallelize(toydata)
+    val docsSingleWord = sc.parallelize(Array(Vectors.sparse(6, Array(0), Array(1)))
+      .zipWithIndex
+      .map { case (wordCounts, docId) => (docId.toLong, wordCounts) })
+    val docsRepeatedWord = sc.parallelize(Array(Vectors.sparse(6, Array(0), Array(5)))
+      .zipWithIndex
+      .map { case (wordCounts, docId) => (docId.toLong, wordCounts) })
 
+    /* Verify results using gensim:
+       import numpy as np
+       from gensim import models
+       corpus = [
+         [(0, 1.0), (1, 1.0)],
+         [(1, 1.0), (2, 1.0)],
+         [(0, 1.0), (2, 1.0)],
+         [(3, 1.0), (4, 1.0)],
+         [(3, 1.0), (5, 1.0)],
+         [(4, 1.0), (5, 1.0)]]
+       np.random.seed(2345)
+       lda = models.ldamodel.LdaModel(
+         corpus=corpus, alpha=0.01, eta=0.01, num_topics=2, update_every=0, passes=100,
+         decay=0.51, offset=1024)
+       docsSingleWord = [[(0, 1.0)]]
+       docsRepeatedWord = [[(0, 5.0)]]
+       print(lda.bound(docsSingleWord))
+       > -25.9706969833
+       print(lda.bound(docsRepeatedWord))
+       > -31.4413908227
+     */
 
-    val ldaModel: LocalLDAModel = new LocalLDAModel(
-      topics, Vectors.dense(Array.fill(k)(alpha)), eta, gammaShape)
+    assert(ldaModel.logLikelihood(docsSingleWord) ~== -25.971 relTol 1E-3D)
+    assert(ldaModel.logLikelihood(docsRepeatedWord) ~== -31.441 relTol 1E-3D)
+  }
+
+  test("LocalLDAModel logPerplexity") {
+    val docs = sc.parallelize(toyData)
+    val ldaModel: LocalLDAModel = toyModel
 
     /* Verify results using gensim:
        import numpy as np
@@ -285,32 +291,13 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
     > -3.69051285096
     */
 
-    assert(ldaModel.logPerplexity(docs) ~== -3.690D relTol 1E-3D)
+    // Gensim's perplexity is the negative of our (and Stanford NLP's) definition
+    assert(ldaModel.logPerplexity(docs) ~== 3.690D relTol 1E-3D)
   }
 
   test("LocalLDAModel predict") {
-    val k = 2
-    val vocabSize = 6
-    val alpha = 0.01
-    val eta = 0.01
-    val gammaShape = 100
-    // obtained from LDA model trained in gensim, see below
-    val topics = new DenseMatrix(numRows = vocabSize, numCols = k, values = Array(
-      1.86738052, 1.94056535, 1.89981687, 0.0833265, 0.07405918, 0.07940597,
-      0.15081551, 0.08637973, 0.12428538, 1.9474897, 1.94615165, 1.95204124))
-
-    def toydata: Array[(Long, Vector)] = Array(
-      Vectors.sparse(6, Array(0, 1), Array(1, 1)),
-      Vectors.sparse(6, Array(1, 2), Array(1, 1)),
-      Vectors.sparse(6, Array(0, 2), Array(1, 1)),
-      Vectors.sparse(6, Array(3, 4), Array(1, 1)),
-      Vectors.sparse(6, Array(3, 5), Array(1, 1)),
-      Vectors.sparse(6, Array(4, 5), Array(1, 1))
-    ).zipWithIndex.map { case (wordCounts, docId) => (docId.toLong, wordCounts) }
-    val docs = sc.parallelize(toydata)
-
-    val ldaModel: LocalLDAModel = new LocalLDAModel(
-      topics, Vectors.dense(Array.fill(k)(alpha)), eta, gammaShape)
+    val docs = sc.parallelize(toyData)
+    val ldaModel: LocalLDAModel = toyModel
 
     /* Verify results using gensim:
        import numpy as np
@@ -351,16 +338,7 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
   }
 
   test("OnlineLDAOptimizer with asymmetric prior") {
-    def toydata: Array[(Long, Vector)] = Array(
-      Vectors.sparse(6, Array(0, 1), Array(1, 1)),
-      Vectors.sparse(6, Array(1, 2), Array(1, 1)),
-      Vectors.sparse(6, Array(0, 2), Array(1, 1)),
-      Vectors.sparse(6, Array(3, 4), Array(1, 1)),
-      Vectors.sparse(6, Array(3, 5), Array(1, 1)),
-      Vectors.sparse(6, Array(4, 5), Array(1, 1))
-    ).zipWithIndex.map { case (wordCounts, docId) => (docId.toLong, wordCounts) }
-
-    val docs = sc.parallelize(toydata)
+    val docs = sc.parallelize(toyData)
     val op = new OnlineLDAOptimizer().setMiniBatchFraction(1).setTau0(1024).setKappa(0.51)
       .setGammaShape(1e10)
     val lda = new LDA().setK(2)
@@ -531,4 +509,27 @@ private[clustering] object LDASuite {
   def getNonEmptyDoc(corpus: Array[(Long, Vector)]): Array[(Long, Vector)] = corpus.filter {
     case (_, wc: Vector) => Vectors.norm(wc, p = 1.0) != 0.0
   }
+
+  def toyData: Array[(Long, Vector)] = Array(
+    Vectors.sparse(6, Array(0, 1), Array(1, 1)),
+    Vectors.sparse(6, Array(1, 2), Array(1, 1)),
+    Vectors.sparse(6, Array(0, 2), Array(1, 1)),
+    Vectors.sparse(6, Array(3, 4), Array(1, 1)),
+    Vectors.sparse(6, Array(3, 5), Array(1, 1)),
+    Vectors.sparse(6, Array(4, 5), Array(1, 1))
+  ).zipWithIndex.map { case (wordCounts, docId) => (docId.toLong, wordCounts) }
+
+  def toyModel: LocalLDAModel = {
+    val k = 2
+    val vocabSize = 6
+    val alpha = 0.01
+    val eta = 0.01
+    val gammaShape = 100
+    val topics = new DenseMatrix(numRows = vocabSize, numCols = k, values = Array(
+      1.86738052, 1.94056535, 1.89981687, 0.0833265, 0.07405918, 0.07940597,
+      0.15081551, 0.08637973, 0.12428538, 1.9474897, 1.94615165, 1.95204124))
+    val ldaModel: LocalLDAModel = new LocalLDAModel(
+      topics, Vectors.dense(Array.fill(k)(alpha)), eta, gammaShape)
+    ldaModel
+  }
 }
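With toyData and toyModel now shared on the LDASuite companion object, later tests can reuse the fixtures instead of redefining them inline, as the rewritten tests above already do. A hedged sketch of what a further test inside LDASuite might look like; the test name and assertions are illustrative only, not part of this commit.

    test("toy model basic properties") {  // hypothetical test, not in this commit
      val docs = sc.parallelize(toyData)      // shared toy corpus fixture
      val ldaModel: LocalLDAModel = toyModel  // shared toy model fixture
      assert(ldaModel.k === 2)
      assert(ldaModel.vocabSize === 6)
      assert(docs.count() === 6)
    }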
