@@ -210,16 +210,7 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
   }
 
   test("OnlineLDAOptimizer with toy data") {
-    def toydata: Array[(Long, Vector)] = Array(
-      Vectors.sparse(6, Array(0, 1), Array(1, 1)),
-      Vectors.sparse(6, Array(1, 2), Array(1, 1)),
-      Vectors.sparse(6, Array(0, 2), Array(1, 1)),
-      Vectors.sparse(6, Array(3, 4), Array(1, 1)),
-      Vectors.sparse(6, Array(3, 5), Array(1, 1)),
-      Vectors.sparse(6, Array(4, 5), Array(1, 1))
-    ).zipWithIndex.map { case (wordCounts, docId) => (docId.toLong, wordCounts) }
-
-    val docs = sc.parallelize(toydata)
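+    // toyData is defined in the test object LDASuite at the bottom of this file.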
+    val docs = sc.parallelize(toyData)
     val op = new OnlineLDAOptimizer().setMiniBatchFraction(1).setTau0(1024).setKappa(0.51)
       .setGammaShape(1e10)
     val lda = new LDA().setK(2)
@@ -242,30 +233,45 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
     }
   }
 
-  test("LocalLDAModel logPerplexity") {
-    val k = 2
-    val vocabSize = 6
-    val alpha = 0.01
-    val eta = 0.01
-    val gammaShape = 100
-    // obtained from LDA model trained in gensim, see below
-    val topics = new DenseMatrix(numRows = vocabSize, numCols = k, values = Array(
-      1.86738052, 1.94056535, 1.89981687, 0.0833265, 0.07405918, 0.07940597,
-      0.15081551, 0.08637973, 0.12428538, 1.9474897, 1.94615165, 1.95204124))
+  test("LocalLDAModel logLikelihood") {
+    val ldaModel: LocalLDAModel = toyModel
 
-    def toydata: Array[(Long, Vector)] = Array(
-      Vectors.sparse(6, Array(0, 1), Array(1, 1)),
-      Vectors.sparse(6, Array(1, 2), Array(1, 1)),
-      Vectors.sparse(6, Array(0, 2), Array(1, 1)),
-      Vectors.sparse(6, Array(3, 4), Array(1, 1)),
-      Vectors.sparse(6, Array(3, 5), Array(1, 1)),
-      Vectors.sparse(6, Array(4, 5), Array(1, 1))
-    ).zipWithIndex.map { case (wordCounts, docId) => (docId.toLong, wordCounts) }
-    val docs = sc.parallelize(toydata)
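+    // A one-token document (word 0 once) and a five-token document (word 0 five times).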
+    val docsSingleWord = sc.parallelize(Array(Vectors.sparse(6, Array(0), Array(1)))
+      .zipWithIndex
+      .map { case (wordCounts, docId) => (docId.toLong, wordCounts) })
+    val docsRepeatedWord = sc.parallelize(Array(Vectors.sparse(6, Array(0), Array(5)))
+      .zipWithIndex
+      .map { case (wordCounts, docId) => (docId.toLong, wordCounts) })
 
+    /* Verify results using gensim:
+       import numpy as np
+       from gensim import models
+       corpus = [
+         [(0, 1.0), (1, 1.0)],
+         [(1, 1.0), (2, 1.0)],
+         [(0, 1.0), (2, 1.0)],
+         [(3, 1.0), (4, 1.0)],
+         [(3, 1.0), (5, 1.0)],
+         [(4, 1.0), (5, 1.0)]]
+       np.random.seed(2345)
+       lda = models.ldamodel.LdaModel(
+         corpus=corpus, alpha=0.01, eta=0.01, num_topics=2, update_every=0, passes=100,
+         decay=0.51, offset=1024)
+       docsSingleWord = [[(0, 1.0)]]
+       docsRepeatedWord = [[(0, 5.0)]]
+       print(lda.bound(docsSingleWord))
+       > -25.9706969833
+       print(lda.bound(docsRepeatedWord))
+       > -31.4413908227
+     */
 
-    val ldaModel: LocalLDAModel = new LocalLDAModel(
-      topics, Vectors.dense(Array.fill(k)(alpha)), eta, gammaShape)
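+    // Expect a lower (more negative) bound for the five-token doc than for the one-token doc.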
+    assert(ldaModel.logLikelihood(docsSingleWord) ~== -25.971 relTol 1E-3D)
+    assert(ldaModel.logLikelihood(docsRepeatedWord) ~== -31.441 relTol 1E-3D)
+  }
+
+  test("LocalLDAModel logPerplexity") {
+    val docs = sc.parallelize(toyData)
+    val ldaModel: LocalLDAModel = toyModel
 
     /* Verify results using gensim:
        import numpy as np
@@ -285,32 +291,13 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
        > -3.69051285096
      */
 
-    assert(ldaModel.logPerplexity(docs) ~== -3.690D relTol 1E-3D)
+    // Gensim's definition of perplexity is the negative of ours (and Stanford NLP's).
+    assert(ldaModel.logPerplexity(docs) ~== 3.690D relTol 1E-3D)
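+    // i.e. gensim's -3.6905 above appears here as a logPerplexity of +3.690.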
   }
 
   test("LocalLDAModel predict") {
-    val k = 2
-    val vocabSize = 6
-    val alpha = 0.01
-    val eta = 0.01
-    val gammaShape = 100
-    // obtained from LDA model trained in gensim, see below
-    val topics = new DenseMatrix(numRows = vocabSize, numCols = k, values = Array(
-      1.86738052, 1.94056535, 1.89981687, 0.0833265, 0.07405918, 0.07940597,
-      0.15081551, 0.08637973, 0.12428538, 1.9474897, 1.94615165, 1.95204124))
-
-    def toydata: Array[(Long, Vector)] = Array(
-      Vectors.sparse(6, Array(0, 1), Array(1, 1)),
-      Vectors.sparse(6, Array(1, 2), Array(1, 1)),
-      Vectors.sparse(6, Array(0, 2), Array(1, 1)),
-      Vectors.sparse(6, Array(3, 4), Array(1, 1)),
-      Vectors.sparse(6, Array(3, 5), Array(1, 1)),
-      Vectors.sparse(6, Array(4, 5), Array(1, 1))
-    ).zipWithIndex.map { case (wordCounts, docId) => (docId.toLong, wordCounts) }
-    val docs = sc.parallelize(toydata)
-
-    val ldaModel: LocalLDAModel = new LocalLDAModel(
-      topics, Vectors.dense(Array.fill(k)(alpha)), eta, gammaShape)
+    val docs = sc.parallelize(toyData)
+    val ldaModel: LocalLDAModel = toyModel
 
     /* Verify results using gensim:
        import numpy as np
@@ -351,16 +338,7 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
   }
 
   test("OnlineLDAOptimizer with asymmetric prior") {
-    def toydata: Array[(Long, Vector)] = Array(
-      Vectors.sparse(6, Array(0, 1), Array(1, 1)),
-      Vectors.sparse(6, Array(1, 2), Array(1, 1)),
-      Vectors.sparse(6, Array(0, 2), Array(1, 1)),
-      Vectors.sparse(6, Array(3, 4), Array(1, 1)),
-      Vectors.sparse(6, Array(3, 5), Array(1, 1)),
-      Vectors.sparse(6, Array(4, 5), Array(1, 1))
-    ).zipWithIndex.map { case (wordCounts, docId) => (docId.toLong, wordCounts) }
-
-    val docs = sc.parallelize(toydata)
+    val docs = sc.parallelize(toyData)
     val op = new OnlineLDAOptimizer().setMiniBatchFraction(1).setTau0(1024).setKappa(0.51)
       .setGammaShape(1e10)
     val lda = new LDA().setK(2)
@@ -531,4 +509,27 @@ private[clustering] object LDASuite {
   def getNonEmptyDoc(corpus: Array[(Long, Vector)]): Array[(Long, Vector)] = corpus.filter {
     case (_, wc: Vector) => Vectors.norm(wc, p = 1.0) != 0.0
   }
+
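+  /** Six two-word documents over a six-term vocabulary, shared by the tests above. */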
+  def toyData: Array[(Long, Vector)] = Array(
+    Vectors.sparse(6, Array(0, 1), Array(1, 1)),
+    Vectors.sparse(6, Array(1, 2), Array(1, 1)),
+    Vectors.sparse(6, Array(0, 2), Array(1, 1)),
+    Vectors.sparse(6, Array(3, 4), Array(1, 1)),
+    Vectors.sparse(6, Array(3, 5), Array(1, 1)),
+    Vectors.sparse(6, Array(4, 5), Array(1, 1))
+  ).zipWithIndex.map { case (wordCounts, docId) => (docId.toLong, wordCounts) }
+
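+  /**
+   * A 2-topic LocalLDAModel over the toy vocabulary; the topic matrix was obtained from a
+   * gensim run (see the test comments above).
+   */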
+  def toyModel: LocalLDAModel = {
+    val k = 2
+    val vocabSize = 6
+    val alpha = 0.01
+    val eta = 0.01
+    val gammaShape = 100
+    val topics = new DenseMatrix(numRows = vocabSize, numCols = k, values = Array(
+      1.86738052, 1.94056535, 1.89981687, 0.0833265, 0.07405918, 0.07940597,
+      0.15081551, 0.08637973, 0.12428538, 1.9474897, 1.94615165, 1.95204124))
+    val ldaModel: LocalLDAModel = new LocalLDAModel(
+      topics, Vectors.dense(Array.fill(k)(alpha)), eta, gammaShape)
+    ldaModel
+  }
 }