 from pyspark.ml.param.shared import *
 from pyspark.mllib.common import inherit_doc

-__all__ = ['KMeans', 'KMeansModel',
-           'BisectingKMeans', 'BisectingKMeansModel',
+__all__ = ['BisectingKMeans', 'BisectingKMeansModel',
+           'KMeans', 'KMeansModel',
            'LDA', 'LDAModel', 'LocalLDAModel', 'DistributedLDAModel']

@@ -313,85 +313,133 @@ def _create_model(self, java_model):

 class LDAModel(JavaModel):
     """
-    A clustering model derived from the LDA method.
+    Latent Dirichlet Allocation (LDA) model.
+    This abstraction allows for different underlying representations,
+    including local and distributed data structures.

     .. versionadded:: 2.0.0
     """

     @since("2.0.0")
     def isDistributed(self):
-        """Indicates whether this instance is of type DistributedLDAModel"""
+        """
+        Indicates whether this instance is of type DistributedLDAModel.
+        """
         return self._call_java("isDistributed")

     @since("2.0.0")
     def vocabSize(self):
-        """Vocabulary size (number of terms or terms in the vocabulary)"""
+        """Vocabulary size (number of terms or words in the vocabulary)"""
         return self._call_java("vocabSize")

     @since("2.0.0")
     def topicsMatrix(self):
-        """Inferred topics, where each topic is represented by a distribution over terms.
+        """
+        Inferred topics, where each topic is represented by a distribution over terms.
         This is a matrix of size vocabSize x k, where each column is a topic.
         No guarantees are given about the ordering of the topics.

-        WARNING: If this model is actually a [[DistributedLDAModel]] instance produced by
-        the Expectation-Maximization ("em") [[optimizer]], then this method could involve
+        WARNING: If this model is actually a :py:attr:`DistributedLDAModel` instance produced by
+        the Expectation-Maximization ("em") `optimizer`, then this method could involve
         collecting a large amount of data to the driver (on the order of vocabSize x k).
         """
         return self._call_java("topicsMatrix")
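
For illustration, a minimal sketch of pulling the topics to the driver, reusing the toy corpus from the LDA doctest further down and assuming a live `sqlContext`:

    >>> from pyspark.mllib.linalg import Vectors, SparseVector
    >>> from pyspark.ml.clustering import LDA
    >>> df = sqlContext.createDataFrame([[1, Vectors.dense([0.0, 1.0])],
    ...                                  [2, SparseVector(2, {0: 1.0})]], ["id", "features"])
    >>> model = LDA(k=2, seed=1, optimizer="em").fit(df)
    >>> topics = model.topicsMatrix()  # local vocabSize x k matrix, collected to the driver
    >>> topics.numRows, topics.numCols
    (2, 2)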

     @since("2.0.0")
     def logLikelihood(self, dataset):
-        """Calculates a lower bound on the log likelihood of the entire corpus.
+        """
+        Calculates a lower bound on the log likelihood of the entire corpus.
         See Equation (16) in the Online LDA paper (Hoffman et al., 2010).

-        WARNING: If this model is an instance of [[DistributedLDAModel]] (produced when
-        [[optimizer]] is set to "em"), this involves collecting a large [[topicsMatrix]] to the
+        WARNING: If this model is an instance of :py:attr:`DistributedLDAModel` (produced when
+        :py:attr:`optimizer` is set to "em"), this involves collecting a large :py:attr:`topicsMatrix` to the
         driver. This implementation may be changed in the future.
         """
         return self._call_java("logLikelihood", dataset)

     @since("2.0.0")
     def logPerplexity(self, dataset):
-        """Calculate an upper bound bound on perplexity. (Lower is better.)
+        """
+        Calculate an upper bound on perplexity. (Lower is better.)
         See Equation (16) in the Online LDA paper (Hoffman et al., 2010).

-        WARNING: If this model is an instance of [[DistributedLDAModel]] (produced when
-        [[optimizer]] is set to "em"), this involves collecting a large [[topicsMatrix]] to the
+        WARNING: If this model is an instance of :py:attr:`DistributedLDAModel` (produced when
+        :py:attr:`optimizer` is set to "em"), this involves collecting a large :py:attr:`topicsMatrix` to the
         driver. This implementation may be changed in the future.
         """
         return self._call_java("logPerplexity", dataset)
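
Both bounds are evaluated against a DataFrame of term-count vectors; a sketch continuing from the `model` fitted in the sketch above (exact values depend on the data):

    >>> ll = model.logLikelihood(df)  # lower bound on the corpus log likelihood; higher is better
    >>> lp = model.logPerplexity(df)  # upper bound on perplexity; lower is better
    >>> ll < 0.0 and lp > 0.0
    True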

     @since("2.0.0")
     def describeTopics(self, maxTermsPerTopic=10):
-        """Return the topics described by their top-weighted terms.
-
-        WARNING: If vocabSize and k are large, this can return a large object!
+        """
+        Return the topics described by their top-weighted terms.
         """
         return self._call_java("describeTopics", maxTermsPerTopic)
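
The result is a DataFrame with one row per topic; a sketch, again assuming the fitted `model` from the sketch above:

    >>> topics_df = model.describeTopics(maxTermsPerTopic=3)
    >>> topics_df.columns
    ['topic', 'termIndices', 'termWeights']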

     @since("2.0.0")
     def estimatedDocConcentration(self):
-        """Value for [[docConcentration]] estimated from data.
-        If Online LDA was used and [[optimizeDocConcentration]] was set to false,
-        then this returns the fixed (given) value for the [[docConcentration]] parameter.
+        """
+        Value for :py:attr:`LDA.docConcentration` estimated from data.
+        If Online LDA was used and :py:attr:`LDA.optimizeDocConcentration` was set to false,
+        then this returns the fixed (given) value for the :py:attr:`LDA.docConcentration` parameter.
         """
         return self._call_java("estimatedDocConcentration")
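
The returned vector has one concentration value per topic; a sketch with the `model` from the sketch above:

    >>> alpha = model.estimatedDocConcentration()
    >>> len(alpha)
    2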

+    @since("2.0.0")
+    def trainingLogLikelihood(self):
+        """
+        Log likelihood of the observed tokens in the training set,
+        given the current parameter estimates:
+        log P(docs | topics, topic distributions for docs, Dirichlet hyperparameters)
+
+        Notes:
+          - This excludes the prior; for that, use :py:attr:`logPrior`.
+          - Even with :py:attr:`logPrior`, this is NOT the same as the data log likelihood given
+            the hyperparameters.
+          - This is computed from the topic distributions computed during training. If you call
+            :py:attr:`logLikelihood` on the same training dataset, the topic distributions
+            will be computed again, possibly giving different results.
+        """
+        return self._call_java("trainingLogLikelihood")
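
Because the EM optimizer retains the topic distributions inferred during training, this reads them back rather than re-running inference; a sketch with the EM-fitted `model` from the sketch above:

    >>> model.trainingLogLikelihood() < 0.0
    True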
+

 class DistributedLDAModel(LDAModel):
     """
-    Model fitted by LDA.
+    Distributed model fitted by :py:attr:`LDA`.
+    This type of model is currently only produced by Expectation-Maximization (EM).
+    This model stores the inferred topics, the full training dataset, and the topic distribution
+    for each training document.

     .. versionadded:: 2.0.0
     """
     def toLocal(self):
-        return self._call_java("toLocal")
+        return LocalLDAModel(self._call_java("toLocal"))
+
+    @since("2.0.0")
+    def logPrior(self):
+        """
+        Log probability of the current parameter estimate:
+        log P(topics, topic distributions for docs | alpha, eta)
+        """
+        return self._call_java("logPrior")
+
+    @since("2.0.0")
+    def getCheckpointFiles(self):
+        """
+        If using checkpointing and :py:attr:`LDA.keepLastCheckpoint` is set to true, then there may be
+        saved checkpoint files. This method is provided so that users can manage those files.
+
+        Note that removing the checkpoints can cause failures if a partition is lost and is needed
+        by certain :py:attr:`DistributedLDAModel` methods. Reference counting will clean up the
+        checkpoints when this model and derivative data go out of scope.
+        """
+        return self._call_java("getCheckpointFiles")
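
A sketch of these distributed-only methods, again assuming the EM-fitted `model` from the earlier sketch and no configured checkpoint directory:

    >>> local_model = model.toLocal()  # keeps the topics, drops per-document training state
    >>> prior = model.logPrior()       # log P(topics, doc distributions | alpha, eta)
    >>> model.getCheckpointFiles()     # typically empty when no checkpoint dir is set
    []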


 class LocalLDAModel(LDAModel):
     """
-    Model fitted by LDA.
+    Local (non-distributed) model fitted by :py:attr:`LDA`.
+    This model stores the inferred topics only; it does not store info about the training dataset.

     .. versionadded:: 2.0.0
     """
@@ -401,18 +449,27 @@ class LocalLDAModel(LDAModel):
 class LDA(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed, HasCheckpointInterval):
     """
     Latent Dirichlet Allocation (LDA), a topic model designed for text documents.
-    Terminology
-    - "word" = "term": an element of the vocabulary
+    Terminology:
+
+    - "term" = "word": an element of the vocabulary
     - "token": instance of a term appearing in a document
-    - "topic": multinomial distribution over words representing some concept
+    - "topic": multinomial distribution over terms representing some concept
+    - "document": one piece of text, corresponding to one row in the input data
+
     References:
     - Original LDA paper (journal version):
-      Blei, Ng, and Jordan. "Latent Dirichlet Allocation." JMLR, 2003.
+        Blei, Ng, and Jordan. "Latent Dirichlet Allocation." JMLR, 2003.
+
+    Input data (featuresCol):
+    LDA is given a collection of documents as input data, via the featuresCol parameter.
+    Each document is specified as a :py:attr:`Vector` of length vocabSize, where each entry is the
+    count for the corresponding term (word) in the document. Feature transformers such as
+    :py:attr:`Tokenizer` and :py:attr:`CountVectorizer`
+    can be useful for converting text to word count vectors.

     >>> from pyspark.mllib.linalg import Vectors, SparseVector
     >>> from pyspark.ml.clustering import LDA
-    >>> df = sqlContext.createDataFrame([[1, Vectors.dense([0.0, 1.0])], \
-        [2, SparseVector(2, {0: 1.0})],], ["id", "features"])
+    >>> df = sqlContext.createDataFrame([[1, Vectors.dense([0.0, 1.0])],
+    ...                                  [2, SparseVector(2, {0: 1.0})],], ["id", "features"])
     >>> lda = LDA(k=2, seed=1, optimizer="em")
     >>> model = lda.fit(df)
     >>> model.isDistributed()
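
In practice the count vectors usually come from a small feature pipeline rather than being built by hand; a hedged sketch with a made-up two-document corpus (`Tokenizer` and `CountVectorizer` live in `pyspark.ml.feature`):

    >>> from pyspark.ml.feature import Tokenizer, CountVectorizer
    >>> raw = sqlContext.createDataFrame([(1, "spark spark hadoop"),
    ...                                   (2, "hadoop mapreduce")], ["id", "text"])
    >>> words = Tokenizer(inputCol="text", outputCol="words").transform(raw)
    >>> cv_model = CountVectorizer(inputCol="words", outputCol="features").fit(words)
    >>> counts = cv_model.transform(words)  # term-count vectors in the "features" column
    >>> lda_model = LDA(k=2, seed=1).fit(counts)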
@@ -505,8 +562,8 @@ def setParams(self, featuresCol="features", k=10,
         setParams(self, featuresCol="features", k=10, \
                   optimizer="online", learningOffset=1024.0, learningDecay=0.51, \
                   subsamplingRate=0.05, optimizeDocConcentration=True, \
-                  checkpointInterval=10, maxIter=20, docConcentration=None,
-                  topicConcentration=None,
+                  checkpointInterval=10, maxIter=20, docConcentration=None, \
+                  topicConcentration=None, \
                   topicDistributionCol="topicDistribution", seed=None):

         Sets params for LDA.
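
These keyword parameters mirror the constructor, so an estimator can be reconfigured after construction; a sketch:

    >>> lda = LDA()
    >>> lda = lda.setParams(k=5, optimizer="online", maxIter=50)
    >>> lda.getK()
    5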
@@ -529,7 +586,7 @@ def setK(self, value):
     @since("2.0.0")
     def getK(self):
         """
-        Gets the value of `k` or its default value.
+        Gets the value of :py:attr:`k` or its default value.
         """
         return self.getOrDefault(self.k)

@@ -538,6 +595,7 @@ def setOptimizer(self, value):
         """
         Sets the value of :py:attr:`optimizer`.
         Currently only supports 'em' and 'online'.
+
         >>> algo = LDA().setOptimizer("em")
         >>> algo.getOptimizer()
         'em'
@@ -548,14 +606,15 @@ def setOptimizer(self, value):
     @since("2.0.0")
     def getOptimizer(self):
         """
-        Gets the value of `optimizer` or its default value.
+        Gets the value of :py:attr:`optimizer` or its default value.
         """
         return self.getOrDefault(self.optimizer)

     @since("2.0.0")
     def setLearningOffset(self, value):
         """
         Sets the value of :py:attr:`learningOffset`.
+
         >>> algo = LDA().setLearningOffset(100)
         >>> algo.getLearningOffset()
         100
@@ -566,14 +625,15 @@ def setLearningOffset(self, value):
     @since("2.0.0")
     def getLearningOffset(self):
         """
-        Gets the value of `learningOffset` or its default value.
+        Gets the value of :py:attr:`learningOffset` or its default value.
         """
         return self.getOrDefault(self.learningOffset)

     @since("2.0.0")
     def setLearningDecay(self, value):
         """
         Sets the value of :py:attr:`learningDecay`.
+
         >>> algo = LDA().setLearningDecay(0.1)
         >>> algo.getLearningDecay()
         0.1...
@@ -584,14 +644,15 @@ def setLearningDecay(self, value):
     @since("2.0.0")
     def getLearningDecay(self):
         """
-        Gets the value of `learningDecay` or its default value.
+        Gets the value of :py:attr:`learningDecay` or its default value.
         """
         return self.getOrDefault(self.learningDecay)

     @since("2.0.0")
     def setSubsamplingRate(self, value):
         """
         Sets the value of :py:attr:`subsamplingRate`.
+
         >>> algo = LDA().setSubsamplingRate(0.1)
         >>> algo.getSubsamplingRate()
         0.1...
@@ -602,14 +663,15 @@ def setSubsamplingRate(self, value):
     @since("2.0.0")
     def getSubsamplingRate(self):
         """
-        Gets the value of `subsamplingRate` or its default value.
+        Gets the value of :py:attr:`subsamplingRate` or its default value.
         """
         return self.getOrDefault(self.subsamplingRate)

     @since("2.0.0")
     def setOptimizeDocConcentration(self, value):
         """
         Sets the value of :py:attr:`optimizeDocConcentration`.
+
         >>> algo = LDA().setOptimizeDocConcentration(True)
         >>> algo.getOptimizeDocConcentration()
         True
@@ -620,14 +682,15 @@ def setOptimizeDocConcentration(self, value):
     @since("2.0.0")
     def getOptimizeDocConcentration(self):
         """
-        Gets the value of `optimizeDocConcentration` or its default value.
+        Gets the value of :py:attr:`optimizeDocConcentration` or its default value.
         """
         return self.getOrDefault(self.optimizeDocConcentration)

     @since("2.0.0")
     def setDocConcentration(self, value):
         """
         Sets the value of :py:attr:`docConcentration`.
+
         >>> algo = LDA().setDocConcentration([0.1, 0.2])
         >>> algo.getDocConcentration()
         [0.1..., 0.2...]
@@ -638,14 +701,15 @@ def setDocConcentration(self, value):
     @since("2.0.0")
     def getDocConcentration(self):
         """
-        Gets the value of `docConcentration` or its default value.
+        Gets the value of :py:attr:`docConcentration` or its default value.
         """
         return self.getOrDefault(self.docConcentration)

     @since("2.0.0")
     def setTopicConcentration(self, value):
         """
         Sets the value of :py:attr:`topicConcentration`.
+
         >>> algo = LDA().setTopicConcentration(0.5)
         >>> algo.getTopicConcentration()
         0.5...
@@ -656,14 +720,15 @@ def setTopicConcentration(self, value):
     @since("2.0.0")
     def getTopicConcentration(self):
         """
-        Gets the value of `topicConcentration` or its default value.
+        Gets the value of :py:attr:`topicConcentration` or its default value.
         """
         return self.getOrDefault(self.topicConcentration)

     @since("2.0.0")
     def setTopicDistributionCol(self, value):
         """
         Sets the value of :py:attr:`topicDistributionCol`.
+
         >>> algo = LDA().setTopicDistributionCol("topicDistributionCol")
         >>> algo.getTopicDistributionCol()
         'topicDistributionCol'
@@ -674,7 +739,7 @@ def setTopicDistributionCol(self, value):
     @since("2.0.0")
     def getTopicDistributionCol(self):
         """
-        Gets the value of `topicDistributionCol` or its default value.
+        Gets the value of :py:attr:`topicDistributionCol` or its default value.
         """
         return self.getOrDefault(self.topicDistributionCol)
