
Commit 2b2bafe

address comments
1 parent 372d5a5 commit 2b2bafe

File tree

3 files changed: +106 -43 lines changed

mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala

Lines changed: 1 addition & 1 deletion
@@ -355,7 +355,7 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
  * :: Experimental ::
  * Model fitted by [[LDA]].
  *
- * @param vocabSize Vocabulary size (number of terms or terms in the vocabulary)
+ * @param vocabSize Vocabulary size (number of terms or words in the vocabulary)
  * @param sqlContext Used to construct local DataFrames for returning query results
  */
 @Since("1.6.0")

python/pyspark/__init__.py

Lines changed: 0 additions & 2 deletions
@@ -59,8 +59,6 @@ def since(version):
     indent_p = re.compile(r'\n( +)')
 
     def deco(f):
-        if not f.__doc__:
-            raise Exception("Please add doc for function %s" % (f.__name__))
         indents = indent_p.findall(f.__doc__)
         indent = ' ' * (min(len(m) for m in indents) if indents else 0)
         f.__doc__ = f.__doc__.rstrip() + "\n\n%s.. versionadded:: %s" % (indent, version)
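
For context, a self-contained sketch of how `since` reads after this change; the `return f` and `return deco` lines are assumed from the surrounding module, since the hunk above does not show them. With the explicit docstring check removed, the decorator simply appends a `.. versionadded::` directive to the existing docstring (an undocumented function would now fail later, at the `findall` call on `f.__doc__`, rather than with the removed message):

import re

def since(version):
    indent_p = re.compile(r'\n( +)')

    def deco(f):
        # Match the docstring's indentation so the appended directive lines up.
        indents = indent_p.findall(f.__doc__)
        indent = ' ' * (min(len(m) for m in indents) if indents else 0)
        f.__doc__ = f.__doc__.rstrip() + "\n\n%s.. versionadded:: %s" % (indent, version)
        return f  # assumed; not shown in the hunk above
    return deco   # assumed; not shown in the hunk above

@since("2.0.0")
def vocabSize():
    """Vocabulary size (number of terms or words in the vocabulary)"""

print(vocabSize.__doc__)
# Vocabulary size (number of terms or words in the vocabulary)
#
# .. versionadded:: 2.0.0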

python/pyspark/ml/clustering.py

Lines changed: 105 additions & 40 deletions
@@ -21,8 +21,8 @@
 from pyspark.ml.param.shared import *
 from pyspark.mllib.common import inherit_doc
 
-__all__ = ['KMeans', 'KMeansModel',
-           'BisectingKMeans', 'BisectingKMeansModel',
+__all__ = ['BisectingKMeans', 'BisectingKMeansModel',
+           'KMeans', 'KMeansModel',
            'LDA', 'LDAModel', 'LocalLDAModel', 'DistributedLDAModel']
 
 
@@ -313,85 +313,133 @@ def _create_model(self, java_model):
 
 class LDAModel(JavaModel):
     """
-    A clustering model derived from the LDA method.
+    Latent Dirichlet Allocation (LDA) model.
+    This abstraction allows for different underlying representations,
+    including local and distributed data structures.
 
     .. versionadded:: 2.0.0
     """
 
     @since("2.0.0")
     def isDistributed(self):
-        """Indicates whether this instance is of type DistributedLDAModel"""
+        """
+        Indicates whether this instance is of type DistributedLDAModel
+        """
         return self._call_java("isDistributed")
 
     @since("2.0.0")
     def vocabSize(self):
-        """Vocabulary size (number of terms or terms in the vocabulary)"""
+        """Vocabulary size (number of terms or words in the vocabulary)"""
         return self._call_java("vocabSize")
 
     @since("2.0.0")
     def topicsMatrix(self):
-        """ Inferred topics, where each topic is represented by a distribution over terms.
+        """
+        Inferred topics, where each topic is represented by a distribution over terms.
         This is a matrix of size vocabSize x k, where each column is a topic.
         No guarantees are given about the ordering of the topics.
 
-        WARNING: If this model is actually a [[DistributedLDAModel]] instance produced by
-        the Expectation-Maximization ("em") [[optimizer]], then this method could involve
+        WARNING: If this model is actually a :py:attr:`DistributedLDAModel` instance produced by
+        the Expectation-Maximization ("em") `optimizer`, then this method could involve
         collecting a large amount of data to the driver (on the order of vocabSize x k).
         """
         return self._call_java("topicsMatrix")
 
     @since("2.0.0")
     def logLikelihood(self, dataset):
-        """Calculates a lower bound on the log likelihood of the entire corpus.
+        """
+        Calculates a lower bound on the log likelihood of the entire corpus.
         See Equation (16) in the Online LDA paper (Hoffman et al., 2010).
 
-        WARNING: If this model is an instance of [[DistributedLDAModel]] (produced when
-        [[optimizer]] is set to "em"), this involves collecting a large [[topicsMatrix]] to the
+        WARNING: If this model is an instance of :py:attr:`DistributedLDAModel` (produced when
+        :py:attr:`optimizer` is set to "em"), this involves collecting a large :py:attr:`topicsMatrix` to the
         driver. This implementation may be changed in the future.
         """
         return self._call_java("logLikelihood", dataset)
 
     @since("2.0.0")
     def logPerplexity(self, dataset):
-        """Calculate an upper bound bound on perplexity. (Lower is better.)
+        """
+        Calculate an upper bound on perplexity. (Lower is better.)
         See Equation (16) in the Online LDA paper (Hoffman et al., 2010).
 
-        WARNING: If this model is an instance of [[DistributedLDAModel]] (produced when
-        [[optimizer]] is set to "em"), this involves collecting a large [[topicsMatrix]] to the
+        WARNING: If this model is an instance of :py:attr:`DistributedLDAModel` (produced when
+        :py:attr:`optimizer` is set to "em"), this involves collecting a large :py:attr:`topicsMatrix` to the
         driver. This implementation may be changed in the future.
         """
         return self._call_java("logPerplexity", dataset)
 
     @since("2.0.0")
     def describeTopics(self, maxTermsPerTopic=10):
-        """Return the topics described by their top-weighted terms.
-
-        WARNING: If vocabSize and k are large, this can return a large object!
+        """
+        Return the topics described by their top-weighted terms.
         """
         return self._call_java("describeTopics", maxTermsPerTopic)
 
     @since("2.0.0")
     def estimatedDocConcentration(self):
-        """Value for [[docConcentration]] estimated from data.
-        If Online LDA was used and [[optimizeDocConcentration]] was set to false,
-        then this returns the fixed (given) value for the [[docConcentration]] parameter.
+        """
+        Value for :py:attr:`LDA.docConcentration` estimated from data.
+        If Online LDA was used and :py:attr:`LDA.optimizeDocConcentration` was set to false,
+        then this returns the fixed (given) value for the :py:attr:`LDA.docConcentration` parameter.
         """
         return self._call_java("estimatedDocConcentration")
 
+    @since("2.0.0")
+    def trainingLogLikelihood(self):
+        """
+        Log likelihood of the observed tokens in the training set,
+        given the current parameter estimates:
+        log P(docs | topics, topic distributions for docs, Dirichlet hyperparameters)
+
+        Notes:
+          - This excludes the prior; for that, use :py:attr:`logPrior`.
+          - Even with :py:attr:`logPrior`, this is NOT the same as the data log likelihood given
+            the hyperparameters.
+          - This is computed from the topic distributions computed during training. If you call
+            :py:attr:`logLikelihood` on the same training dataset, the topic distributions
+            will be computed again, possibly giving different results.
+        """
+        return self._call_java("trainingLogLikelihood")
+
 
 class DistributedLDAModel(LDAModel):
     """
-    Model fitted by LDA.
+    Distributed model fitted by :py:attr:`LDA`.
+    This type of model is currently only produced by Expectation-Maximization (EM).
+    This model stores the inferred topics, the full training dataset, and the topic distribution
+    for each training document.
 
     .. versionadded:: 2.0.0
     """
     def toLocal(self):
-        return self._call_java("toLocal")
+        return LocalLDAModel(self._call_java("toLocal"))
+
+    @since("2.0.0")
+    def logPrior(self):
+        """
+        Log probability of the current parameter estimate:
+        log P(topics, topic distributions for docs | alpha, eta)
+        """
+        return self._call_java("logPrior")
+
+    @since("2.0.0")
+    def getCheckpointFiles(self):
+        """
+        If using checkpointing and :py:attr:`LDA.keepLastCheckpoint` is set to true, then there may be
+        saved checkpoint files. This method is provided so that users can manage those files.
+
+        Note that removing the checkpoints can cause failures if a partition is lost and is needed
+        by certain :py:attr:`DistributedLDAModel` methods. Reference counting will clean up the
+        checkpoints when this model and derivative data go out of scope.
+        """
+        return self._call_java("getCheckpointFiles")
 
 
 class LocalLDAModel(LDAModel):
     """
-    Model fitted by LDA.
+    Local (non-distributed) model fitted by :py:attr:`LDA`.
+    This model stores the inferred topics only; it does not store info about the training dataset.
 
     .. versionadded:: 2.0.0
     """
@@ -401,18 +449,27 @@ class LocalLDAModel(LDAModel):
 class LDA(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed, HasCheckpointInterval):
     """
     Latent Dirichlet Allocation (LDA), a topic model designed for text documents.
-    Terminology
-    - "word" = "term": an element of the vocabulary
+    Terminology:
+
+    - "term" = "word": an element of the vocabulary
     - "token": instance of a term appearing in a document
-    - "topic": multinomial distribution over words representing some concept
+    - "topic": multinomial distribution over terms representing some concept
+    - "document": one piece of text, corresponding to one row in the input data
     References:
     - Original LDA paper (journal version):
-    Blei, Ng, and Jordan. "Latent Dirichlet Allocation." JMLR, 2003.
+      Blei, Ng, and Jordan. "Latent Dirichlet Allocation." JMLR, 2003.
+
+    Input data (featuresCol):
+    LDA is given a collection of documents as input data, via the featuresCol parameter.
+    Each document is specified as a :py:attr:`Vector` of length vocabSize, where each entry is the
+    count for the corresponding term (word) in the document. Feature transformers such as
+    :py:attr:`Tokenizer` and :py:attr:`CountVectorizer`
+    can be useful for converting text to word count vectors.
 
     >>> from pyspark.mllib.linalg import Vectors, SparseVector
     >>> from pyspark.ml.clustering import LDA
-    >>> df = sqlContext.createDataFrame([[1, Vectors.dense([0.0, 1.0])], \
-        [2, SparseVector(2, {0: 1.0})],], ["id", "features"])
+    >>> df = sqlContext.createDataFrame([[1, Vectors.dense([0.0, 1.0])],
+    ...     [2, SparseVector(2, {0: 1.0})],], ["id", "features"])
     >>> lda = LDA(k=2, seed=1, optimizer="em")
     >>> model = lda.fit(df)
     >>> model.isDistributed()
@@ -505,8 +562,8 @@ def setParams(self, featuresCol="features", k=10,
         setParams(self, featuresCol="features", k=10, \
                   optimizer="online", learningOffset=1024.0, learningDecay=0.51, \
                   subsamplingRate=0.05, optimizeDocConcentration=True, \
-                  checkpointInterval=10, maxIter=20, docConcentration=None,
-                  topicConcentration=None,
+                  checkpointInterval=10, maxIter=20, docConcentration=None, \
+                  topicConcentration=None, \
                   topicDistributionCol="topicDistribution", seed=None):
 
         Sets params for LDA.
@@ -529,7 +586,7 @@ def setK(self, value):
     @since("2.0.0")
     def getK(self):
         """
-        Gets the value of `k` or its default value.
+        Gets the value of :py:attr:`k` or its default value.
         """
         return self.getOrDefault(self.k)
 
@@ -538,6 +595,7 @@ def setOptimizer(self, value):
         """
         Sets the value of :py:attr:`optimizer`.
         Currently only supports 'em' and 'online'.
+
         >>> algo = LDA().setOptimizer("em")
         >>> algo.getOptimizer()
         'em'
@@ -548,14 +606,15 @@ def setOptimizer(self, value):
     @since("2.0.0")
     def getOptimizer(self):
         """
-        Gets the value of `optimizer` or its default value.
+        Gets the value of :py:attr:`optimizer` or its default value.
         """
         return self.getOrDefault(self.optimizer)
 
     @since("2.0.0")
     def setLearningOffset(self, value):
         """
         Sets the value of :py:attr:`learningOffset`.
+
         >>> algo = LDA().setLearningOffset(100)
         >>> algo.getLearningOffset()
         100
@@ -566,14 +625,15 @@ def setLearningOffset(self, value):
     @since("2.0.0")
     def getLearningOffset(self):
         """
-        Gets the value of `learningOffset` or its default value.
+        Gets the value of :py:attr:`learningOffset` or its default value.
         """
         return self.getOrDefault(self.learningOffset)
 
     @since("2.0.0")
     def setLearningDecay(self, value):
         """
         Sets the value of :py:attr:`learningDecay`.
+
         >>> algo = LDA().setLearningDecay(0.1)
         >>> algo.getLearningDecay()
         0.1...
@@ -584,14 +644,15 @@ def setLearningDecay(self, value):
     @since("2.0.0")
     def getLearningDecay(self):
         """
-        Gets the value of `learningDecay` or its default value.
+        Gets the value of :py:attr:`learningDecay` or its default value.
         """
         return self.getOrDefault(self.learningDecay)
 
     @since("2.0.0")
     def setSubsamplingRate(self, value):
         """
         Sets the value of :py:attr:`subsamplingRate`.
+
         >>> algo = LDA().setSubsamplingRate(0.1)
         >>> algo.getSubsamplingRate()
         0.1...
@@ -602,14 +663,15 @@ def setSubsamplingRate(self, value):
     @since("2.0.0")
     def getSubsamplingRate(self):
         """
-        Gets the value of `subsamplingRate` or its default value.
+        Gets the value of :py:attr:`subsamplingRate` or its default value.
         """
         return self.getOrDefault(self.subsamplingRate)
 
     @since("2.0.0")
     def setOptimizeDocConcentration(self, value):
         """
         Sets the value of :py:attr:`optimizeDocConcentration`.
+
         >>> algo = LDA().setOptimizeDocConcentration(True)
         >>> algo.getOptimizeDocConcentration()
         True
@@ -620,14 +682,15 @@ def setOptimizeDocConcentration(self, value):
     @since("2.0.0")
     def getOptimizeDocConcentration(self):
         """
-        Gets the value of `optimizeDocConcentration` or its default value.
+        Gets the value of :py:attr:`optimizeDocConcentration` or its default value.
         """
         return self.getOrDefault(self.optimizeDocConcentration)
 
     @since("2.0.0")
     def setDocConcentration(self, value):
         """
         Sets the value of :py:attr:`docConcentration`.
+
         >>> algo = LDA().setDocConcentration([0.1, 0.2])
         >>> algo.getDocConcentration()
         [0.1..., 0.2...]
@@ -638,14 +701,15 @@ def setDocConcentration(self, value):
     @since("2.0.0")
     def getDocConcentration(self):
         """
-        Gets the value of `docConcentration` or its default value.
+        Gets the value of :py:attr:`docConcentration` or its default value.
         """
         return self.getOrDefault(self.docConcentration)
 
     @since("2.0.0")
     def setTopicConcentration(self, value):
         """
         Sets the value of :py:attr:`topicConcentration`.
+
         >>> algo = LDA().setTopicConcentration(0.5)
         >>> algo.getTopicConcentration()
         0.5...
@@ -656,14 +720,15 @@ def setTopicConcentration(self, value):
     @since("2.0.0")
     def getTopicConcentration(self):
         """
-        Gets the value of `topicConcentration` or its default value.
+        Gets the value of :py:attr:`topicConcentration` or its default value.
         """
         return self.getOrDefault(self.topicConcentration)
 
     @since("2.0.0")
     def setTopicDistributionCol(self, value):
         """
         Sets the value of :py:attr:`topicDistributionCol`.
+
         >>> algo = LDA().setTopicDistributionCol("topicDistributionCol")
         >>> algo.getTopicDistributionCol()
         'topicDistributionCol'
@@ -674,7 +739,7 @@ def setTopicDistributionCol(self, value):
     @since("2.0.0")
     def getTopicDistributionCol(self):
         """
-        Gets the value of `topicDistributionCol` or its default value.
+        Gets the value of :py:attr:`topicDistributionCol` or its default value.
         """
         return self.getOrDefault(self.topicDistributionCol)
 
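
Putting the clustering.py changes together, the updated doctest implies an end-to-end flow like this hedged sketch (Spark 2.0-era API; assumes a live `sqlContext`, as the doctests do):

from pyspark.mllib.linalg import Vectors, SparseVector
from pyspark.ml.clustering import LDA

# Two tiny documents as term-count vectors over a 2-term vocabulary.
df = sqlContext.createDataFrame([[1, Vectors.dense([0.0, 1.0])],
                                 [2, SparseVector(2, {0: 1.0})]], ["id", "features"])

lda = LDA(k=2, seed=1, optimizer="em")
model = lda.fit(df)

print(model.isDistributed())           # True with the "em" optimizer
print(model.vocabSize())               # 2
topics = model.describeTopics(maxTermsPerTopic=2)  # top-weighted terms per topic
print(model.logPerplexity(df))         # upper bound on perplexity; lower is better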