|
31 | 31 | from pyspark.rdd import RDD, ignore_unicode_prefix |
32 | 32 | from pyspark.mllib.common import JavaModelWrapper, callMLlibFunc, callJavaFunc, _py2java, _java2py |
33 | 33 | from pyspark.mllib.linalg import SparseVector, _convert_to_vector, DenseVector |
| 34 | +from pyspark.mllib.regression import LabeledPoint |
34 | 35 | from pyspark.mllib.stat.distribution import MultivariateGaussian |
35 | 36 | from pyspark.mllib.util import Saveable, Loader, inherit_doc, JavaLoader, JavaSaveable |
36 | 37 | from pyspark.streaming import DStream |
37 | 38 |
|
38 | 39 | __all__ = ['KMeansModel', 'KMeans', 'GaussianMixtureModel', 'GaussianMixture', |
39 | 40 | 'PowerIterationClusteringModel', 'PowerIterationClustering', |
40 | | - 'StreamingKMeans', 'StreamingKMeansModel'] |
| 41 | + 'StreamingKMeans', 'StreamingKMeansModel', |
| 42 | + 'LDA', 'LDAModel'] |
41 | 43 |
|
42 | 44 |
|
43 | 45 | @inherit_doc |
@@ -574,5 +576,59 @@ def _test(): |
574 | 576 | exit(-1) |
575 | 577 |
|
576 | 578 |
|
class LDAModel(JavaModelWrapper):

    """A clustering model derived from the LDA method.

    Latent Dirichlet Allocation (LDA), a topic model designed for text documents.

    Terminology:
    - "word" = "term": an element of the vocabulary
    - "token": instance of a term appearing in a document
    - "topic": multinomial distribution over words representing some concept

    References:
    - Original LDA paper (journal version):
      Blei, Ng, and Jordan. "Latent Dirichlet Allocation." JMLR, 2003.

    >>> from numpy import array
    >>> from numpy.testing import assert_almost_equal
    >>> data = [
    ...     LabeledPoint(1, [0.0, 1.0]),
    ...     LabeledPoint(2, [1.0, 0.0]),
    ... ]
    >>> rdd = sc.parallelize(data)
    >>> model = LDA.train(rdd, 2)
    >>> model.vocabSize()
    2
    >>> topics = model.topicsMatrix()
    >>> topics_expect = array([[0.5, 0.5], [0.5, 0.5]])
    >>> assert_almost_equal(topics, topics_expect, 1)
    """

    def topicsMatrix(self):
        """Inferred topics, where each topic is represented by a distribution over terms.

        Returns the (vocabSize x k) topic-term matrix as a numpy array
        (converted from the JVM Matrix via toArray()).
        """
        return self.call("topicsMatrix").toArray()

    def vocabSize(self):
        """Vocabulary size (number of terms in the vocabulary)."""
        return self.call("vocabSize")

    def describeTopics(self, maxTermsPerTopic=None):
        """Return the topics described by weighted terms.

        Not yet implemented in Python: the Scala-side return value consists
        of tuples of (term indices, term weights) for which no Py4J
        conversion is available yet.

        :raise NotImplementedError: always, until the conversion is implemented.
        """
        raise NotImplementedError("LDAModel.describeTopics() in Python must be implemented.")
| 623 | + |
| 624 | + |
class LDA(object):

    """Entry point for training Latent Dirichlet Allocation topic models.

    See :class:`LDAModel` for the resulting model type.
    """

    @classmethod
    def train(cls, rdd, k, seed=None):
        """Train an LDA topic model on the given documents.

        :param rdd:  RDD of documents — per the LDAModel doctest, each element
                     is a LabeledPoint whose label is the document id and whose
                     features are the term-count vector (NOTE(review): confirm
                     against the Scala trainLDAModel wrapper).
        :param k:    Number of topics to infer.
        :param seed: Random seed for the sampler; ``None`` lets the JVM side
                     pick one.
        :return:     A :class:`LDAModel` wrapping the trained JVM model.
        """
        model = callMLlibFunc("trainLDAModel", rdd, k, seed)
        return LDAModel(model)
| 632 | + |
577 | 633 | if __name__ == "__main__": |
578 | 634 | _test() |
0 commit comments