Skip to content

Commit 25ef2ac

Browse files
committed
Resolve conflicts with rebasing
1 parent 11e5c37 commit 25ef2ac

File tree

2 files changed

+77
-1
lines changed

2 files changed

+77
-1
lines changed

mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -502,6 +502,26 @@ private[python] class PythonMLLibAPI extends Serializable {
502502
new MatrixFactorizationModelWrapper(model)
503503
}
504504

505+
/**
 * Java stub for Python mllib LDA.run().
 *
 * Each input LabeledPoint encodes one document: the label is the document id
 * and the feature vector holds the term counts for that document.
 */
def trainLDAModel(
    data: JavaRDD[LabeledPoint],
    k: Int,
    seed: java.lang.Long): LDAModel = {
  val lda = new LDA().setK(k)
  // Only override the default seed when the caller supplied one
  // (Python passes null for "no seed").
  if (seed != null) lda.setSeed(seed)
  try {
    lda.run(data.rdd.map(doc => (doc.label.toLong, doc.features)))
  } finally {
    // Release the input RDD once training completes, whether it succeeded or not.
    data.rdd.unpersist(blocking = false)
  }
}
523+
524+
505525
/**
506526
* Java stub for Python mllib FPGrowth.train(). This stub returns a handle
507527
* to the Java object instead of the content of the Java object. Extra care

python/pyspark/mllib/clustering.py

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,13 +31,15 @@
3131
from pyspark.rdd import RDD, ignore_unicode_prefix
3232
from pyspark.mllib.common import JavaModelWrapper, callMLlibFunc, callJavaFunc, _py2java, _java2py
3333
from pyspark.mllib.linalg import SparseVector, _convert_to_vector, DenseVector
34+
from pyspark.mllib.regression import LabeledPoint
3435
from pyspark.mllib.stat.distribution import MultivariateGaussian
3536
from pyspark.mllib.util import Saveable, Loader, inherit_doc, JavaLoader, JavaSaveable
3637
from pyspark.streaming import DStream
3738

3839
__all__ = ['KMeansModel', 'KMeans', 'GaussianMixtureModel', 'GaussianMixture',
3940
'PowerIterationClusteringModel', 'PowerIterationClustering',
40-
'StreamingKMeans', 'StreamingKMeansModel']
41+
'StreamingKMeans', 'StreamingKMeansModel',
42+
'LDA', 'LDAModel']
4143

4244

4345
@inherit_doc
@@ -574,5 +576,59 @@ def _test():
574576
exit(-1)
575577

576578

579+
class LDAModel(JavaModelWrapper):

    """ A clustering model derived from the LDA method.

    Latent Dirichlet Allocation (LDA), a topic model designed for text documents.
    Terminology
    - "word" = "term": an element of the vocabulary
    - "token": instance of a term appearing in a document
    - "topic": multinomial distribution over words representing some concept
    References:
    - Original LDA paper (journal version):
      Blei, Ng, and Jordan. "Latent Dirichlet Allocation." JMLR, 2003.

    >>> from pyspark.mllib.linalg import Vectors
    >>> from numpy import array
    >>> from numpy.testing import assert_almost_equal
    >>> data = [
    ...     LabeledPoint(1, [0.0, 1.0]),
    ...     LabeledPoint(2, [1.0, 0.0]),
    ... ]
    >>> rdd = sc.parallelize(data)
    >>> model = LDA.train(rdd, 2)
    >>> model.vocabSize()
    2
    >>> topics = model.topicsMatrix()
    >>> topics_expect = array([[0.5, 0.5], [0.5, 0.5]])
    >>> assert_almost_equal(topics, topics_expect, 1)
    """

    def topicsMatrix(self):
        """Inferred topics, where each topic is represented by a distribution over terms."""
        return self.call("topicsMatrix").toArray()

    def vocabSize(self):
        """Vocabulary size (number of terms/words in the vocabulary)."""
        return self.call("vocabSize")

    def describeTopics(self, maxTermsPerTopic=None):
        """Return the topics described by weighted terms.

        TODO:
        Implementing this method is a little hard. Since Scala's return value consists of tuples.
        """
        raise NotImplementedError("LDAModel.describeTopics() in Python must be implemented.")
624+
625+
class LDA(object):

    """Entry point for training Latent Dirichlet Allocation (LDA) topic models."""

    @classmethod
    def train(cls, rdd, k, seed=None):
        """Train an LDA topic model on the given corpus.

        :param rdd:  RDD of LabeledPoint, where the label is a document id and the
                     features are the term-count vector for that document (this is
                     the layout the Scala-side ``trainLDAModel`` stub expects).
        :param k:    Number of topics to infer.
        :param seed: Optional random seed; when ``None`` the JVM-side default is used.
        :return:     A :class:`LDAModel` wrapping the trained Java model.
        """
        model = callMLlibFunc("trainLDAModel", rdd, k, seed)
        return LDAModel(model)
631+
632+
577633
if __name__ == "__main__":
578634
_test()

0 commit comments

Comments
 (0)