Skip to content

Commit 0e2e006

Browse files
committed
respond to review comments
1 parent 08a45da commit 0e2e006

File tree

2 files changed

+27
-21
lines changed

2 files changed

+27
-21
lines changed

mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala

Lines changed: 15 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ import org.apache.spark.util.Utils
3737
* - "token": instance of a term appearing in a document
3838
* - "topic": multinomial distribution over words representing some concept
3939
*
40+
* References:
41+
* - Original LDA paper (journal version):
42+
* Blei, Ng, and Jordan. "Latent Dirichlet Allocation." JMLR, 2003.
43+
*
4044
* @see [[http://en.wikipedia.org/wiki/Latent_Dirichlet_allocation Latent Dirichlet allocation
4145
* (Wikipedia)]]
4246
*/
@@ -47,12 +51,11 @@ class LDA private (
4751
private var docConcentration: Double,
4852
private var topicConcentration: Double,
4953
private var seed: Long,
50-
private var checkpointInterval: Int) extends Logging {
54+
private var checkpointInterval: Int,
55+
private var ldaOptimizer: LDAOptimizer) extends Logging {
5156

5257
def this() = this(k = 10, maxIterations = 20, docConcentration = -1, topicConcentration = -1,
53-
seed = Utils.random.nextLong(), checkpointInterval = 10)
54-
55-
private var ldaOptimizer: LDAOptimizer = getDefaultOptimizer("EM")
58+
seed = Utils.random.nextLong(), checkpointInterval = 10, ldaOptimizer = new EMLDAOptimizer)
5659

5760
/**
5861
* Number of topics to infer. I.e., the number of soft cluster centers.
@@ -208,7 +211,7 @@ class LDA private (
208211

209212

210213
/** LDAOptimizer used to perform the actual calculation */
211-
def getOptimizer(): LDAOptimizer = ldaOptimizer
214+
def getOptimizer: LDAOptimizer = ldaOptimizer
212215

213216
/**
214217
* LDAOptimizer used to perform the actual calculation (default = EMLDAOptimizer)
@@ -220,24 +223,18 @@ class LDA private (
220223

221224
/**
222225
* Set the LDAOptimizer used to perform the actual calculation by algorithm name.
223-
* Currently "EM" is supported.
226+
* Currently only "em" is supported; the name is matched case-insensitively.
224227
*/
225228
def setOptimizer(optimizerName: String): this.type = {
226-
this.ldaOptimizer = getDefaultOptimizer(optimizerName)
229+
this.ldaOptimizer =
230+
optimizerName.toLowerCase match {
231+
case "em" => new EMLDAOptimizer
232+
case other =>
233+
throw new IllegalArgumentException(s"Only em is supported but got $other.")
234+
}
227235
this
228236
}
229237

230-
/**
231-
* Get the default optimizer from String parameter.
232-
*/
233-
private def getDefaultOptimizer(optimizerName: String): LDAOptimizer = {
234-
optimizerName match{
235-
case "EM" => new EMLDAOptimizer()
236-
case other =>
237-
throw new UnsupportedOperationException(s"Only EM are supported but got $other.")
238-
}
239-
}
240-
241238
/**
242239
* Learn an LDA model using the given dataset.
243240
*

mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ package org.apache.spark.mllib.clustering
2020
import java.util.Random
2121

2222
import breeze.linalg.{DenseVector => BDV, normalize}
23+
2324
import org.apache.spark.annotation.Experimental
2425
import org.apache.spark.graphx._
2526
import org.apache.spark.graphx.impl.GraphImpl
@@ -30,13 +31,20 @@ import org.apache.spark.rdd.RDD
3031
/**
3132
* :: Experimental ::
3233
*
33-
* An LDAOptimizer contains an algorithm for LDA and performs the actual computation, which
34-
* stores internal data structure (Graph or Matrix) and any other parameter for the algorithm.
35-
* The interface is isolated to improve the extensibility of LDA.
34+
* An LDAOptimizer specifies which optimization/learning/inference algorithm to use, and it can
35+
* hold optimizer-specific parameters for users to set.
3636
*/
3737
@Experimental
3838
trait LDAOptimizer{
3939

40+
/*
41+
DEVELOPERS NOTE:
42+
43+
An LDAOptimizer contains an algorithm for LDA and performs the actual computation. It also
44+
stores the internal data structures (Graph or Matrix) and other parameters for the algorithm.
45+
The interface is isolated to improve the extensibility of LDA.
46+
*/
47+
4048
/**
4149
* Initializer for the optimizer. LDA passes the common parameters to the optimizer and
4250
* the internal structure can be initialized properly.
@@ -75,6 +83,7 @@ trait LDAOptimizer{
7583
class EMLDAOptimizer extends LDAOptimizer{
7684

7785
import LDA._
86+
7887
/**
7988
* The following fields are initialized only through the initialState method.
8089
*/

0 commit comments

Comments
 (0)