Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ class GaussianMixture private (
/**
 * Constructs a default instance. The default parameters are {k: 2, convergenceTol: 0.01,
 * maxIterations: 100, seed: random}.
 *
 * The seed argument is obtained from `Utils.random.nextLong()` each time this constructor
 * runs; call `setSeed` afterwards to use a fixed value instead.
 * @since 1.3.0
 */
def this() = this(2, 0.01, 100, Utils.random.nextLong())

Expand All @@ -72,9 +73,11 @@ class GaussianMixture private (
// default random starting point
private var initialModel: Option[GaussianMixtureModel] = None

/** Set the initial GMM starting point, bypassing the random initialization.
* You must call setK() prior to calling this method, and the condition
* (model.k == this.k) must be met; failure will result in an IllegalArgumentException
/**
* Set the initial GMM starting point, bypassing the random initialization.
* You must call setK() prior to calling this method, and the condition
* (model.k == this.k) must be met; failure will result in an IllegalArgumentException
* @since 1.3.0
*/
def setInitialModel(model: GaussianMixtureModel): this.type = {
if (model.k == k) {
Expand All @@ -85,30 +88,46 @@ class GaussianMixture private (
this
}

/**
 * Return the user supplied initial GMM, if supplied.
 *
 * @return the stored initial model wrapped in an `Option`; `None` is the field's initial value
 * @since 1.3.0
 */
def getInitialModel: Option[GaussianMixtureModel] = this.initialModel

/**
 * Set the number of Gaussians in the mixture model. Default: 2
 *
 * @param k number of Gaussians; must be strictly positive
 * @return this instance, so that setter calls can be chained
 * @throws IllegalArgumentException if `k` is not positive
 * @since 1.3.0
 */
def setK(k: Int): this.type = {
  // Validate eagerly so an invalid value is reported at the call site.
  require(k > 0, s"Number of Gaussians must be positive but got $k")
  this.k = k
  this
}

/**
 * Return the number of Gaussians in the mixture model.
 *
 * @return the currently configured value of `k`
 * @since 1.3.0
 */
def getK: Int = this.k

/**
 * Set the maximum number of iterations to run. Default: 100
 *
 * @param maxIterations upper bound on the number of iterations; must be non-negative
 * @return this instance, so that setter calls can be chained
 * @throws IllegalArgumentException if `maxIterations` is negative
 * @since 1.3.0
 */
def setMaxIterations(maxIterations: Int): this.type = {
  // A negative iteration budget is never meaningful; reject it at the call site.
  require(maxIterations >= 0,
    s"Maximum of iterations must be nonnegative but got $maxIterations")
  this.maxIterations = maxIterations
  this
}

/**
 * Return the maximum number of iterations to run.
 *
 * @return the currently configured iteration limit
 * @since 1.3.0
 */
def getMaxIterations: Int = this.maxIterations

/**
* Set the largest change in log-likelihood at which convergence is
* considered to have occurred.
* @since 1.3.0
*/
def setConvergenceTol(convergenceTol: Double): this.type = {
this.convergenceTol = convergenceTol
Expand All @@ -118,19 +137,29 @@ class GaussianMixture private (
/**
 * Return the largest change in log-likelihood at which convergence is
 * considered to have occurred.
 *
 * @return the currently configured convergence tolerance
 * @since 1.3.0
 */
def getConvergenceTol: Double = this.convergenceTol

/**
 * Set the random seed.
 *
 * @param seed value stored as this instance's seed
 * @return this instance, so that setter calls can be chained
 * @since 1.3.0
 */
def setSeed(seed: Long): this.type = { this.seed = seed; this }

/**
 * Return the random seed.
 *
 * @return the currently configured seed
 * @since 1.3.0
 */
def getSeed: Long = this.seed

/** Perform expectation maximization */
/**
* Perform expectation maximization
* @since 1.3.0
*/
def run(data: RDD[Vector]): GaussianMixtureModel = {
val sc = data.sparkContext

Expand Down Expand Up @@ -204,7 +233,10 @@ class GaussianMixture private (
new GaussianMixtureModel(weights, gaussians)
}

/**
 * Java-friendly version of [[run()]].
 *
 * Unwraps the underlying RDD via `data.rdd` and delegates to the RDD-based overload.
 *
 * @param data input vectors wrapped in a `JavaRDD`
 * @return the model produced by the RDD-based `run`
 * @since 1.3.0
 */
def run(data: JavaRDD[Vector]): GaussianMixtureModel = {
  val underlying = data.rdd
  run(underlying)
}

private def updateWeightsAndGaussians(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,9 +168,6 @@ object GaussianMixtureModel extends Loader[GaussianMixtureModel] {
sc.parallelize(dataArray, 1).toDF().write.parquet(Loader.dataPath(path))
}

/**
* @since 1.4.0
*/
def load(sc: SparkContext, path: String): GaussianMixtureModel = {
val dataPath = Loader.dataPath(path)
val sqlContext = new SQLContext(sc)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,40 +49,51 @@ class KMeans private (
/**
 * Constructs a KMeans instance with default parameters: {k: 2, maxIterations: 20, runs: 1,
 * initializationMode: "k-means||", initializationSteps: 5, epsilon: 1e-4, seed: random}.
 *
 * The initialization mode is passed as `KMeans.K_MEANS_PARALLEL`, and the seed is taken from
 * `Utils.random.nextLong()` each time this constructor runs.
 * @since 0.8.0
 */
def this() = this(2, 20, 1, KMeans.K_MEANS_PARALLEL, 5, 1e-4, Utils.random.nextLong())

/**
 * Number of clusters to create (k).
 *
 * @return the currently configured value of `k`
 * @since 1.4.0
 */
def getK: Int = this.k

/**
 * Set the number of clusters to create (k). Default: 2.
 *
 * @param k number of clusters; must be strictly positive
 * @return this instance, so that setter calls can be chained
 * @throws IllegalArgumentException if `k` is not positive
 * @since 0.8.0
 */
def setK(k: Int): this.type = {
  // Validate eagerly, matching the argument checks done by the other setters in this class.
  require(k > 0, s"Number of clusters must be positive but got $k")
  this.k = k
  this
}

/**
 * Maximum number of iterations to run.
 *
 * @return the currently configured iteration limit
 * @since 1.4.0
 */
def getMaxIterations: Int = this.maxIterations

/**
 * Set maximum number of iterations to run. Default: 20.
 *
 * @param maxIterations upper bound on the number of iterations; must be non-negative
 * @return this instance, so that setter calls can be chained
 * @throws IllegalArgumentException if `maxIterations` is negative
 * @since 0.8.0
 */
def setMaxIterations(maxIterations: Int): this.type = {
  // A negative iteration budget is never meaningful; reject it at the call site.
  require(maxIterations >= 0,
    s"Maximum of iterations must be nonnegative but got $maxIterations")
  this.maxIterations = maxIterations
  this
}

/**
 * The initialization algorithm. This can be either "random" or "k-means||".
 *
 * @return the currently configured initialization mode
 * @since 1.4.0
 */
def getInitializationMode: String = {
  initializationMode
}

/**
* Set the initialization algorithm. This can be either "random" to choose random points as
* initial cluster centers, or "k-means||" to use a parallel variant of k-means++
* (Bahmani et al., Scalable K-Means++, VLDB 2012). Default: k-means||.
* @since 0.8.0
*/
def setInitializationMode(initializationMode: String): this.type = {
KMeans.validateInitMode(initializationMode)
Expand All @@ -93,6 +104,7 @@ class KMeans private (
/**
 * :: Experimental ::
 * Number of runs of the algorithm to execute in parallel.
 *
 * @return the currently configured number of runs
 * @since 1.4.0
 */
@Experimental
def getRuns: Int = {
  runs
}
Expand All @@ -102,6 +114,7 @@ class KMeans private (
* Set the number of runs of the algorithm to execute in parallel. We initialize the algorithm
* this many times with random starting conditions (configured by the initialization mode), then
* return the best clustering found over any run. Default: 1.
* @since 0.8.0
*/
@Experimental
def setRuns(runs: Int): this.type = {
Expand All @@ -114,12 +127,14 @@ class KMeans private (

/**
 * Number of steps for the k-means|| initialization mode.
 *
 * @return the currently configured number of initialization steps
 * @since 1.4.0
 */
def getInitializationSteps: Int = {
  initializationSteps
}

/**
* Set the number of steps for the k-means|| initialization mode. This is an advanced
* setting -- the default of 5 is almost always enough. Default: 5.
* @since 0.8.0
*/
def setInitializationSteps(initializationSteps: Int): this.type = {
if (initializationSteps <= 0) {
Expand All @@ -131,12 +146,14 @@ class KMeans private (

/**
 * The distance threshold within which we consider centers to have converged.
 *
 * @return the currently configured convergence threshold
 * @since 1.4.0
 */
def getEpsilon: Double = this.epsilon

/**
* Set the distance threshold within which we consider centers to have converged.
* If all centers move less than this Euclidean distance, we stop iterating one run.
* @since 0.8.0
*/
def setEpsilon(epsilon: Double): this.type = {
this.epsilon = epsilon
Expand All @@ -145,10 +162,14 @@ class KMeans private (

/**
 * The random seed for cluster initialization.
 *
 * @return the currently configured seed
 * @since 1.4.0
 */
def getSeed: Long = this.seed

/** Set the random seed for cluster initialization. */
/**
* Set the random seed for cluster initialization.
* @since 1.4.0
*/
def setSeed(seed: Long): this.type = {
this.seed = seed
this
Expand All @@ -162,6 +183,7 @@ class KMeans private (
* Set the initial starting point, bypassing the random initialization or k-means||
* The condition model.k == this.k must be met, failure results
* in an IllegalArgumentException.
* @since 1.4.0
*/
def setInitialModel(model: KMeansModel): this.type = {
require(model.k == k, "mismatched cluster count")
Expand All @@ -172,6 +194,7 @@ class KMeans private (
/**
* Train a K-means model on the given set of points; `data` should be cached for high
* performance, because this is an iterative algorithm.
* @since 0.8.0
*/
def run(data: RDD[Vector]): KMeansModel = {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,10 @@ class KMeansModel (
* @since 1.4.0
*/
object KMeansModel extends Loader[KMeansModel] {

/**
 * Delegates to `KMeansModel.SaveLoadV1_0.load` with the given context and path.
 *
 * @param sc Spark context forwarded to the loader
 * @param path location forwarded to the loader
 * @return the model returned by `SaveLoadV1_0.load`
 * @since 1.4.0
 */
override def load(sc: SparkContext, path: String): KMeansModel =
  KMeansModel.SaveLoadV1_0.load(sc, path)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@ class LDA private (
private var checkpointInterval: Int,
private var ldaOptimizer: LDAOptimizer) extends Logging {

/**
 * Constructs an LDA instance with default parameters: {k: 10, maxIterations: 20,
 * docConcentration: Vectors.dense(-1), topicConcentration: -1, seed: random,
 * checkpointInterval: 10, ldaOptimizer: a new EMLDAOptimizer}.
 *
 * The seed is taken from `Utils.random.nextLong()` each time this constructor runs.
 * NOTE(review): the -1 values for docConcentration and topicConcentration are presumably
 * sentinels meaning "choose automatically" -- confirm against the corresponding setters.
 * @since 1.3.0
 */
def this() = this(k = 10, maxIterations = 20, docConcentration = Vectors.dense(-1),
topicConcentration = -1, seed = Utils.random.nextLong(), checkpointInterval = 10,
ldaOptimizer = new EMLDAOptimizer)
Expand Down
Loading