diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
index e459367333d26..bc27b1fe7390b 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
@@ -62,6 +62,7 @@ class GaussianMixture private (
   /**
    * Constructs a default instance. The default parameters are {k: 2, convergenceTol: 0.01,
    * maxIterations: 100, seed: random}.
+   * @since 1.3.0
    */
   def this() = this(2, 0.01, 100, Utils.random.nextLong())
 
@@ -72,9 +73,11 @@ class GaussianMixture private (
   // default random starting point
   private var initialModel: Option[GaussianMixtureModel] = None
 
-  /** Set the initial GMM starting point, bypassing the random initialization.
-    * You must call setK() prior to calling this method, and the condition
-    * (model.k == this.k) must be met; failure will result in an IllegalArgumentException
+  /**
+   * Set the initial GMM starting point, bypassing the random initialization.
+   * You must call setK() prior to calling this method, and the condition
+   * (model.k == this.k) must be met; failure will result in an IllegalArgumentException
+   * @since 1.3.0
    */
   def setInitialModel(model: GaussianMixtureModel): this.type = {
     if (model.k == k) {
@@ -85,30 +88,46 @@ class GaussianMixture private (
     this
   }
 
-  /** Return the user supplied initial GMM, if supplied */
+  /**
+   * Return the user supplied initial GMM, if supplied
+   * @since 1.3.0
+   */
   def getInitialModel: Option[GaussianMixtureModel] = initialModel
 
-  /** Set the number of Gaussians in the mixture model. Default: 2 */
+  /**
+   * Set the number of Gaussians in the mixture model. Default: 2
+   * @since 1.3.0
+   */
   def setK(k: Int): this.type = {
     this.k = k
     this
   }
 
-  /** Return the number of Gaussians in the mixture model */
+  /**
+   * Return the number of Gaussians in the mixture model
+   * @since 1.3.0
+   */
   def getK: Int = k
 
-  /** Set the maximum number of iterations to run. Default: 100 */
+  /**
+   * Set the maximum number of iterations to run. Default: 100
+   * @since 1.3.0
+   */
   def setMaxIterations(maxIterations: Int): this.type = {
     this.maxIterations = maxIterations
     this
   }
 
-  /** Return the maximum number of iterations to run */
+  /**
+   * Return the maximum number of iterations to run
+   * @since 1.3.0
+   */
   def getMaxIterations: Int = maxIterations
 
   /**
    * Set the largest change in log-likelihood at which convergence is
    * considered to have occurred.
+   * @since 1.3.0
    */
   def setConvergenceTol(convergenceTol: Double): this.type = {
     this.convergenceTol = convergenceTol
@@ -118,19 +137,29 @@ class GaussianMixture private (
   /**
    * Return the largest change in log-likelihood at which convergence is
    * considered to have occurred.
+   * @since 1.3.0
    */
   def getConvergenceTol: Double = convergenceTol
 
-  /** Set the random seed */
+  /**
+   * Set the random seed
+   * @since 1.3.0
+   */
  def setSeed(seed: Long): this.type = {
     this.seed = seed
     this
   }
 
-  /** Return the random seed */
+  /**
+   * Return the random seed
+   * @since 1.3.0
+   */
   def getSeed: Long = seed
 
-  /** Perform expectation maximization */
+  /**
+   * Perform expectation maximization
+   * @since 1.3.0
+   */
   def run(data: RDD[Vector]): GaussianMixtureModel = {
     val sc = data.sparkContext
 
@@ -204,7 +233,10 @@ class GaussianMixture private (
     new GaussianMixtureModel(weights, gaussians)
   }
 
-  /** Java-friendly version of [[run()]] */
+  /**
+   * Java-friendly version of [[run()]]
+   * @since 1.3.0
+   */
   def run(data: JavaRDD[Vector]): GaussianMixtureModel = run(data.rdd)
 
   private def updateWeightsAndGaussians(
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
index 05c52002fe922..a4da485a42ba1 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
@@ -168,9 +168,6 @@ object GaussianMixtureModel extends Loader[GaussianMixtureModel] {
       sc.parallelize(dataArray, 1).toDF().write.parquet(Loader.dataPath(path))
     }
 
-    /**
-     * @since 1.4.0
-     */
     def load(sc: SparkContext, path: String): GaussianMixtureModel = {
       val dataPath = Loader.dataPath(path)
       val sqlContext = new SQLContext(sc)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
index c74de4d139a8c..9ef6834e5ea8d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
@@ -49,15 +49,20 @@ class KMeans private (
   /**
    * Constructs a KMeans instance with default parameters: {k: 2, maxIterations: 20, runs: 1,
    * initializationMode: "k-means||", initializationSteps: 5, epsilon: 1e-4, seed: random}.
+   * @since 0.8.0
    */
   def this() = this(2, 20, 1, KMeans.K_MEANS_PARALLEL, 5, 1e-4, Utils.random.nextLong())
 
   /**
    * Number of clusters to create (k).
+   * @since 1.4.0
    */
   def getK: Int = k
 
-  /** Set the number of clusters to create (k). Default: 2. */
+  /**
+   * Set the number of clusters to create (k). Default: 2.
+   * @since 0.8.0
+   */
   def setK(k: Int): this.type = {
     this.k = k
     this
@@ -65,10 +70,14 @@ class KMeans private (
 
   /**
    * Maximum number of iterations to run.
+   * @since 1.4.0
    */
   def getMaxIterations: Int = maxIterations
 
-  /** Set maximum number of iterations to run. Default: 20. */
+  /**
+   * Set maximum number of iterations to run. Default: 20.
+   * @since 0.8.0
+   */
   def setMaxIterations(maxIterations: Int): this.type = {
     this.maxIterations = maxIterations
     this
@@ -76,6 +85,7 @@ class KMeans private (
 
   /**
    * The initialization algorithm. This can be either "random" or "k-means||".
+   * @since 1.4.0
    */
   def getInitializationMode: String = initializationMode
 
@@ -83,6 +93,7 @@ class KMeans private (
    * Set the initialization algorithm. This can be either "random" to choose random points as
    * initial cluster centers, or "k-means||" to use a parallel variant of k-means++
    * (Bahmani et al., Scalable K-Means++, VLDB 2012). Default: k-means||.
+   * @since 0.8.0
    */
   def setInitializationMode(initializationMode: String): this.type = {
     KMeans.validateInitMode(initializationMode)
@@ -93,6 +104,7 @@ class KMeans private (
   /**
    * :: Experimental ::
    * Number of runs of the algorithm to execute in parallel.
+   * @since 1.4.0
    */
   @Experimental
   def getRuns: Int = runs
@@ -102,6 +114,7 @@ class KMeans private (
    * Set the number of runs of the algorithm to execute in parallel. We initialize the algorithm
    * this many times with random starting conditions (configured by the initialization mode), then
    * return the best clustering found over any run. Default: 1.
+   * @since 0.8.0
    */
   @Experimental
   def setRuns(runs: Int): this.type = {
@@ -114,12 +127,14 @@ class KMeans private (
 
   /**
    * Number of steps for the k-means|| initialization mode
+   * @since 1.4.0
    */
   def getInitializationSteps: Int = initializationSteps
 
   /**
    * Set the number of steps for the k-means|| initialization mode. This is an advanced
    * setting -- the default of 5 is almost always enough. Default: 5.
+   * @since 0.8.0
    */
   def setInitializationSteps(initializationSteps: Int): this.type = {
     if (initializationSteps <= 0) {
@@ -131,12 +146,14 @@ class KMeans private (
 
   /**
    * The distance threshold within which we've consider centers to have converged.
+   * @since 1.4.0
    */
   def getEpsilon: Double = epsilon
 
   /**
    * Set the distance threshold within which we've consider centers to have converged.
    * If all centers move less than this Euclidean distance, we stop iterating one run.
+   * @since 0.8.0
    */
   def setEpsilon(epsilon: Double): this.type = {
     this.epsilon = epsilon
@@ -145,10 +162,14 @@ class KMeans private (
 
   /**
    * The random seed for cluster initialization.
+   * @since 1.4.0
    */
   def getSeed: Long = seed
 
-  /** Set the random seed for cluster initialization. */
+  /**
+   * Set the random seed for cluster initialization.
+   * @since 1.4.0
+   */
   def setSeed(seed: Long): this.type = {
     this.seed = seed
     this
@@ -162,6 +183,7 @@ class KMeans private (
    * Set the initial starting point, bypassing the random initialization or k-means||
    * The condition model.k == this.k must be met, failure results
    * in an IllegalArgumentException.
+   * @since 1.4.0
    */
   def setInitialModel(model: KMeansModel): this.type = {
     require(model.k == k, "mismatched cluster count")
@@ -172,6 +194,7 @@ class KMeans private (
   /**
    * Train a K-means model on the given set of points; `data` should be cached for high
    * performance, because this is an iterative algorithm.
+   * @since 0.8.0
    */
   def run(data: RDD[Vector]): KMeansModel = {
 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
index 21fbcaa6b7c8e..8de2087ceb4df 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
@@ -104,6 +104,10 @@ class KMeansModel (
  * @since 1.4.0
  */
 object KMeansModel extends Loader[KMeansModel] {
+
+  /**
+   * @since 1.4.0
+   */
   override def load(sc: SparkContext, path: String): KMeansModel = {
     KMeansModel.SaveLoadV1_0.load(sc, path)
   }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
index d7c811a69df76..2a8c6acbaec61 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
@@ -55,6 +55,10 @@ class LDA private (
     private var checkpointInterval: Int,
     private var ldaOptimizer: LDAOptimizer) extends Logging {
 
+  /**
+   * Constructs an LDA instance with default parameters.
+   * @since 1.3.0
+   */
   def this() = this(k = 10, maxIterations = 20, docConcentration = Vectors.dense(-1),
     topicConcentration = -1, seed = Utils.random.nextLong(), checkpointInterval = 10,
     ldaOptimizer = new EMLDAOptimizer)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index 82f05e4a18cee..5db75026aab67 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -192,12 +192,24 @@ class LocalLDAModel private[clustering] (
     override protected[clustering] val gammaShape: Double = 100)
   extends LDAModel with Serializable {
 
+  /**
+   * @since 1.3.0
+   */
   override def k: Int = topics.numCols
 
+  /**
+   * @since 1.3.0
+   */
   override def vocabSize: Int = topics.numRows
 
+  /**
+   * @since 1.3.0
+   */
   override def topicsMatrix: Matrix = topics
 
+  /**
+   * @since 1.3.0
+   */
   override def describeTopics(maxTermsPerTopic: Int): Array[(Array[Int], Array[Double])] = {
     val brzTopics = topics.toBreeze.toDenseMatrix
     Range(0, k).map { topicIndex =>
@@ -210,6 +222,9 @@ class LocalLDAModel private[clustering] (
 
   override protected def formatVersion = "1.0"
 
+  /**
+   * @since 1.5.0
+   */
   override def save(sc: SparkContext, path: String): Unit = {
     LocalLDAModel.SaveLoadV1_0.save(sc, path, topicsMatrix, docConcentration, topicConcentration,
       gammaShape)
@@ -223,12 +238,16 @@ class LocalLDAModel private[clustering] (
    *
    * @param documents test corpus to use for calculating log likelihood
    * @return variational lower bound on the log likelihood of the entire corpus
+   * @since 1.3.0
    */
   def logLikelihood(documents: RDD[(Long, Vector)]): Double = logLikelihoodBound(documents,
     docConcentration, topicConcentration, topicsMatrix.toBreeze.toDenseMatrix, gammaShape, k,
     vocabSize)
 
-  /** Java-friendly version of [[logLikelihood]] */
+  /**
+   * Java-friendly version of [[logLikelihood]]
+   * @since 1.5.0
+   */
   def logLikelihood(documents: JavaPairRDD[java.lang.Long, Vector]): Double = {
     logLikelihood(documents.rdd.asInstanceOf[RDD[(Long, Vector)]])
   }
@@ -239,6 +258,7 @@ class LocalLDAModel private[clustering] (
    *
    * @param documents test corpus to use for calculating perplexity
    * @return Variational upper bound on log perplexity per token.
+   * @since 1.5.0
    */
   def logPerplexity(documents: RDD[(Long, Vector)]): Double = {
     val corpusTokenCount = documents
@@ -247,7 +267,9 @@ class LocalLDAModel private[clustering] (
     -logLikelihood(documents) / corpusTokenCount
   }
 
-  /** Java-friendly version of [[logPerplexity]] */
+  /** Java-friendly version of [[logPerplexity]]
+   * @since 1.5.0
+   */
   def logPerplexity(documents: JavaPairRDD[java.lang.Long, Vector]): Double = {
     logPerplexity(documents.rdd.asInstanceOf[RDD[(Long, Vector)]])
   }
@@ -325,6 +347,7 @@ class LocalLDAModel private[clustering] (
    * for each document.
    * @param documents documents to predict topic mixture distributions for
    * @return An RDD of (document ID, topic mixture distribution for document)
+   * @since 1.3.0
    */
   // TODO: declare in LDAModel and override once implemented in DistributedLDAModel
   def topicDistributions(documents: RDD[(Long, Vector)]): RDD[(Long, Vector)] = {
@@ -351,7 +374,10 @@ class LocalLDAModel private[clustering] (
     }
   }
 
-  /** Java-friendly version of [[topicDistributions]] */
+  /**
+   * Java-friendly version of [[topicDistributions]]
+   * @since 1.5.0
+   */
   def topicDistributions(
       documents: JavaPairRDD[java.lang.Long, Vector]): JavaPairRDD[java.lang.Long, Vector] = {
     val distributions = topicDistributions(documents.rdd.asInstanceOf[RDD[(Long, Vector)]])
@@ -425,6 +451,9 @@ object LocalLDAModel extends Loader[LocalLDAModel] {
     }
   }
 
+  /**
+   * @since 1.5.0
+   */
   override def load(sc: SparkContext, path: String): LocalLDAModel = {
     val (loadedClassName, loadedVersion, metadata) = Loader.loadMetadata(sc, path)
     implicit val formats = DefaultFormats
@@ -481,6 +510,7 @@ class DistributedLDAModel private[clustering] (
    * Convert model to a local model.
    * The local model stores the inferred topics but not the topic distributions for training
    * documents.
+   * @since 1.3.0
    */
   def toLocal: LocalLDAModel = new LocalLDAModel(topicsMatrix, docConcentration, topicConcentration,
     gammaShape)
@@ -491,6 +521,7 @@ class DistributedLDAModel private[clustering] (
    * No guarantees are given about the ordering of the topics.
    *
    * WARNING: This matrix is collected from an RDD. Beware memory usage when vocabSize, k are large.
+   * @since 1.3.0
    */
   override lazy val topicsMatrix: Matrix = {
     // Collect row-major topics
@@ -510,6 +541,9 @@ class DistributedLDAModel private[clustering] (
     Matrices.fromBreeze(brzTopics)
   }
 
+  /**
+   * @since 1.3.0
+   */
   override def describeTopics(maxTermsPerTopic: Int): Array[(Array[Int], Array[Double])] = {
     val numTopics = k
     // Note: N_k is not needed to find the top terms, but it is needed to normalize weights
@@ -548,6 +582,7 @@ class DistributedLDAModel private[clustering] (
    * @return Array over topics. Each element represent as a pair of matching arrays:
    *         (IDs for the documents, weights of the topic in these documents).
    *         For each topic, documents are sorted in order of decreasing topic weights.
+   * @since 1.5.0
    */
   def topDocumentsPerTopic(maxDocumentsPerTopic: Int): Array[(Array[Long], Array[Double])] = {
     val numTopics = k
@@ -587,6 +622,7 @@ class DistributedLDAModel private[clustering] (
    *  - This excludes the prior; for that, use [[logPrior]].
    *  - Even with [[logPrior]], this is NOT the same as the data log likelihood given the
    *    hyperparameters.
+   * @since 1.3.0
    */
   lazy val logLikelihood: Double = {
     // TODO: generalize this for asymmetric (non-scalar) alpha
@@ -612,7 +648,8 @@ class DistributedLDAModel private[clustering] (
 
   /**
    * Log probability of the current parameter estimate:
-   *  log P(topics, topic distributions for docs | alpha, eta)
+   * log P(topics, topic distributions for docs | alpha, eta)
+   * @since 1.3.0
    */
   lazy val logPrior: Double = {
     // TODO: generalize this for asymmetric (non-scalar) alpha
@@ -644,6 +681,7 @@ class DistributedLDAModel private[clustering] (
    * ("theta_doc").
    *
    * @return RDD of (document ID, topic distribution) pairs
+   * @since 1.3.0
    */
   def topicDistributions: RDD[(Long, Vector)] = {
     graph.vertices.filter(LDA.isDocumentVertex).map { case (docID, topicCounts) =>
@@ -651,7 +689,10 @@ class DistributedLDAModel private[clustering] (
     }
   }
 
-  /** Java-friendly version of [[topicDistributions]] */
+  /**
+   * Java-friendly version of [[topicDistributions]]
+   * @since 1.5.0
+   */
   def javaTopicDistributions: JavaPairRDD[java.lang.Long, Vector] = {
     JavaPairRDD.fromRDD(topicDistributions.asInstanceOf[RDD[(java.lang.Long, Vector)]])
   }
@@ -659,6 +700,7 @@ class DistributedLDAModel private[clustering] (
   /**
    * For each document, return the top k weighted topics for that document and their weights.
    * @return RDD of (doc ID, topic indices, topic weights)
+   * @since 1.5.0
    */
   def topTopicsPerDocument(k: Int): RDD[(Long, Array[Int], Array[Double])] = {
     graph.vertices.filter(LDA.isDocumentVertex).map { case (docID, topicCounts) =>
@@ -673,7 +715,10 @@ class DistributedLDAModel private[clustering] (
     }
   }
 
-  /** Java-friendly version of [[topTopicsPerDocument]] */
+  /**
+   * Java-friendly version of [[topTopicsPerDocument]]
+   * @since 1.5.0
+   */
  def javaTopTopicsPerDocument(k: Int): JavaRDD[(java.lang.Long, Array[Int], Array[Double])] = {
     val topics = topTopicsPerDocument(k)
     topics.asInstanceOf[RDD[(java.lang.Long, Array[Int], Array[Double])]].toJavaRDD()
@@ -684,6 +729,10 @@ class DistributedLDAModel private[clustering] (
 
   override protected def formatVersion = "1.0"
 
+  /**
+   * Save this model to the given path.
+   * @since 1.5.0
+   */
   override def save(sc: SparkContext, path: String): Unit = {
     DistributedLDAModel.SaveLoadV1_0.save(
       sc, path, graph, globalTopicTotals, k, vocabSize, docConcentration, topicConcentration,
@@ -784,6 +833,9 @@ object DistributedLDAModel extends Loader[DistributedLDAModel] {
 
   }
 
+  /**
+   * @since 1.5.0
+   */
   override def load(sc: SparkContext, path: String): DistributedLDAModel = {
     val (loadedClassName, loadedVersion, metadata) = Loader.loadMetadata(sc, path)
     implicit val formats = DefaultFormats
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
index 66ec544d07917..b4733ca975152 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
@@ -133,13 +133,16 @@ class PowerIterationClustering private[clustering] (
 
   import org.apache.spark.mllib.clustering.PowerIterationClustering._
 
-  /** Constructs a PIC instance with default parameters: {k: 2, maxIterations: 100,
-    * initMode: "random"}.
+  /**
+   * Constructs a PIC instance with default parameters: {k: 2, maxIterations: 100,
+   * initMode: "random"}.
+   * @since 1.3.0
    */
   def this() = this(k = 2, maxIterations = 100, initMode = "random")
 
   /**
    * Set the number of clusters.
+   * @since 1.3.0
    */
   def setK(k: Int): this.type = {
     this.k = k
@@ -148,6 +151,7 @@ class PowerIterationClustering private[clustering] (
 
   /**
    * Set maximum number of iterations of the power iteration loop
+   * @since 1.3.0
    */
   def setMaxIterations(maxIterations: Int): this.type = {
     this.maxIterations = maxIterations
@@ -157,6 +161,7 @@ class PowerIterationClustering private[clustering] (
   /**
    * Set the initialization mode. This can be either "random" to use a random vector
    * as vertex properties, or "degree" to use normalized sum similarities. Default: random.
+   * @since 1.3.0
    */
   def setInitializationMode(mode: String): this.type = {
     this.initMode = mode match {
@@ -199,6 +204,7 @@ class PowerIterationClustering private[clustering] (
    * assume s,,ij,, = 0.0.
    *
    * @return a [[PowerIterationClusteringModel]] that contains the clustering result
+   * @since 1.3.0
    */
   def run(similarities: RDD[(Long, Long, Double)]): PowerIterationClusteringModel = {
     val w = normalize(similarities)
@@ -211,6 +217,7 @@ class PowerIterationClustering private[clustering] (
 
   /**
    * A Java-friendly version of [[PowerIterationClustering.run]].
+   * @since 1.3.0
    */
   def run(similarities: JavaRDD[(java.lang.Long, java.lang.Long, java.lang.Double)])
     : PowerIterationClusteringModel = {
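
A minimal usage sketch (not part of the patch) of the builder-style API whose methods are annotated above. The class and method names come from the diff; the input file path and parameter values are illustrative only.

```scala
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.{GaussianMixture, KMeans}
import org.apache.spark.mllib.linalg.Vectors

object ClusteringSinceTagsExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("clustering-example"))

    // Parse whitespace-separated feature vectors; cache the RDD because both
    // algorithms below are iterative (see the run() docs in the diff).
    val data = sc.textFile("data/mllib/kmeans_data.txt")
      .map(line => Vectors.dense(line.split(' ').map(_.toDouble)))
      .cache()

    // KMeans setters annotated @since 0.8.0 / 1.4.0 in the patch.
    val kmeansModel = new KMeans()
      .setK(2)
      .setMaxIterations(20)
      .setSeed(1L)
      .run(data)
    println(s"k-means cost: ${kmeansModel.computeCost(data)}")

    // GaussianMixture setters annotated @since 1.3.0 in the patch.
    val gmm = new GaussianMixture()
      .setK(2)
      .setConvergenceTol(0.01)
      .setSeed(1L)
      .run(data)
    gmm.gaussians.zip(gmm.weights).foreach { case (g, w) =>
      println(s"weight=$w mu=${g.mu} sigma=${g.sigma}")
    }

    sc.stop()
  }
}
```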