From f94a3d7190911804309a7941ade5d5e96a3c2028 Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Wed, 11 Mar 2015 12:26:19 +0800 Subject: [PATCH 1/2] add get for KMeans --- .../spark/mllib/clustering/KMeans.scala | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala index 11633e824231..88bfbbb953e8 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala @@ -52,18 +52,33 @@ class KMeans private ( */ def this() = this(2, 20, 1, KMeans.K_MEANS_PARALLEL, 5, 1e-4, Utils.random.nextLong()) + /** + * Number of clusters to create (k). + */ + def getK: Int = k + /** Set the number of clusters to create (k). Default: 2. */ def setK(k: Int): this.type = { this.k = k this } + /** + * Maximum number of iterations to run. + */ + def getMaxIterations: Int = maxIterations + /** Set maximum number of iterations to run. Default: 20. */ def setMaxIterations(maxIterations: Int): this.type = { this.maxIterations = maxIterations this } + /** + * The initialization algorithm. This can be either "random" or "k-means||". + */ + def getInitializationMode: String = initializationMode + /** * Set the initialization algorithm. This can be either "random" to choose random points as * initial cluster centers, or "k-means||" to use a parallel variant of k-means++ @@ -77,6 +92,11 @@ class KMeans private ( this } + /** + * Number of runs of the algorithm to execute in parallel. + */ + def getRuns: Int = runs + /** * :: Experimental :: * Set the number of runs of the algorithm to execute in parallel. 
We initialize the algorithm @@ -92,6 +112,11 @@ class KMeans private ( this } + /** + * Number of steps for the k-means|| initialization mode. + */ + def getInitializationSteps: Int = initializationSteps + /** * Set the number of steps for the k-means|| initialization mode. This is an advanced * setting -- the default of 5 is almost always enough. Default: 5. @@ -104,6 +129,11 @@ class KMeans private ( this } + /** + * The distance threshold within which we consider centers to have converged. + */ + def getEpsilon: Double = epsilon + /** * Set the distance threshold within which we've consider centers to have converged. * If all centers move less than this Euclidean distance, we stop iterating one run. @@ -113,6 +143,11 @@ class KMeans private ( this } + /** + * The random seed for cluster initialization. + */ + def getSeed: Long = seed + /** Set the random seed for cluster initialization. */ def setSeed(seed: Long): this.type = { this.seed = seed From f44d4dc38947b30762de359958f6f55f07eb6d6b Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Wed, 11 Mar 2015 16:07:46 +0800 Subject: [PATCH 2/2] add experimental to getRuns --- .../main/scala/org/apache/spark/mllib/clustering/KMeans.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala index 88bfbbb953e8..e41f941fd2c2 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala @@ -93,8 +93,10 @@ class KMeans private ( } /** + * :: Experimental :: * Number of runs of the algorithm to execute in parallel. */ + @Experimental def getRuns: Int = runs /**