@@ -19,14 +19,14 @@ package org.apache.spark.mllib.clustering
1919
2020import scala .collection .mutable .ArrayBuffer
2121
22- import org .apache .spark .annotation .Experimental
2322import org .apache .spark .Logging
24- import org .apache .spark .SparkContext . _
23+ import org .apache .spark .annotation . Experimental
2524import org .apache .spark .mllib .linalg .{Vector , Vectors }
2625import org .apache .spark .mllib .linalg .BLAS .{axpy , scal }
2726import org .apache .spark .mllib .util .MLUtils
2827import org .apache .spark .rdd .RDD
2928import org .apache .spark .storage .StorageLevel
29+ import org .apache .spark .util .Utils
3030import org .apache .spark .util .random .XORShiftRandom
3131
3232/**
@@ -48,9 +48,9 @@ class KMeans private (
4848
4949 /**
5050 * Constructs a KMeans instance with default parameters: {k: 2, maxIterations: 20, runs: 1,
51- * initializationMode: "k-means||", initializationSteps: 5, epsilon: 1e-4, System.nanoTime() }.
51+ * initializationMode: "k-means||", initializationSteps: 5, epsilon: 1e-4, seed: random }.
5252 */
53- def this () = this (2 , 20 , 1 , KMeans .K_MEANS_PARALLEL , 5 , 1e-4 , System .nanoTime ())
53+ def this () = this (2 , 20 , 1 , KMeans .K_MEANS_PARALLEL , 5 , 1e-4 , Utils .random.nextLong ())
5454
5555 /** Set the number of clusters to create (k). Default: 2. */
5656 def setK (k : Int ): this .type = {
@@ -345,17 +345,20 @@ object KMeans {
345345 * @param maxIterations max number of iterations
346346 * @param runs number of parallel runs, defaults to 1. The best model is returned.
347347 * @param initializationMode initialization model, either "random" or "k-means||" (default).
348+ * @param seed random seed value for cluster initialization
348349 */
349350 def train (
350351 data : RDD [Vector ],
351352 k : Int ,
352353 maxIterations : Int ,
353354 runs : Int ,
354- initializationMode : String ): KMeansModel = {
355+ initializationMode : String ,
356+ seed : Long ): KMeansModel = {
355357 new KMeans ().setK(k)
356358 .setMaxIterations(maxIterations)
357359 .setRuns(runs)
358360 .setInitializationMode(initializationMode)
361+ .setSeed(seed)
359362 .run(data)
360363 }
361364
@@ -367,20 +370,17 @@ object KMeans {
367370 * @param maxIterations max number of iterations
368371 * @param runs number of parallel runs, defaults to 1. The best model is returned.
369372 * @param initializationMode initialization model, either "random" or "k-means||" (default).
370- * @param seed random seed value for cluster initialization
371373 */
372374 def train (
373375 data : RDD [Vector ],
374376 k : Int ,
375377 maxIterations : Int ,
376378 runs : Int ,
377- initializationMode : String ,
378- seed : Long ): KMeansModel = {
379+ initializationMode : String ): KMeansModel = {
379380 new KMeans ().setK(k)
380381 .setMaxIterations(maxIterations)
381382 .setRuns(runs)
382383 .setInitializationMode(initializationMode)
383- .setSeed(seed)
384384 .run(data)
385385 }
386386
0 commit comments