apache · mengxr · May 29, 2015 · May 29, 2015 · May 29, 2015
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala
@@ -70,7 +70,7 @@ private[ml] trait OneVsRestParams extends PredictorParams {
 final class OneVsRestModel private[ml] (
     override val uid: String,
     labelMetadata: Metadata,
-    val models: Array[_ <: ClassificationModel[_,_]])
+    val models: Array[_ <: ClassificationModel[_, _]])
   extends Model[OneVsRestModel] with OneVsRestParams {
 
   override def transformSchema(schema: StructType): StructType = {
@@ -104,17 +104,17 @@ final class OneVsRestModel private[ml] (
 
         // add temporary column to store intermediate scores and update
         val tmpColName = "mbc$tmp" + UUID.randomUUID().toString
-        val update: (Map[Int, Double], Vector) => Map[Int, Double]  =
+        val update: (Map[Int, Double], Vector) => Map[Int, Double] =
           (predictions: Map[Int, Double], prediction: Vector) => {
             predictions + ((index, prediction(1)))
           }
         val updateUdf = callUDF(update, mapType, col(accColName), col(rawPredictionCol))
-        val transformedDataset = model.transform(df).select(columns:_*)
+        val transformedDataset = model.transform(df).select(columns : _*)
         val updatedDataset = transformedDataset.withColumn(tmpColName, updateUdf)
         val newColumns = origCols ++ List(col(tmpColName))
 
         // switch out the intermediate column with the accumulator column
-        updatedDataset.select(newColumns:_*).withColumnRenamed(tmpColName, accColName)
+        updatedDataset.select(newColumns : _*).withColumnRenamed(tmpColName, accColName)
     }
 
     if (handlePersistence) {
@@ -190,7 +190,7 @@ final class OneVsRest(override val uid: String)
       val trainingDataset = multiclassLabeled.withColumn(labelColName, labelUDFWithNewMeta)
       val classifier = getClassifier
       classifier.fit(trainingDataset, classifier.labelCol -> labelColName)
-    }.toArray[ClassificationModel[_,_]]
+    }.toArray[ClassificationModel[_, _]]
 
     if (handlePersistence) {
       multiclassLabeled.unpersist()

diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala
@@ -159,9 +159,9 @@ final class InternalNode private[ml] (
 
   override private[tree] def subtreeToString(indentFactor: Int = 0): String = {
     val prefix: String = " " * indentFactor
-    prefix + s"If (${InternalNode.splitToString(split, left=true)})\n" +
+    prefix + s"If (${InternalNode.splitToString(split, left = true)})\n" +
       leftChild.subtreeToString(indentFactor + 1) +
-      prefix + s"Else (${InternalNode.splitToString(split, left=false)})\n" +
+      prefix + s"Else (${InternalNode.splitToString(split, left = false)})\n" +
       rightChild.subtreeToString(indentFactor + 1)
   }
 

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -392,7 +392,7 @@ private[python] class PythonMLLibAPI extends Serializable {
       data: JavaRDD[Vector],
       wt: Vector,
       mu: Array[Object],
-      si: Array[Object]):  RDD[Vector]  = {
+      si: Array[Object]): RDD[Vector] = {
 
       val weight = wt.toArray
       val mean = mu.map(_.asInstanceOf[DenseVector])
@@ -428,7 +428,7 @@ private[python] class PythonMLLibAPI extends Serializable {
 
     if (seed != null) als.setSeed(seed)
 
-    val model =  als.run(ratingsJRDD.rdd)
+    val model = als.run(ratingsJRDD.rdd)
     new MatrixFactorizationModelWrapper(model)
   }
 
@@ -459,7 +459,7 @@ private[python] class PythonMLLibAPI extends Serializable {
 
     if (seed != null) als.setSeed(seed)
 
-    val model =  als.run(ratingsJRDD.rdd)
+    val model = als.run(ratingsJRDD.rdd)
     new MatrixFactorizationModelWrapper(model)
   }
 
@@ -1242,7 +1242,7 @@ private[spark] object SerDe extends Serializable {
   }
 
   /* convert RDD[Tuple2[,]] to RDD[Array[Any]] */
-  def fromTuple2RDD(rdd: RDD[(Any, Any)]): RDD[Array[Any]]  = {
+  def fromTuple2RDD(rdd: RDD[(Any, Any)]): RDD[Array[Any]] = {
     rdd.map(x => Array(x._1, x._2))
   }
 

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
@@ -211,7 +211,7 @@ class GaussianMixture private (
 private object ExpectationSum {
   def zero(k: Int, d: Int): ExpectationSum = {
     new ExpectationSum(0.0, Array.fill(k)(0.0), 
-      Array.fill(k)(BDV.zeros(d)), Array.fill(k)(BreezeMatrix.zeros(d,d)))
+      Array.fill(k)(BDV.zeros(d)), Array.fill(k)(BreezeMatrix.zeros(d, d)))
   }
 
   // compute cluster contributions for each input point

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
@@ -273,7 +273,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
    * Default: 1024, following the original Online LDA paper.
    */
   def setTau0(tau0: Double): this.type = {
-    require(tau0 > 0,  s"LDA tau0 must be positive, but was set to $tau0")
+    require(tau0 > 0, s"LDA tau0 must be positive, but was set to $tau0")
     this.tau0 = tau0
     this
   }
@@ -339,7 +339,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
 
   override private[clustering] def initialize(
       docs: RDD[(Long, Vector)],
-      lda: LDA):  OnlineLDAOptimizer = {
+      lda: LDA): OnlineLDAOptimizer = {
     this.k = lda.getK
     this.corpusSize = docs.count()
     this.vocabSize = docs.first()._2.size
@@ -458,7 +458,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
    * uses digamma which is accurate but expensive.
    */
   private def dirichletExpectation(alpha: BDM[Double]): BDM[Double] = {
-    val rowSum =  sum(alpha(breeze.linalg.*, ::))
+    val rowSum = sum(alpha(breeze.linalg.*, ::))
     val digAlpha = digamma(alpha)
     val digRowSum = digamma(rowSum)
     val result = digAlpha(::, breeze.linalg.*) - digRowSum

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
@@ -144,7 +144,7 @@ private object IDF {
          * Since arrays are initialized to 0 by default,
          * we just omit changing those entries.
          */
-        if(df(j) >= minDocFreq) {
+        if (df(j) >= minDocFreq) {
           inv(j) = math.log((m + 1.0) / (df(j) + 1.0))
         }
         j += 1

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala
@@ -90,7 +90,7 @@ class StandardScalerModel (
 
   @DeveloperApi
   def setWithMean(withMean: Boolean): this.type = {
-    require(!(withMean && this.mean == null),"cannot set withMean to true while mean is null")
+    require(!(withMean && this.mean == null), "cannot set withMean to true while mean is null")
     this.withMean = withMean
     this
   }

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
@@ -49,7 +49,7 @@ private case class VocabWord(
   var cn: Int,
   var point: Array[Int],
   var code: Array[Int],
-  var codeLen:Int
+  var codeLen: Int
 )
 
 /**
@@ -469,7 +469,7 @@ class Word2VecModel private[mllib] (
     val norm1 = blas.snrm2(n, v1, 1)
     val norm2 = blas.snrm2(n, v2, 1)
     if (norm1 == 0 || norm2 == 0) return 0.0
-    blas.sdot(n, v1, 1, v2,1) / norm1 / norm2
+    blas.sdot(n, v1, 1, v2, 1) / norm1 / norm2
   }
 
   override protected def formatVersion = "1.0"
@@ -500,7 +500,7 @@ class Word2VecModel private[mllib] (
    */
   def findSynonyms(word: String, num: Int): Array[(String, Double)] = {
     val vector = transform(word)
-    findSynonyms(vector,num)
+    findSynonyms(vector, num)
   }
 
   /**

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala
@@ -219,7 +219,7 @@ class RowMatrix(
 
     val computeMode = mode match {
       case "auto" =>
-        if(k > 5000) {
+        if (k > 5000) {
           logWarning(s"computing svd with k=$k and n=$n, please check necessity")
         }
 

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala
@@ -52,7 +52,7 @@ object RandomRDDs {
       numPartitions: Int = 0,
       seed: Long = Utils.random.nextLong()): RDD[Double] = {
     val uniform = new UniformGenerator()
-    randomRDD(sc, uniform,  size, numPartitionsOrDefault(sc, numPartitions), seed)
+    randomRDD(sc, uniform, size, numPartitionsOrDefault(sc, numPartitions), seed)
   }
 
   /**

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
@@ -203,7 +203,7 @@ object IsotonicRegressionModel extends Loader[IsotonicRegressionModel] {
   override def load(sc: SparkContext, path: String): IsotonicRegressionModel = {
     implicit val formats = DefaultFormats
     val (loadedClassName, version, metadata) = loadMetadata(sc, path)
-    val isotonic =  (metadata \ "isotonic").extract[Boolean]
+    val isotonic = (metadata \ "isotonic").extract[Boolean]
     val classNameV1_0 = SaveLoadV1_0.thisClassName
     (loadedClassName, version) match {
       case (className, "1.0") if className == classNameV1_0 =>

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala
@@ -70,7 +70,7 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S
     require(n == sample.size, s"Dimensions mismatch when adding new sample." +
       s" Expecting $n but got ${sample.size}.")
 
-    val localCurrMean= currMean
+    val localCurrMean = currMean
     val localCurrM2n = currM2n
     val localCurrM2 = currM2
     val localCurrL1 = currL1

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala
@@ -196,7 +196,7 @@ private[stat] object ChiSqTest extends Logging {
    * Pearson's independence test on the input contingency matrix.
    * TODO: optimize for SparseMatrix when it becomes supported.
    */
-  def chiSquaredMatrix(counts: Matrix, methodName:String = PEARSON.name): ChiSqTestResult = {
+  def chiSquaredMatrix(counts: Matrix, methodName: String = PEARSON.name): ChiSqTestResult = {
     val method = methodFromString(methodName)
     val numRows = counts.numRows
     val numCols = counts.numCols

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
@@ -169,7 +169,7 @@ object DecisionTree extends Serializable with Logging {
       numClasses: Int,
       maxBins: Int,
       quantileCalculationStrategy: QuantileStrategy,
-      categoricalFeaturesInfo: Map[Int,Int]): DecisionTreeModel = {
+      categoricalFeaturesInfo: Map[Int, Int]): DecisionTreeModel = {
     val strategy = new Strategy(algo, impurity, maxDepth, numClasses, maxBins,
       quantileCalculationStrategy, categoricalFeaturesInfo)
     new DecisionTree(strategy).run(input)
@@ -768,7 +768,7 @@ object DecisionTree extends Serializable with Logging {
    */
   private def calculatePredictImpurity(
       leftImpurityCalculator: ImpurityCalculator,
-      rightImpurityCalculator: ImpurityCalculator): (Predict, Double) =  {
+      rightImpurityCalculator: ImpurityCalculator): (Predict, Double) = {
     val parentNodeAgg = leftImpurityCalculator.copy
     parentNodeAgg.add(rightImpurityCalculator)
     val predict = calculatePredict(parentNodeAgg)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
@@ -60,12 +60,12 @@ class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy)
   def run(input: RDD[LabeledPoint]): GradientBoostedTreesModel = {
     val algo = boostingStrategy.treeStrategy.algo
     algo match {
-      case Regression => GradientBoostedTrees.boost(input, input, boostingStrategy, validate=false)
+      case Regression =>
+        GradientBoostedTrees.boost(input, input, boostingStrategy, validate = false)
       case Classification =>
         // Map labels to -1, +1 so binary classification can be treated as regression.
         val remappedInput = input.map(x => new LabeledPoint((x.label * 2) - 1, x.features))
-        GradientBoostedTrees.boost(remappedInput,
-          remappedInput, boostingStrategy, validate=false)
+        GradientBoostedTrees.boost(remappedInput, remappedInput, boostingStrategy, validate = false)
       case _ =>
         throw new IllegalArgumentException(s"$algo is not supported by the gradient boosting.")
     }
@@ -93,16 +93,16 @@ class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy)
       validationInput: RDD[LabeledPoint]): GradientBoostedTreesModel = {
     val algo = boostingStrategy.treeStrategy.algo
     algo match {
-      case Regression => GradientBoostedTrees.boost(
-        input, validationInput, boostingStrategy, validate=true)
+      case Regression =>
+        GradientBoostedTrees.boost(input, validationInput, boostingStrategy, validate = true)
       case Classification =>
         // Map labels to -1, +1 so binary classification can be treated as regression.
         val remappedInput = input.map(
           x => new LabeledPoint((x.label * 2) - 1, x.features))
         val remappedValidationInput = validationInput.map(
           x => new LabeledPoint((x.label * 2) - 1, x.features))
         GradientBoostedTrees.boost(remappedInput, remappedValidationInput, boostingStrategy,
-          validate=true)
+          validate = true)
       case _ =>
         throw new IllegalArgumentException(s"$algo is not supported by the gradient boosting.")
     }

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala
@@ -249,7 +249,7 @@ private class RandomForest (
       try {
         nodeIdCache.get.deleteAllCheckpoints()
       } catch {
-        case e:IOException =>
+        case e: IOException =>
           logWarning(s"delete all checkpoints failed. Error reason: ${e.getMessage}")
       }
     }

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala
@@ -151,19 +151,19 @@ class Node (
           s"(feature ${split.feature} > ${split.threshold})"
         }
         case Categorical => if (left) {
-          s"(feature ${split.feature} in ${split.categories.mkString("{",",","}")})"
+          s"(feature ${split.feature} in ${split.categories.mkString("{", ",", "}")})"
         } else {
-          s"(feature ${split.feature} not in ${split.categories.mkString("{",",","}")})"
+          s"(feature ${split.feature} not in ${split.categories.mkString("{", ",", "}")})"
         }
       }
     }
     val prefix: String = " " * indentFactor
     if (isLeaf) {
       prefix + s"Predict: ${predict.predict}\n"
     } else {
-      prefix + s"If ${splitToString(split.get, left=true)}\n" +
+      prefix + s"If ${splitToString(split.get, left = true)}\n" +
         leftNode.get.subtreeToString(indentFactor + 1) +
-        prefix + s"Else ${splitToString(split.get, left=false)}\n" +
+        prefix + s"Else ${splitToString(split.get, left = false)}\n" +
         rightNode.get.subtreeToString(indentFactor + 1)
     }
   }

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala
@@ -82,8 +82,7 @@ object MFDataGenerator {
     BLAS.gemm(z, A, B, 1.0, fullData)
 
     val df = rank * (m + n - rank)
-    val sampSize = scala.math.min(scala.math.round(trainSampFact * df),
-      scala.math.round(.99 * m * n)).toInt
+    val sampSize = math.min(math.round(trainSampFact * df), math.round(.99 * m * n)).toInt
     val rand = new Random()
     val mn = m * n
     val shuffled = rand.shuffle((0 until mn).toList)
@@ -102,8 +101,8 @@ object MFDataGenerator {
 
     // optionally generate testing data
     if (test) {
-      val testSampSize = scala.math
-        .min(scala.math.round(sampSize * testSampFact),scala.math.round(mn - sampSize)).toInt
+      val testSampSize = math.min(
+        math.round(sampSize * testSampFact), math.round(mn - sampSize)).toInt
       val testOmega = shuffled.slice(sampSize, sampSize + testSampSize)
       val testOrdered = testOmega.sortWith(_ < _).toArray
       val testData: RDD[(Int, Int, Double)] = sc.parallelize(testOrdered)

diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
@@ -35,9 +35,9 @@ class Word2VecSuite extends FunSuite with MLlibTestSparkContext {
     val doc = sc.parallelize(Seq(sentence, sentence)).map(line => line.split(" "))
 
     val codes = Map(
-      "a" -> Array(-0.2811822295188904,-0.6356269121170044,-0.3020961284637451),
-      "b" -> Array(1.0309048891067505,-1.29472815990448,0.22276712954044342),
-      "c" -> Array(-0.08456747233867645,0.5137411952018738,0.11731560528278351)
+      "a" -> Array(-0.2811822295188904, -0.6356269121170044, -0.3020961284637451),
+      "b" -> Array(1.0309048891067505, -1.29472815990448, 0.22276712954044342),
+      "c" -> Array(-0.08456747233867645, 0.5137411952018738, 0.11731560528278351)
     )
 
     val expected = doc.map { sentence =>

diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
@@ -90,14 +90,20 @@ object CrossValidatorSuite {
 
     override def validateParams(): Unit = require($(inputCol).nonEmpty)
 
-    override def fit(dataset: DataFrame): MyModel = ???
+    override def fit(dataset: DataFrame): MyModel = {
+      throw new UnsupportedOperationException
+    }
 
-    override def transformSchema(schema: StructType): StructType = ???
+    override def transformSchema(schema: StructType): StructType = {
+      throw new UnsupportedOperationException
+    }
   }
 
   class MyEvaluator extends Evaluator {
 
-    override def evaluate(dataset: DataFrame): Double = ???
+    override def evaluate(dataset: DataFrame): Double = {
+      throw new UnsupportedOperationException
+    }
 
     override val uid: String = "eval"
   }

diff --git a/mllib/src/test/scala/org/apache/spark/mllib/api/python/PythonMLLibAPISuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/api/python/PythonMLLibAPISuite.scala
@@ -84,7 +84,7 @@ class PythonMLLibAPISuite extends FunSuite {
 
     val smt = new SparseMatrix(
       3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9),
-      isTransposed=true)
+      isTransposed = true)
     val nsmt = SerDe.loads(SerDe.dumps(smt)).asInstanceOf[SparseMatrix]
     assert(smt.toArray === nsmt.toArray)
   }

diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala
@@ -163,7 +163,7 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext {
     val theta = Array(
       Array(0.50, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.40), // label 0
       Array(0.02, 0.70, 0.10, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02), // label 1
-      Array(0.02, 0.02, 0.60, 0.02,  0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.30)  // label 2
+      Array(0.02, 0.02, 0.60, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.30)  // label 2
     ).map(_.map(math.log))
 
     val testData = NaiveBayesSuite.generateNaiveBayesInput(