nit

zhengruifeng · zhengruifeng · commit a97a8fc0058e · 2020-04-28T15:40:23.000+08:00
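nit: minor LinearSVC cleanups (log the sparsity warning through instr, return the objective history from trainOnRows/trainOnBlocks, simplify the blocks test) and add BlockHingeAggregator tests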
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala
@@ -175,7 +175,8 @@ class LinearSVC @Since("2.2.0") (
     instr.logParams(this, labelCol, weightCol, featuresCol, predictionCol, rawPredictionCol,
       regParam, maxIter, fitIntercept, tol, standardization, threshold, aggregationDepth, blockSize)
 
-    val instances = extractInstances(dataset).setName("training instances")
+    val instances = extractInstances(dataset)
+      .setName("training instances")
 
     val (summarizer, labelSummarizer) = if ($(blockSize) == 1) {
       if (dataset.storageLevel == StorageLevel.NONE) {
@@ -201,7 +202,7 @@ class LinearSVC @Since("2.2.0") (
       val sparsity = 1 - summarizer.numNonzeros.toArray.map(_ * scale).sum
       instr.logNamedValue("sparsity", sparsity.toString)
       if (sparsity > 0.5) {
-        logWarning(s"sparsity of input dataset is $sparsity, " +
+        instr.logWarning(s"sparsity of input dataset is $sparsity, " +
           s"which may hurt performance in high-level BLAS.")
       }
     }
@@ -242,7 +243,7 @@ class LinearSVC @Since("2.2.0") (
        Note that the intercept in scaled space and original space is the same;
        as a result, no scaling is needed.
      */
-    val rawCoefficients = if ($(blockSize) == 1) {
+    val (rawCoefficients, objectiveHistory) = if ($(blockSize) == 1) {
       trainOnRows(instances, featuresStd, regularization, optimizer)
     } else {
       trainOnBlocks(instances, featuresStd, regularization, optimizer)
@@ -266,7 +267,7 @@ class LinearSVC @Since("2.2.0") (
       instances: RDD[Instance],
       featuresStd: Array[Double],
       regularization: Option[L2Regularization],
-      optimizer: BreezeOWLQN[Int, BDV[Double]]): Array[Double] = {
+      optimizer: BreezeOWLQN[Int, BDV[Double]]): (Array[Double], Array[Double]) = {
     val numFeatures = featuresStd.length
     val numFeaturesPlusIntercept = if ($(fitIntercept)) numFeatures + 1 else numFeatures
 
@@ -278,22 +279,22 @@ class LinearSVC @Since("2.2.0") (
     val states = optimizer.iterations(new CachedDiffFunction(costFun),
       Vectors.zeros(numFeaturesPlusIntercept).asBreeze.toDenseVector)
 
-    val scaledObjectiveHistory = mutable.ArrayBuilder.make[Double]
+    val arrayBuilder = mutable.ArrayBuilder.make[Double]
     var state: optimizer.State = null
     while (states.hasNext) {
       state = states.next()
-      scaledObjectiveHistory += state.adjustedValue
+      arrayBuilder += state.adjustedValue
     }
     bcFeaturesStd.destroy()
 
-    if (state == null) null else state.x.toArray
+    (if (state == null) null else state.x.toArray, arrayBuilder.result)
   }
 
   private def trainOnBlocks(
       instances: RDD[Instance],
       featuresStd: Array[Double],
       regularization: Option[L2Regularization],
-      optimizer: BreezeOWLQN[Int, BDV[Double]]): Array[Double] = {
+      optimizer: BreezeOWLQN[Int, BDV[Double]]): (Array[Double], Array[Double]) = {
     val numFeatures = featuresStd.length
     val numFeaturesPlusIntercept = if ($(fitIntercept)) numFeatures + 1 else numFeatures
 
@@ -321,16 +322,16 @@ class LinearSVC @Since("2.2.0") (
     val states = optimizer.iterations(new CachedDiffFunction(costFun),
       Vectors.zeros(numFeaturesPlusIntercept).asBreeze.toDenseVector)
 
-    val scaledObjectiveHistory = mutable.ArrayBuilder.make[Double]
+    val arrayBuilder = mutable.ArrayBuilder.make[Double]
     var state: optimizer.State = null
     while (states.hasNext) {
       state = states.next()
-      scaledObjectiveHistory += state.adjustedValue
+      arrayBuilder += state.adjustedValue
     }
     blocks.unpersist()
     bcFeaturesStd.destroy()
 
-    if (state == null) null else state.x.toArray
+    (if (state == null) null else state.x.toArray, arrayBuilder.result)
   }
 }
 
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LinearSVCSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LinearSVCSuite.scala
@@ -208,25 +208,16 @@ class LinearSVCSuite extends MLTest with DefaultReadWriteTest {
   }
 
   test("LinearSVC on blocks") {
-    Seq(smallBinaryDataset, smallSparseBinaryDataset).foreach { dataset =>
-      {
-        val lsvc = new LinearSVC().setFitIntercept(false).setBlockSize(1).setMaxIter(5)
-        val model = lsvc.fit(dataset)
-        Seq(2, 4, 8, 16, 32).foreach { blockSize =>
-          val model2 = lsvc.setBlockSize(blockSize).fit(dataset)
-          assert(model.intercept ~== model2.intercept relTol 1e-9)
-          assert(model.coefficients ~== model2.coefficients relTol 1e-9)
-        }
-      }
-
-      {
-        val lsvc = new LinearSVC().setFitIntercept(true).setBlockSize(1).setMaxIter(5)
-        val model = lsvc.fit(dataset)
-        Seq(2, 4, 8, 16, 32).foreach { blockSize =>
-          val model2 = lsvc.setBlockSize(blockSize).fit(dataset)
-          assert(model.intercept ~== model2.intercept relTol 1e-9)
-          assert(model.coefficients ~== model2.coefficients relTol 1e-9)
-        }
+    for (dataset <- Seq(smallBinaryDataset, smallSparseBinaryDataset);
+         fitIntercept <- Seq(true, false)) {
+      // Baseline is pinned to the row-based path (blockSize == 1), independent of the default.
+      val lsvc = new LinearSVC().setFitIntercept(fitIntercept).setBlockSize(1).setMaxIter(5)
+      val model = lsvc.fit(dataset)
+      Seq(4, 16, 64).foreach { blockSize =>
+        val model2 = lsvc.setBlockSize(blockSize).fit(dataset)
+        assert(model.intercept ~== model2.intercept relTol 1e-9)
+        assert(model.coefficients ~== model2.coefficients relTol 1e-9)
       }
     }
   }
diff --git a/mllib/src/test/scala/org/apache/spark/ml/optim/aggregator/HingeAggregatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/optim/aggregator/HingeAggregatorSuite.scala
@@ -17,7 +17,7 @@
 package org.apache.spark.ml.optim.aggregator
 
 import org.apache.spark.SparkFunSuite
-import org.apache.spark.ml.feature.Instance
+import org.apache.spark.ml.feature.{Instance, InstanceBlock}
 import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors}
 import org.apache.spark.ml.stat.Summarizer
 import org.apache.spark.ml.util.TestingUtils._
@@ -61,6 +61,20 @@ class HingeAggregatorSuite extends SparkFunSuite with MLlibTestSparkContext {
     new HingeAggregator(bcFeaturesStd, fitIntercept)(bcCoefficients)
   }
 
+  /** Get summary statistics for some data and create a new BlockHingeAggregator. */
+  private def getNewBlockAggregator(
+      instances: Array[Instance],
+      coefficients: Vector,
+      fitIntercept: Boolean,
+      blockSize: Int): BlockHingeAggregator = {
+    val (featuresSummarizer, _) =
+      Summarizer.getClassificationSummarizers(sc.parallelize(instances))
+    val featuresStd = featuresSummarizer.std.toArray
+    val numFeatures = featuresStd.length
+    val bcCoefficients = spark.sparkContext.broadcast(coefficients)
+    new BlockHingeAggregator(numFeatures, fitIntercept, blockSize)(bcCoefficients)
+  }
+
   test("aggregator add method input size") {
     val coefArray = Array(1.0, 2.0)
     val interceptArray = Array(2.0)
@@ -159,4 +173,50 @@ class HingeAggregatorSuite extends SparkFunSuite with MLlibTestSparkContext {
     assert(aggConstantFeatureBinary.gradient(1) == aggConstantFeatureBinaryFiltered.gradient(0))
   }
 
+  test("Block HingeAggregator") {
+    val coefArray = Array(1.0, 2.0)
+    val intercept = 1.0
+    // Layout 1: the instances grouped two at a time, as produced by InstanceBlock.fromInstances.
+    val blocks1 = instances
+      .grouped(2)
+      .map(seq => InstanceBlock.fromInstances(seq))
+      .toArray
+
+    // Layout 2: the same blocks with every matrix converted to sparse row-major.
+    val blocks2 = blocks1.map { block =>
+      new InstanceBlock(block.labels, block.weights, block.matrix.toSparseRowMajor)
+    }
+
+    // Layout 3: the same blocks, alternating dense and sparse row-major matrices.
+    val blocks3 = blocks1.zipWithIndex.map { case (block, i) =>
+      if (i % 2 == 0) {
+        new InstanceBlock(block.labels, block.weights, block.matrix.toDense)
+      } else {
+        new InstanceBlock(block.labels, block.weights, block.matrix.toSparseRowMajor)
+      }
+    }
+
+    // Loss and gradient must not depend on the block layout or on the blockSize argument,
+    // both with and without an intercept term.
+    for (fitIntercept <- Seq(true, false)) {
+      // When an intercept is fitted it is appended after the feature coefficients.
+      val coefficients = Vectors.dense(
+        if (fitIntercept) coefArray ++ Array(intercept) else coefArray)
+
+      // Reference values: layout 1 fed to an aggregator created with blockSize = 1.
+      val reference =
+        getNewBlockAggregator(instances, coefficients, fitIntercept, blockSize = 1)
+      blocks1.foreach(reference.add)
+      val expectedLoss = reference.loss
+      val expectedGrad = reference.gradient
+
+      // Every layout / blockSize combination must reproduce the reference values.
+      for (blocks <- Seq(blocks1, blocks2, blocks3); blockSize <- Seq(1, 2, 4)) {
+        val agg = getNewBlockAggregator(instances, coefficients, fitIntercept, blockSize)
+        blocks.foreach(agg.add)
+        assert(expectedLoss ~== agg.loss relTol 1e-9)
+        assert(expectedGrad ~== agg.gradient relTol 1e-9)
+      }
+    }
+  }
 }