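Remove the transient lazy vals intercept, linearGradSumVec and auxiliaryVec from BlockHingeAggregator, and derive its dimensions from the broadcast coefficients instead of numFeatures/blockSize constructor arguments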

zhengruifeng · zhengruifeng · commit e8abb4ba6b3b · 2020-05-04T21:24:02.000+08:00
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala
@@ -318,8 +318,7 @@ class LinearSVC @Since("2.2.0") (
       .persist(StorageLevel.MEMORY_AND_DISK)
       .setName(s"training dataset (blockSize=${$(blockSize)})")
 
-    val getAggregatorFunc = new BlockHingeAggregator(numFeatures,
-      $(fitIntercept), $(blockSize))(_)
+    val getAggregatorFunc = new BlockHingeAggregator($(fitIntercept))(_)
     val costFun = new RDDLossFunction(blocks, getAggregatorFunc,
       regularization, $(aggregationDepth))
 
diff --git a/mllib/src/main/scala/org/apache/spark/ml/optim/aggregator/HingeAggregator.scala b/mllib/src/main/scala/org/apache/spark/ml/optim/aggregator/HingeAggregator.scala
@@ -118,33 +118,23 @@ private[ml] class HingeAggregator(
  * @param fitIntercept Whether to fit an intercept term.
  */
 private[ml] class BlockHingeAggregator(
-    numFeatures: Int,
-    fitIntercept: Boolean,
-    blockSize: Int)(bcCoefficients: Broadcast[Vector])
+    fitIntercept: Boolean)(bcCoefficients: Broadcast[Vector])
   extends DifferentiableLossAggregator[InstanceBlock, BlockHingeAggregator] {
 
-  private val numFeaturesPlusIntercept = if (fitIntercept) numFeatures + 1 else numFeatures
+  protected override val dim: Int = bcCoefficients.value.size
+  private val numFeatures = if (fitIntercept) dim - 1 else dim
+
   @transient private lazy val coefficientsArray = bcCoefficients.value match {
     case DenseVector(values) => values
     case _ => throw new IllegalArgumentException(s"coefficients only supports dense vector" +
       s" but got type ${bcCoefficients.value.getClass}.")
   }
-  protected override val dim: Int = numFeaturesPlusIntercept
 
-  @transient private lazy val linear = if (fitIntercept) {
-    Vectors.dense(coefficientsArray.take(numFeatures)).toDense
-  } else {
-    Vectors.dense(coefficientsArray).toDense
+  @transient private lazy val linear = {
+    val linear = if (fitIntercept) coefficientsArray.take(numFeatures) else coefficientsArray
+    Vectors.dense(linear).toDense
   }
 
-  @transient private lazy val intercept =
-    if (fitIntercept) coefficientsArray.last else 0.0
-
-  @transient private lazy val linearGradSumVec =
-    if (fitIntercept) Vectors.zeros(numFeatures).toDense else null
-
-  @transient private lazy val auxiliaryVec = Vectors.zeros(blockSize).toDense
-
   /**
    * Add a new training instance block to this HingeAggregator, and update the loss and gradient
    * of the objective function.
@@ -162,20 +152,18 @@ private[ml] class BlockHingeAggregator(
     if (block.weightIter.forall(_ == 0)) return this
     val size = block.size
 
-    // vec/arr here represents dotProducts
-    val vec = if (size == blockSize) auxiliaryVec else Vectors.zeros(size).toDense
-    val arr = vec.values
-
-    if (fitIntercept && intercept != 0) {
-      java.util.Arrays.fill(arr, intercept)
-      BLAS.gemv(1.0, block.matrix, linear, 1.0, vec)
+    // vec here represents dotProducts
+    val vec = if (fitIntercept) {
+      Vectors.dense(Array.fill(size)(coefficientsArray.last)).toDense
     } else {
-      BLAS.gemv(1.0, block.matrix, linear, 0.0, vec)
+      Vectors.zeros(size).toDense
     }
+    BLAS.gemv(1.0, block.matrix, linear, 1.0, vec)
 
     // in-place convert dotProducts to gradient scales
-    // then, vec/arr represents gradient scales
+    // then, vec represents gradient scales
     var i = 0
+    var interceptGradSum = 0.0
     while (i < size) {
       val weight = block.getWeight(i)
       if (weight > 0) {
@@ -184,34 +172,32 @@ private[ml] class BlockHingeAggregator(
         // Therefore the gradient is -(2y - 1)*x
         val label = block.getLabel(i)
         val labelScaled = label + label - 1.0
-        val loss = (1.0 - labelScaled * arr(i)) * weight
+        val loss = (1.0 - labelScaled * vec.values(i)) * weight
         if (loss > 0) {
           lossSum += loss
           val gradScale = -labelScaled * weight
-          arr(i) = gradScale
-        } else {
-          arr(i) = 0.0
-        }
-      } else {
-        arr(i) = 0.0
-      }
+          vec.values(i) = gradScale
+          if (fitIntercept) interceptGradSum += gradScale
+        } else { vec.values(i) = 0.0 }
+      } else { vec.values(i) = 0.0 }
       i += 1
     }
 
     // predictions are all correct, no gradient signal
-    if (arr.forall(_ == 0)) return this
+    if (vec.values.forall(_ == 0)) return this
 
     block.matrix match {
       case dm: DenseMatrix =>
         BLAS.nativeBLAS.dgemv("N", dm.numCols, dm.numRows, 1.0, dm.values, dm.numCols,
-          arr, 1, 1.0, gradientSumArray, 1)
-        if (fitIntercept) gradientSumArray(numFeatures) += arr.sum
+          vec.values, 1, 1.0, gradientSumArray, 1)
+        if (fitIntercept) gradientSumArray(numFeatures) += interceptGradSum
 
       case sm: SparseMatrix if fitIntercept =>
+        val linearGradSumVec = Vectors.zeros(numFeatures).toDense
         BLAS.gemv(1.0, sm.transpose, vec, 0.0, linearGradSumVec)
         BLAS.getBLAS(numFeatures).daxpy(numFeatures, 1.0, linearGradSumVec.values, 1,
           gradientSumArray, 1)
-        gradientSumArray(numFeatures) += arr.sum
+        gradientSumArray(numFeatures) += interceptGradSum
 
       case sm: SparseMatrix if !fitIntercept =>
         val gradSumVec = new DenseVector(gradientSumArray)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/optim/aggregator/HingeAggregatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/optim/aggregator/HingeAggregatorSuite.scala
@@ -28,6 +28,7 @@ class HingeAggregatorSuite extends SparkFunSuite with MLlibTestSparkContext {
   @transient var instances: Array[Instance] = _
   @transient var instancesConstantFeature: Array[Instance] = _
   @transient var instancesConstantFeatureFiltered: Array[Instance] = _
+  @transient var standardizedInstances: Array[Instance] = _
 
   override def beforeAll(): Unit = {
     super.beforeAll()
@@ -46,6 +47,7 @@ class HingeAggregatorSuite extends SparkFunSuite with MLlibTestSparkContext {
       Instance(1.0, 0.5, Vectors.dense(1.0)),
       Instance(2.0, 0.3, Vectors.dense(0.5))
     )
+    standardizedInstances = standardize(instances)
   }
 
    /** Get summary statistics for some data and create a new HingeAggregator. */
@@ -61,18 +63,27 @@ class HingeAggregatorSuite extends SparkFunSuite with MLlibTestSparkContext {
     new HingeAggregator(bcFeaturesStd, fitIntercept)(bcCoefficients)
   }
 
+  private def standardize(instances: Array[Instance]): Array[Instance] = {
+    val (featuresSummarizer, _) =
+      Summarizer.getClassificationSummarizers(sc.parallelize(instances))
+    val stdArray = featuresSummarizer.std.toArray
+    val numFeatures = stdArray.length
+    instances.map { case Instance(label, weight, features) =>
+      val standardized = Array.ofDim[Double](numFeatures)
+      features.foreachNonZero { (i, v) =>
+        val std = stdArray(i)
+        if (std != 0) standardized(i) = v / std
+      }
+      Instance(label, weight, Vectors.dense(standardized).compressed)
+    }
+  }
+
    /** Get summary statistics for some data and create a new BlockHingeAggregator. */
   private def getNewBlockAggregator(
-      instances: Array[Instance],
       coefficients: Vector,
-      fitIntercept: Boolean,
-      blockSize: Int): BlockHingeAggregator = {
-    val (featuresSummarizer, ySummarizer) =
-      Summarizer.getClassificationSummarizers(sc.parallelize(instances))
-    val featuresStd = featuresSummarizer.std.toArray
-    val numFeatures = featuresStd.length
+      fitIntercept: Boolean): BlockHingeAggregator = {
     val bcCoefficients = spark.sparkContext.broadcast(coefficients)
-    new BlockHingeAggregator(numFeatures, fitIntercept, blockSize)(bcCoefficients)
+    new BlockHingeAggregator(fitIntercept)(bcCoefficients)
   }
 
   test("aggregator add method input size") {
@@ -153,8 +164,26 @@ class HingeAggregatorSuite extends SparkFunSuite with MLlibTestSparkContext {
     }
     val gradient = Vectors.dense((gradientCoef ++ Array(gradientIntercept)).map(_ / weightSum))
 
-    assert(loss ~== agg.loss relTol 0.01)
-    assert(gradient ~== agg.gradient relTol 0.01)
+    assert(loss ~== agg.loss relTol 1e-9)
+    assert(gradient ~== agg.gradient relTol 1e-9)
+
+    Seq(1, 2, 4).foreach { blockSize =>
+      val blocks1 = standardizedInstances
+        .grouped(blockSize)
+        .map(seq => InstanceBlock.fromInstances(seq))
+        .toArray
+      val blocks2 = blocks1.map { block =>
+        new InstanceBlock(block.labels, block.weights, block.matrix.toSparseRowMajor)
+      }
+
+      Seq(blocks1, blocks2).foreach { blocks =>
+        val blockAgg = getNewBlockAggregator(Vectors.dense(coefArray ++ Array(intercept)),
+          fitIntercept = true)
+        blocks.foreach(blockAgg.add)
+        assert(loss ~== blockAgg.loss relTol 1e-9)
+        assert(gradient ~== blockAgg.gradient relTol 1e-9)
+      }
+    }
   }
 
   test("check with zero standard deviation") {
@@ -172,51 +201,4 @@ class HingeAggregatorSuite extends SparkFunSuite with MLlibTestSparkContext {
     assert(aggConstantFeatureBinary.gradient(0) === 0.0)
     assert(aggConstantFeatureBinary.gradient(1) == aggConstantFeatureBinaryFiltered.gradient(0))
   }
-
-  test("Block HingeAggregator") {
-    val coefArray = Array(1.0, 2.0)
-    val intercept = 1.0
-    val blocks1 = instances
-      .grouped(2)
-      .map(seq => InstanceBlock.fromInstances(seq))
-      .toArray
-
-    val blocks2 = blocks1.map { block =>
-      new InstanceBlock(block.labels, block.weights, block.matrix.toSparseRowMajor)
-    }
-
-    val blocks3 = blocks1.zipWithIndex.map { case (block, i) =>
-      if (i % 2 == 0) {
-        new InstanceBlock(block.labels, block.weights, block.matrix.toDense)
-      } else {
-        new InstanceBlock(block.labels, block.weights, block.matrix.toSparseRowMajor)
-      }
-    }
-
-    val agg1 = getNewBlockAggregator(instances, Vectors.dense(coefArray ++ Array(intercept)),
-      fitIntercept = true, blockSize = 1)
-    blocks1.foreach(agg1.add)
-    val loss1 = agg1.loss
-    val grad1 = agg1.gradient
-    for (blocks <- Seq(blocks1, blocks2, blocks3); blockSize <- Seq(1, 2, 4)) {
-      val agg = getNewBlockAggregator(instances, Vectors.dense(coefArray ++ Array(intercept)),
-        fitIntercept = true, blockSize = blockSize)
-      blocks.foreach(agg.add)
-      assert(loss1 ~== agg.loss relTol 1e-9)
-      assert(grad1 ~== agg.gradient  relTol 1e-9)
-    }
-
-    val agg2 = getNewBlockAggregator(instances, Vectors.dense(coefArray),
-      fitIntercept = false, blockSize = 1)
-    blocks1.foreach(agg2.add)
-    val loss2 = agg2.loss
-    val grad2 = agg2.gradient
-    for (blocks <- Seq(blocks1, blocks2, blocks3); blockSize <- Seq(1, 2, 4)) {
-      val agg = getNewBlockAggregator(instances, Vectors.dense(coefArray),
-        fitIntercept = false, blockSize = blockSize)
-      blocks.foreach(agg.add)
-      assert(loss2 ~== agg.loss relTol 1e-9)
-      assert(grad2 ~== agg.gradient  relTol 1e-9)
-    }
-  }
 }