@@ -50,7 +50,7 @@ import org.apache.spark.util.VersionUtils
 private[classification] trait LogisticRegressionParams extends ProbabilisticClassifierParams
   with HasRegParam with HasElasticNetParam with HasMaxIter with HasFitIntercept with HasTol
   with HasStandardization with HasWeightCol with HasThreshold with HasAggregationDepth
-  with HasBlockSize {
+  with HasMaxBlockSizeInMB {

   import org.apache.spark.ml.classification.LogisticRegression.supportedFamilyNames

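The params trait swaps `HasBlockSize` for `HasMaxBlockSizeInMB`, so the stacking knob is now a memory budget in megabytes rather than a row count. For readers unfamiliar with Spark's shared-param traits, here is a minimal sketch of the pattern; the real trait is code-generated into sharedParams.scala, so everything beyond the `maxBlockSizeInMB` name and its 0.0-means-auto semantics is illustrative.

```scala
import org.apache.spark.ml.param.{DoubleParam, Params, ParamValidators}

// Minimal sketch of the shared-param pattern behind HasMaxBlockSizeInMB
// (illustrative; Spark generates the real trait into sharedParams.scala).
trait SketchHasMaxBlockSizeInMB extends Params {
  // 0.0 is a sentinel meaning "infer a suitable block size at training time".
  final val maxBlockSizeInMB: DoubleParam = new DoubleParam(this, "maxBlockSizeInMB",
    "Maximum memory in MB for stacking input data into blocks. 0.0 means auto.",
    ParamValidators.gtEq(0.0))

  final def getMaxBlockSizeInMB: Double = $(maxBlockSizeInMB)
}
```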
@@ -245,7 +245,7 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClassifierParams

   setDefault(regParam -> 0.0, elasticNetParam -> 0.0, maxIter -> 100, tol -> 1E-6,
     fitIntercept -> true, family -> "auto", standardization -> true, threshold -> 0.5,
-    aggregationDepth -> 2, blockSize -> 1)
+    aggregationDepth -> 2, maxBlockSizeInMB -> 0.0)

   protected def usingBoundConstrainedOptimization: Boolean = {
     isSet(lowerBoundsOnCoefficients) || isSet(upperBoundsOnCoefficients) ||
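The default moves from `blockSize -> 1` (no stacking) to `maxBlockSizeInMB -> 0.0`, where 0.0 is not a literal size but the auto-inference sentinel resolved later in `train()`. A quick way to observe this, assuming a Spark build that includes this change:

```scala
import org.apache.spark.ml.classification.LogisticRegression

// With nothing set explicitly, the param reports the 0.0 sentinel;
// the concrete block size is only chosen inside train().
val lr = new LogisticRegression()
assert(lr.getMaxBlockSizeInMB == 0.0)
```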
@@ -426,22 +426,13 @@ class LogisticRegression @Since("1.2.0") (
   def setUpperBoundsOnIntercepts(value: Vector): this.type = set(upperBoundsOnIntercepts, value)

   /**
-   * Set block size for stacking input data in matrices.
-   * If blockSize == 1, then stacking will be skipped, and each vector is treated individually;
-   * If blockSize > 1, then vectors will be stacked to blocks, and high-level BLAS routines
-   * will be used if possible (for example, GEMV instead of DOT, GEMM instead of GEMV).
-   * Recommended size is between 10 and 1000. An appropriate choice of the block size depends
-   * on the sparsity and dim of input datasets, the underlying BLAS implementation (for example,
-   * f2jBLAS, OpenBLAS, intel MKL) and its configuration (for example, number of threads).
-   * Note that existing BLAS implementations are mainly optimized for dense matrices, if the
-   * input dataset is sparse, stacking may bring no performance gain, the worse is possible
-   * performance regression.
-   * Default is 1.
+   * Sets the value of param [[maxBlockSizeInMB]].
+   * Default is 0.0.
    *
    * @group expertSetParam
    */
   @Since("3.1.0")
-  def setBlockSize(value: Int): this.type = set(blockSize, value)
+  def setMaxBlockSizeInMB(value: Double): this.type = set(maxBlockSizeInMB, value)

   private def assertBoundConstrainedOptimizationParamsValid(
       numCoefficientSets: Int,
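Correspondingly, the expert setter changes from `setBlockSize(Int)` to `setMaxBlockSizeInMB(Double)`. A usage sketch with arbitrary parameter values:

```scala
import org.apache.spark.ml.classification.LogisticRegression

// Cap each stacked block at roughly 1 MB instead of letting Spark infer it;
// leaving the param at its 0.0 default keeps the automatic choice.
val lr = new LogisticRegression()
  .setMaxIter(50)
  .setMaxBlockSizeInMB(1.0)
```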
@@ -495,31 +486,24 @@ class LogisticRegression @Since("1.2.0") (
     this
   }

-  override protected[spark] def train(dataset: Dataset[_]): LogisticRegressionModel = {
-    val handlePersistence = dataset.storageLevel == StorageLevel.NONE
-    train(dataset, handlePersistence)
-  }
-
   protected[spark] def train(
-      dataset: Dataset[_],
-      handlePersistence: Boolean): LogisticRegressionModel = instrumented { instr =>
+      dataset: Dataset[_]): LogisticRegressionModel = instrumented { instr =>
     instr.logPipelineStage(this)
     instr.logDataset(dataset)
     instr.logParams(this, labelCol, weightCol, featuresCol, predictionCol, rawPredictionCol,
       probabilityCol, regParam, elasticNetParam, standardization, threshold, thresholds, maxIter,
-      tol, fitIntercept, blockSize)
+      tol, fitIntercept, maxBlockSizeInMB)
+
+    if (dataset.storageLevel != StorageLevel.NONE) {
+      instr.logWarning(s"Input instances will be standardized, blockified to blocks, and " +
+        s"then cached during training. Be careful of double caching!")
+    }

     val instances = extractInstances(dataset)
       .setName("training instances")

-    if (handlePersistence && $(blockSize) == 1) {
-      instances.persist(StorageLevel.MEMORY_AND_DISK)
-    }
-
-    var requestedMetrics = Seq("mean", "std", "count")
-    if ($(blockSize) != 1) requestedMetrics +:= "numNonZeros"
     val (summarizer, labelSummarizer) = Summarizer
-      .getClassificationSummarizers(instances, $(aggregationDepth), requestedMetrics)
+      .getClassificationSummarizers(instances, $(aggregationDepth), Seq("mean", "std", "count"))

     val numFeatures = summarizer.mean.size
     val histogram = labelSummarizer.histogram
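Since blockification now happens unconditionally, `train()` no longer persists the raw instances itself; instead it warns when the caller's DataFrame is already cached, because the standardized blocks get cached on top of it. A sketch of the situation the warning guards against, assuming a `SparkSession` named `spark` and an illustrative input path:

```scala
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.storage.StorageLevel

// The caller caches the training data...
val training = spark.read.format("libsvm")
  .load("data/sample_libsvm_data.txt")  // illustrative path
  .persist(StorageLevel.MEMORY_AND_DISK)

// ...and fit() caches the standardized, blockified copy internally as well,
// so dataset.storageLevel != NONE triggers the double-caching warning above.
val model = new LogisticRegression().fit(training)
```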
@@ -547,14 +531,13 @@ class LogisticRegression @Since("1.2.0") (
     instr.logNamedValue("lowestLabelWeight", labelSummarizer.histogram.min.toString)
     instr.logNamedValue("highestLabelWeight", labelSummarizer.histogram.max.toString)
     instr.logSumOfWeights(summarizer.weightSum)
-    if ($(blockSize) > 1) {
-      val scale = 1.0 / summarizer.count / numFeatures
-      val sparsity = 1 - summarizer.numNonzeros.toArray.map(_ * scale).sum
-      instr.logNamedValue("sparsity", sparsity.toString)
-      if (sparsity > 0.5) {
-        instr.logWarning(s"sparsity of input dataset is $sparsity, " +
-          s"which may hurt performance in high-level BLAS.")
-      }
+
+    var actualBlockSizeInMB = $(maxBlockSizeInMB)
+    if (actualBlockSizeInMB == 0) {
+      // TODO: for Multinomial logistic regression, take numClasses into account
+      actualBlockSizeInMB = InstanceBlock.DefaultBlockSizeInMB
+      require(actualBlockSizeInMB > 0, "inferred actual BlockSizeInMB must > 0")
+      instr.logNamedValue("actualBlockSizeInMB", actualBlockSizeInMB.toString)
     }

     val isMultinomial = checkMultinomial(numClasses)
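The sparsity diagnostic tied to `blockSize > 1` is gone; in its place, the 0.0 sentinel is resolved to `InstanceBlock.DefaultBlockSizeInMB`. A standalone restatement of that resolution logic (the 1.0 MB default is an assumption about `InstanceBlock`'s constant, not something this diff shows):

```scala
// Standalone sketch of the resolution above; defaultMB stands in for
// InstanceBlock.DefaultBlockSizeInMB (assumed 1.0 here for illustration).
def resolveBlockSizeInMB(maxBlockSizeInMB: Double, defaultMB: Double = 1.0): Double = {
  require(maxBlockSizeInMB >= 0, "maxBlockSizeInMB must be >= 0")
  if (maxBlockSizeInMB == 0) defaultMB else maxBlockSizeInMB
}

assert(resolveBlockSizeInMB(0.0) == 1.0)  // sentinel: fall back to the default
assert(resolveBlockSizeInMB(4.0) == 4.0)  // an explicit setting wins
```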
@@ -584,7 +567,6 @@ class LogisticRegression @Since("1.2.0") (
     } else {
       Vectors.dense(if (numClasses == 2) Double.PositiveInfinity else Double.NegativeInfinity)
     }
-    if (instances.getStorageLevel != StorageLevel.NONE) instances.unpersist()
     return createModel(dataset, numClasses, coefMatrix, interceptVec, Array(0.0))
   }

@@ -636,14 +618,9 @@ class LogisticRegression @Since("1.2.0") (
       Note that the intercept in scaled space and original space is the same;
       as a result, no scaling is needed.
     */
-    val (allCoefficients, objectiveHistory) = if ($(blockSize) == 1) {
-      trainOnRows(instances, featuresStd, numClasses, initialCoefWithInterceptMatrix,
-        regularization, optimizer)
-    } else {
-      trainOnBlocks(instances, featuresStd, numClasses, initialCoefWithInterceptMatrix,
-        regularization, optimizer)
-    }
-    if (instances.getStorageLevel != StorageLevel.NONE) instances.unpersist()
+    val (allCoefficients, objectiveHistory) =
+      trainImpl(instances, actualBlockSizeInMB, featuresStd, numClasses,
+        initialCoefWithInterceptMatrix, regularization, optimizer)

     if (allCoefficients == null) {
       val msg = s"${optimizer.getClass.getName} failed."
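With the row path gone, both branches collapse into the single `trainImpl` call above. The removed `trainOnRows` (next hunk) drove Breeze's optimizer iterator by hand and collected `state.adjustedValue` into the objective history; below is a self-contained sketch of that iterate-and-collect pattern, using a toy quadratic cost in place of Spark's distributed `RDDLossFunction`:

```scala
import scala.collection.mutable
import breeze.linalg.{DenseVector => BDV}
import breeze.optimize.{CachedDiffFunction, DiffFunction, LBFGS}

// Toy cost f(x) = ||x||^2 / 2 with gradient x, standing in for the
// distributed logistic loss; the driving loop mirrors trainOnRows/trainImpl.
val costFun = new DiffFunction[BDV[Double]] {
  def calculate(x: BDV[Double]): (Double, BDV[Double]) = ((x dot x) / 2.0, x.copy)
}
val optimizer = new LBFGS[BDV[Double]](maxIter = 10, m = 5, tolerance = 1e-6)
val states = optimizer.iterations(new CachedDiffFunction(costFun), BDV(1.0, -2.0))

val arrayBuilder = mutable.ArrayBuilder.make[Double]
var state: optimizer.State = null
while (states.hasNext) {
  state = states.next()
  arrayBuilder += state.adjustedValue  // objective history, one entry per step
}
// state.x holds the solution; arrayBuilder.result() the loss trajectory.
```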
@@ -949,40 +926,9 @@ class LogisticRegression @Since("1.2.0") (
     initialCoefWithInterceptMatrix
   }

-  private def trainOnRows(
-      instances: RDD[Instance],
-      featuresStd: Array[Double],
-      numClasses: Int,
-      initialCoefWithInterceptMatrix: Matrix,
-      regularization: Option[L2Regularization],
-      optimizer: FirstOrderMinimizer[BDV[Double], DiffFunction[BDV[Double]]]) = {
-    val bcFeaturesStd = instances.context.broadcast(featuresStd)
-    val getAggregatorFunc = new LogisticAggregator(bcFeaturesStd, numClasses, $(fitIntercept),
-      checkMultinomial(numClasses))(_)
-
-    val costFun = new RDDLossFunction(instances, getAggregatorFunc,
-      regularization, $(aggregationDepth))
-    val states = optimizer.iterations(new CachedDiffFunction(costFun),
-      new BDV[Double](initialCoefWithInterceptMatrix.toArray))
-
-    /*
-       Note that in Logistic Regression, the objective history (loss + regularization)
-       is log-likelihood which is invariant under feature standardization. As a result,
-       the objective history from optimizer is the same as the one in the original space.
-     */
-    val arrayBuilder = mutable.ArrayBuilder.make[Double]
-    var state: optimizer.State = null
-    while (states.hasNext) {
-      state = states.next()
-      arrayBuilder += state.adjustedValue
-    }
-    bcFeaturesStd.destroy()
-
-    (if (state == null) null else state.x.toArray, arrayBuilder.result)
-  }
-
-  private def trainOnBlocks(
+  private def trainImpl(
       instances: RDD[Instance],
+      actualBlockSizeInMB: Double,
       featuresStd: Array[Double],
       numClasses: Int,
       initialCoefWithInterceptMatrix: Matrix,
@@ -996,9 +942,11 @@ class LogisticRegression @Since("1.2.0") (
       val func = StandardScalerModel.getTransformFunc(Array.empty, inverseStd, false, true)
       iter.map { case Instance(label, weight, vec) => Instance(label, weight, func(vec)) }
     }
-    val blocks = InstanceBlock.blokify(standardized, $(blockSize))
+
+    val maxMemUsage = (actualBlockSizeInMB * 1024L * 1024L).ceil.toLong
+    val blocks = InstanceBlock.blokifyWithMaxMemUsage(standardized, maxMemUsage)
       .persist(StorageLevel.MEMORY_AND_DISK)
-      .setName(s"training blocks (blockSize=${$(blockSize)})")
+      .setName(s"training blocks (blockSizeInMB=$actualBlockSizeInMB)")

     val getAggregatorFunc = new BlockLogisticAggregator(numFeatures, numClasses, $(fitIntercept),
       checkMultinomial(numClasses))(_)
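The memory budget is converted from megabytes to a byte count before blockification; `.ceil` ensures a fractional-MB setting still rounds up to whole bytes before the narrowing `.toLong`. A worked instance of the arithmetic:

```scala
// 0.25 MB * 1024 * 1024 = 262144 bytes; .ceil guards fractional results.
val actualBlockSizeInMB = 0.25
val maxMemUsage = (actualBlockSizeInMB * 1024L * 1024L).ceil.toLong
assert(maxMemUsage == 262144L)
```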