From 960995780957123399a27a171df21adc699d0e46 Mon Sep 17 00:00:00 2001 From: sethah Date: Thu, 26 May 2016 07:30:55 -0700 Subject: [PATCH 01/14] add multinomial logistic regression --- .../classification/LogisticRegression.scala | 251 +++-- .../MultinomialLogisticRegression.scala | 651 +++++++++++ .../MultinomialLogisticRegressionSuite.scala | 1001 +++++++++++++++++ .../apache/spark/ml/util/MLTestingUtils.scala | 49 +- 4 files changed, 1883 insertions(+), 69 deletions(-) create mode 100644 mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala create mode 100644 mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index fce3935d396f..3c65228351dd 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -63,6 +63,7 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas * equivalent. * * Default is 0.5. + * * @group setParam */ def setThreshold(value: Double): this.type = { @@ -131,6 +132,7 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas /** * If [[threshold]] and [[thresholds]] are both set, ensures they are consistent. + * * @throws IllegalArgumentException if [[threshold]] and [[thresholds]] are not equivalent */ protected def checkThresholdConsistency(): Unit = { @@ -168,6 +170,7 @@ class LogisticRegression @Since("1.2.0") ( /** * Set the regularization parameter. * Default is 0.0. + * * @group setParam */ @Since("1.2.0") @@ -179,6 +182,7 @@ class LogisticRegression @Since("1.2.0") ( * For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. * For 0 < alpha < 1, the penalty is a combination of L1 and L2. * Default is 0.0 which is an L2 penalty. + * * @group setParam */ @Since("1.4.0") @@ -188,6 +192,7 @@ class LogisticRegression @Since("1.2.0") ( /** * Set the maximum number of iterations. * Default is 100. + * * @group setParam */ @Since("1.2.0") @@ -198,6 +203,7 @@ class LogisticRegression @Since("1.2.0") ( * Set the convergence tolerance of iterations. * Smaller value will lead to higher accuracy with the cost of more iterations. * Default is 1E-6. + * * @group setParam */ @Since("1.4.0") @@ -207,6 +213,7 @@ class LogisticRegression @Since("1.2.0") ( /** * Whether to fit an intercept term. * Default is true. + * * @group setParam */ @Since("1.4.0") @@ -220,6 +227,7 @@ class LogisticRegression @Since("1.2.0") ( * the models should be always converged to the same solution when no regularization * is applied. In R's GLMNET package, the default behavior is true as well. * Default is true. + * * @group setParam */ @Since("1.5.0") @@ -236,6 +244,7 @@ class LogisticRegression @Since("1.2.0") ( * Whether to over-/under-sample training instances according to the given weights in weightCol. * If not set or empty String, all instances are treated equally (weight 1.0). * Default is not set, so all instances have weight one. + * * @group setParam */ @Since("1.6.0") @@ -312,7 +321,8 @@ class LogisticRegression @Since("1.2.0") ( if (numClasses > 2) { val msg = s"Currently, LogisticRegression with ElasticNet in ML package only supports " + - s"binary classification. Found $numClasses in the input dataset." + s"binary classification. 
Found $numClasses in the input dataset. Consider using " + + s"MultinomialLogisticRegression instead." logError(msg) throw new SparkException(msg) } else if ($(fitIntercept) && numClasses == 2 && histogram(0) == 0.0) { @@ -349,7 +359,7 @@ class LogisticRegression @Since("1.2.0") ( val bcFeaturesStd = instances.context.broadcast(featuresStd) val costFun = new LogisticCostFun(instances, numClasses, $(fitIntercept), - $(standardization), bcFeaturesStd, regParamL2) + $(standardization), featuresStd, regParamL2, multinomial = false, standardize = true) val optimizer = if ($(elasticNetParam) == 0.0 || $(regParam) == 0.0) { new BreezeLBFGS[BDV[Double]]($(maxIter), 10, $(tol)) @@ -416,7 +426,7 @@ class LogisticRegression @Since("1.2.0") ( /* Note that in Logistic Regression, the objective history (loss + regularization) - is log-likelihood which is invariance under feature standardization. As a result, + is log-likelihood which is invariant under feature standardization. As a result, the objective history from optimizer is the same as the one in the original space. */ val arrayBuilder = mutable.ArrayBuilder.make[Double] @@ -559,6 +569,7 @@ class LogisticRegressionModel private[spark] ( /** * Evaluates the model on a test dataset. + * * @param dataset Test dataset to evaluate model on. */ @Since("2.0.0") @@ -710,6 +721,7 @@ private[classification] class MultiClassSummarizer extends Serializable { /** * Add a new label into this MultilabelSummarizer, and update the distinct map. + * * @param label The label for this data point. * @param weight The weight of this instances. * @return This MultilabelSummarizer @@ -871,6 +883,8 @@ class BinaryLogisticRegressionSummary private[classification] ( * * Note: This ignores instance weights (setting all to 1.0) from `LogisticRegression.weightCol`. * This will change in later Spark versions. + * + * @see http://en.wikipedia.org/wiki/Receiver_operating_characteristic */ @Since("1.5.0") @transient lazy val roc: DataFrame = binaryMetrics.roc().toDF("FPR", "TPR") @@ -936,8 +950,6 @@ class BinaryLogisticRegressionSummary private[classification] ( * LogisticAggregator computes the gradient and loss for binary logistic loss function, as used * in binary classification for instances in sparse or dense vector in an online fashion. * - * Note that multinomial logistic loss is not supported yet! - * * Two LogisticAggregator can be merged together to have a summary of loss and gradient of * the corresponding joint dataset. * @@ -952,13 +964,140 @@ private class LogisticAggregator( val bcFeaturesStd: Broadcast[Array[Double]], private val numFeatures: Int, numClasses: Int, - fitIntercept: Boolean) extends Serializable { + fitIntercept: Boolean, + multinomial: Boolean, + standardize: Boolean) extends Serializable { private var weightSum = 0.0 private var lossSum = 0.0 + private val totalCoefficientLength = { + val cols = if (fitIntercept) numFeatures + 1 else numFeatures + val rows = if (multinomial) numClasses else math.max(1, numClasses - 1) + rows * cols + } + private val gradientSumArray = - Array.ofDim[Double](if (fitIntercept) numFeatures + 1 else numFeatures) + Array.ofDim[Double](totalCoefficientLength) + + /** Update gradient and loss using binary loss function. 
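+   * A sketch of the update implemented below (notation only: beta and intercept denote
+   * the corresponding entries of the `coefficients` array): for one instance with
+   * features x, label y in {0, 1}, and weight w,
+   * {{{
+   *   margin = -(\sum_i beta_i x_i + intercept)
+   *   multiplier = w * (1 / (1 + \exp(margin)) - y)
+   *   loss += w * log1pExp(margin)              // y = 1
+   *   loss += w * (log1pExp(margin) - margin)   // y = 0
+   * }}}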
*/ + private def binaryUpdateInPlace( + features: Vector, + weight: Double, + label: Double, + coefficients: Array[Double], + gradient: Array[Double], + featuresStd: Array[Double], + numFeaturesPlusIntercept: Int, + standardize: Boolean): Unit = { + val margin = - { + var sum = 0.0 + features.foreachActive { (index, value) => + if (featuresStd(index) != 0.0 && value != 0.0) { + val x = if (standardize) value / featuresStd(index) else value + sum += coefficients(index) * x + } + } + sum + { + if (fitIntercept) coefficients(numFeaturesPlusIntercept - 1) else 0.0 + } + } + + val multiplier = weight * (1.0 / (1.0 + math.exp(margin)) - label) + + features.foreachActive { (index, value) => + if (featuresStd(index) != 0.0 && value != 0.0) { + val x = if (standardize) value / featuresStd(index) else value + gradient(index) += multiplier * x + } + } + + if (fitIntercept) { + gradient(numFeaturesPlusIntercept - 1) += multiplier + } + + if (label > 0) { + // The following is equivalent to log(1 + exp(margin)) but more numerically stable. + lossSum += weight * MLUtils.log1pExp(margin) + } else { + lossSum += weight * (MLUtils.log1pExp(margin) - margin) + } + } + + /** Update gradient and loss using multinomial loss function. */ + private def multinomialUpdateInPlace( + features: Vector, + weight: Double, + label: Double, + coefficients: Array[Double], + gradient: Array[Double], + featuresStd: Array[Double], + numFeaturesPlusIntercept: Int, + standardize: Boolean): Unit = { + /* + Note: this can still be used when numClasses = 2 for binary + logistic regression without pivoting. + */ + var marginY = 0.0 + var maxMargin = Double.NegativeInfinity + + val margins = Array.tabulate(numClasses) { i => + var margin = 0.0 + features.foreachActive { (index, value) => + if (featuresStd(index) != 0.0 && value != 0.0) { + val x = if (standardize) value / featuresStd(index) else value + margin += coefficients(i * numFeaturesPlusIntercept + index) * x + } + } + + if (fitIntercept) { + margin += coefficients(i * numFeaturesPlusIntercept + features.size) + } + if (i == label.toInt) marginY = margin + if (margin > maxMargin) { + maxMargin = margin + } + margin + } + + val sum = { + var temp = 0.0 + if (maxMargin > 0) { + for (i <- 0 until numClasses) { + margins(i) -= maxMargin + temp += math.exp(margins(i)) + } + } else { + for (i <- 0 until numClasses) { + temp += math.exp(margins(i)) + } + } + temp + } + + for (i <- 0 until numClasses) { + val multiplier = math.exp(margins(i)) / sum - { + if (label == i) 1.0 else 0.0 + } + features.foreachActive { (index, value) => + if (value != 0.0) { + val x = if (standardize) value / featuresStd(index) else value + gradient(i * numFeaturesPlusIntercept + index) += weight * multiplier * x + } + } + if (fitIntercept) { + gradient(i * numFeaturesPlusIntercept + features.size) += + weight * multiplier + } + } + + val loss = if (maxMargin > 0) { + math.log(sum) - marginY + maxMargin + } else { + math.log(sum) - marginY + } + lossSum += weight * loss + } /** * Add a new training instance to this LogisticAggregator, and update the loss and gradient @@ -969,9 +1108,20 @@ private class LogisticAggregator( */ def add(instance: Instance): this.type = { instance match { case Instance(label, weight, features) => - require(numFeatures == features.size, s"Dimensions mismatch when adding new instance." + - s" Expecting $numFeatures but got ${features.size}.") + val size = coefficients.size + require(numFeatures == features.size, s"Dimension mismatch when adding new instance." 
+ + s" Expecting $numFeatures but got ${features.size}") require(weight >= 0.0, s"instance weight, $weight has to be >= 0.0") + if (multinomial) { + require(numClasses == size / numFeaturesPlusIntercept, s"The number" + + s" of coefficients should be ${numClasses * numFeaturesPlusIntercept} but " + + s"was $size") + } else { + require(size == numFeaturesPlusIntercept, s"Expected " + + s"$numFeaturesPlusIntercept coefficients but got $size") + require(numClasses <= 2, s"Binary logistic aggregator requires numClasses in {1, 2}" + + s" but found $numClasses.") + } if (weight == 0.0) return this @@ -984,43 +1134,12 @@ private class LogisticAggregator( } val localGradientSumArray = gradientSumArray - val featuresStd = bcFeaturesStd.value - numClasses match { - case 2 => - // For Binary Logistic Regression. - val margin = - { - var sum = 0.0 - features.foreachActive { (index, value) => - if (featuresStd(index) != 0.0 && value != 0.0) { - sum += coefficientsArray(index) * (value / featuresStd(index)) - } - } - sum + { - if (fitIntercept) coefficientsArray(numFeatures) else 0.0 - } - } - - val multiplier = weight * (1.0 / (1.0 + math.exp(margin)) - label) - - features.foreachActive { (index, value) => - if (featuresStd(index) != 0.0 && value != 0.0) { - localGradientSumArray(index) += multiplier * (value / featuresStd(index)) - } - } - - if (fitIntercept) { - localGradientSumArray(numFeatures) += multiplier - } - - if (label > 0) { - // The following is equivalent to log(1 + exp(margin)) but more numerically stable. - lossSum += weight * MLUtils.log1pExp(margin) - } else { - lossSum += weight * (MLUtils.log1pExp(margin) - margin) - } - case _ => - new NotImplementedError("LogisticRegression with ElasticNet in ML package " + - "only supports binary classification for now.") + if (multinomial) { + multinomialUpdateInPlace(features, weight, label, coefficientsArray, localGradientSumArray, + featuresStd, numFeaturesPlusIntercept, standardize) + } else { + binaryUpdateInPlace(features, weight, label, coefficientsArray, localGradientSumArray, + featuresStd, numFeaturesPlusIntercept, standardize) } weightSum += weight this @@ -1082,57 +1201,55 @@ private class LogisticCostFun( fitIntercept: Boolean, standardization: Boolean, bcFeaturesStd: Broadcast[Array[Double]], - regParamL2: Double) extends DiffFunction[BDV[Double]] { + regParamL2: Double, + multinomial: Boolean, + standardize: Boolean) extends DiffFunction[BDV[Double]] { val featuresStd = bcFeaturesStd.value override def calculate(coefficients: BDV[Double]): (Double, BDV[Double]) = { - val numFeatures = featuresStd.length val coeffs = Vectors.fromBreeze(coefficients) val bcCoeffs = instances.context.broadcast(coeffs) - val n = coeffs.size + val localFeaturesStd = featuresStd + val numFeatures = localFeaturesStd.length + val numFeaturesPlusIntercept = if (fitIntercept) numFeatures + 1 else numFeatures val logisticAggregator = { - val seqOp = (c: LogisticAggregator, instance: Instance) => c.add(instance) + val seqOp = (c: LogisticAggregator, instance: Instance) => + c.add(instance, bcCoeffs.value, localFeaturesStd) val combOp = (c1: LogisticAggregator, c2: LogisticAggregator) => c1.merge(c2) instances.treeAggregate( - new LogisticAggregator(bcCoeffs, bcFeaturesStd, numFeatures, numClasses, fitIntercept) + new LogisticAggregator(numFeatures, numClasses, fitIntercept, multinomial, standardize) )(seqOp, combOp) } val totalGradientArray = logisticAggregator.gradient.toArray - // regVal is the sum of coefficients squares excluding intercept for L2 
regularization. val regVal = if (regParamL2 == 0.0) { 0.0 } else { + val K = if (multinomial) numClasses else numClasses - 1 var sum = 0.0 - coeffs.foreachActive { (index, value) => - // If `fitIntercept` is true, the last term which is intercept doesn't - // contribute to the regularization. - if (index != numFeatures) { - // The following code will compute the loss of the regularization; also - // the gradient of the regularization, and add back to totalGradientArray. + (0 until K).foreach { k => + var j = 0 + while (j < numFeatures) { + val value = coeffs(k * numFeaturesPlusIntercept + j) sum += { if (standardization) { - totalGradientArray(index) += regParamL2 * value + totalGradientArray(k * numFeaturesPlusIntercept + j) += regParamL2 * value value * value } else { - if (featuresStd(index) != 0.0) { - // If `standardization` is false, we still standardize the data - // to improve the rate of convergence; as a result, we have to - // perform this reverse standardization by penalizing each component - // differently to get effectively the same objective function when - // the training dataset is not standardized. - val temp = value / (featuresStd(index) * featuresStd(index)) - totalGradientArray(index) += regParamL2 * temp + if (featuresStd(j) != 0.0) { + val temp = value / (featuresStd(j) * featuresStd(j)) + totalGradientArray(k * numFeaturesPlusIntercept + j) += regParamL2 * temp value * temp } else { 0.0 } } } + j += 1 } } 0.5 * regParamL2 * sum diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala new file mode 100644 index 000000000000..a46639d66d47 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala @@ -0,0 +1,651 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.ml.classification + +import scala.collection.mutable + +import breeze.linalg.{DenseVector => BDV} +import breeze.optimize.{CachedDiffFunction, LBFGS => BreezeLBFGS, OWLQN => BreezeOWLQN} +import org.apache.hadoop.fs.Path + +import org.apache.spark.SparkException +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.internal.Logging +import org.apache.spark.ml.feature.Instance +import org.apache.spark.ml.linalg._ +import org.apache.spark.ml.param._ +import org.apache.spark.ml.param.shared._ +import org.apache.spark.ml.util._ +import org.apache.spark.mllib.linalg.VectorImplicits._ +import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{Dataset, Row} +import org.apache.spark.sql.functions.{col, lit} +import org.apache.spark.sql.types.DoubleType +import org.apache.spark.storage.StorageLevel + +/** + * Params for multinomial logistic regression. + */ +private[classification] trait MultinomialLogisticRegressionParams + extends ProbabilisticClassifierParams with HasRegParam with HasElasticNetParam with HasMaxIter + with HasFitIntercept with HasTol with HasStandardization with HasWeightCol { + + /** + * Set thresholds in multiclass (or binary) classification to adjust the probability of + * predicting each class. Array must have length equal to the number of classes, with values >= 0. + * The class with largest value p/t is predicted, where p is the original probability of that + * class and t is the class' threshold. + * + * @group setParam + */ + def setThresholds(value: Array[Double]): this.type = { + set(thresholds, value) + } + + /** + * Get thresholds for binary or multiclass classification. + * + * @group getParam + */ + override def getThresholds: Array[Double] = { + $(thresholds) + } +} + +/** + * :: Experimental :: + * Multinomial Logistic regression. + */ +@Since("2.1.0") +@Experimental +class MultinomialLogisticRegression @Since("2.1.0") ( + @Since("2.1.0") override val uid: String) + extends ProbabilisticClassifier[Vector, + MultinomialLogisticRegression, MultinomialLogisticRegressionModel] + with MultinomialLogisticRegressionParams with DefaultParamsWritable with Logging { + + @Since("2.1.0") + def this() = this(Identifiable.randomUID("mlogreg")) + + /** + * Set the regularization parameter. + * Default is 0.0. + * + * @group setParam + */ + @Since("2.1.0") + def setRegParam(value: Double): this.type = set(regParam, value) + + setDefault(regParam -> 0.0) + + /** + * Set the ElasticNet mixing parameter. + * For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. + * For 0 < alpha < 1, the penalty is a combination of L1 and L2. + * Default is 0.0 which is an L2 penalty. + * + * @group setParam + */ + @Since("2.1.0") + def setElasticNetParam(value: Double): this.type = set(elasticNetParam, value) + + setDefault(elasticNetParam -> 0.0) + + /** + * Set the maximum number of iterations. + * Default is 100. + * + * @group setParam + */ + @Since("2.1.0") + def setMaxIter(value: Int): this.type = set(maxIter, value) + + setDefault(maxIter -> 100) + + /** + * Set the convergence tolerance of iterations. + * Smaller value will lead to higher accuracy with the cost of more iterations. + * Default is 1E-6. + * + * @group setParam + */ + @Since("2.1.0") + def setTol(value: Double): this.type = set(tol, value) + + setDefault(tol -> 1E-6) + + /** + * Whether to fit an intercept term. + * Default is true. 
+   *
+   * @group setParam
+   */
+  @Since("2.1.0")
+  def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value)
+
+  setDefault(fitIntercept -> true)
+
+  /**
+   * Whether to standardize the training features before fitting the model.
+   * The coefficients of models will always be returned on the original scale,
+   * so it will be transparent for users. Note that with/without standardization,
+   * the models should always converge to the same solution when no regularization
+   * is applied. In R's GLMNET package, the default behavior is true as well.
+   * Default is true.
+   *
+   * @group setParam
+   */
+  @Since("2.1.0")
+  def setStandardization(value: Boolean): this.type = set(standardization, value)
+
+  setDefault(standardization -> true)
+
+  /**
+   * Sets the value of param [[weightCol]].
+   * If this is not set or empty, we treat all instance weights as 1.0.
+   * Default is not set, so all instances have weight one.
+   *
+   * @group setParam
+   */
+  @Since("2.1.0")
+  def setWeightCol(value: String): this.type = set(weightCol, value)
+
+  @Since("2.1.0")
+  override def setThresholds(value: Array[Double]): this.type = super.setThresholds(value)
+
+  override protected[spark] def train(dataset: Dataset[_]): MultinomialLogisticRegressionModel = {
+    val handlePersistence = dataset.rdd.getStorageLevel == StorageLevel.NONE
+    train(dataset, handlePersistence)
+  }
+
+  protected[spark] def train(
+      dataset: Dataset[_],
+      handlePersistence: Boolean): MultinomialLogisticRegressionModel = {
+    val w = if (!isDefined(weightCol) || $(weightCol).isEmpty) lit(1.0) else col($(weightCol))
+    val instances: RDD[Instance] =
+      dataset.select(col($(labelCol)).cast(DoubleType), w, col($(featuresCol))).rdd.map {
+        case Row(label: Double, weight: Double, features: Vector) =>
+          Instance(label, weight, features)
+      }
+
+    val instr = Instrumentation.create(this, instances)
+    instr.logParams(regParam, elasticNetParam, standardization, thresholds,
+      maxIter, tol, fitIntercept)
+
+    val (summarizer, labelSummarizer) = {
+      val seqOp = (c: (MultivariateOnlineSummarizer, MultiClassSummarizer),
+        instance: Instance) =>
+          (c._1.add(instance.features, instance.weight), c._2.add(instance.label, instance.weight))
+
+      val combOp = (c1: (MultivariateOnlineSummarizer, MultiClassSummarizer),
+        c2: (MultivariateOnlineSummarizer, MultiClassSummarizer)) =>
+          (c1._1.merge(c2._1), c1._2.merge(c2._2))
+
+      instances.treeAggregate(
+        new MultivariateOnlineSummarizer, new MultiClassSummarizer)(seqOp, combOp)
+    }
+
+    val histogram = labelSummarizer.histogram
+    val numInvalid = labelSummarizer.countInvalid
+    val numFeatures = summarizer.mean.size
+    val numFeaturesPlusIntercept = if (getFitIntercept) numFeatures + 1 else numFeatures
+    val numClasses = MetadataUtils.getNumClasses(dataset.schema($(labelCol))) match {
+      case Some(n: Int) =>
+        require(n >= histogram.length, s"Specified number of classes $n was " +
+          s"less than the number of unique labels ${histogram.length}")
+        n
+      case None => histogram.length
+    }
+
+    instr.logNumClasses(numClasses)
+    instr.logNumFeatures(numFeatures)
+
+    val (coefficients, intercepts, objectiveHistory) = {
+      if (numInvalid != 0) {
+        val msg = s"Classification labels should be in {0 to ${numClasses - 1}}. " +
+          s"Found $numInvalid invalid labels."
+        logError(msg)
+        throw new SparkException(msg)
+      }
+
+      val labelIsConstant = histogram.count(_ != 0) == 1
+
+      if ($(fitIntercept) && labelIsConstant) {
+        // we want to produce a model that will always predict the constant label
+        (Matrices.sparse(numClasses, numFeatures, Array.fill(numFeatures + 1)(0), Array(), Array()),
+          Vectors.sparse(numClasses, Seq((numClasses - 1, Double.PositiveInfinity))),
+          Array.empty[Double])
+      } else {
+        if (!$(fitIntercept) && labelIsConstant) {
+          logWarning(s"All labels belong to a single class and fitIntercept=false. It's " +
+            s"dangerous ground, so the algorithm may not converge.")
+        }
+
+        val featuresStd = summarizer.variance.toArray.map(math.sqrt)
+        val standardizedInstances = instances.map { case Instance(label, weight, features) =>
+          val f = features match {
+            case DenseVector(vs) =>
+              val values = vs.clone()
+              val size = values.length
+              var i = 0
+              while (i < size) {
+                values(i) *= (if (featuresStd(i) != 0.0) 1.0 / featuresStd(i) else 0.0)
+                i += 1
+              }
+              Vectors.dense(values)
+            case SparseVector(size, indices, vs) =>
+              val values = vs.clone()
+              val nnz = values.length
+              var i = 0
+              while (i < nnz) {
+                values(i) *= (if (featuresStd(indices(i)) != 0.0) {
+                  1.0 / featuresStd(indices(i))
+                } else {
+                  0.0
+                })
+                i += 1
+              }
+              Vectors.sparse(size, indices, values)
+            case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass)
+          }
+          Instance(label, weight, f)
+        }
+        if (handlePersistence) standardizedInstances.persist(StorageLevel.MEMORY_AND_DISK)
+
+        val regParamL1 = $(elasticNetParam) * $(regParam)
+        val regParamL2 = (1.0 - $(elasticNetParam)) * $(regParam)
+
+        val costFun = new LogisticCostFun(standardizedInstances, numClasses, $(fitIntercept),
+          $(standardization), featuresStd, regParamL2, multinomial = true, standardize = false)
+
+        val optimizer = if ($(elasticNetParam) == 0.0 || $(regParam) == 0.0) {
+          new BreezeLBFGS[BDV[Double]]($(maxIter), 10, $(tol))
+        } else {
+          val standardizationParam = $(standardization)
+          def regParamL1Fun = (index: Int) => {
+            // Remove the L1 penalization on the intercept
+            val isIntercept = $(fitIntercept) && ((index + 1) % numFeaturesPlusIntercept == 0)
+            if (isIntercept) {
+              0.0
+            } else {
+              if (standardizationParam) {
+                regParamL1
+              } else {
+                val featureIndex = if ($(fitIntercept)) {
+                  index % numFeaturesPlusIntercept
+                } else {
+                  index % numFeatures
+                }
+                // If `standardization` is false, we still standardize the data
+                // to improve the rate of convergence; as a result, we have to
+                // perform this reverse standardization by penalizing each component
+                // differently to get effectively the same objective function when
+                // the training dataset is not standardized.
+                if (featuresStd(featureIndex) != 0.0) {
+                  regParamL1 / featuresStd(featureIndex)
+                } else {
+                  0.0
+                }
+              }
+            }
+          }
+          new BreezeOWLQN[Int, BDV[Double]]($(maxIter), 10, regParamL1Fun, $(tol))
+        }
+
+        val initialCoefficientsWithIntercept =
+          Vectors.zeros(numClasses * numFeaturesPlusIntercept)
+
+        if ($(fitIntercept)) {
+          /*
+             For multinomial logistic regression, when we initialize the coefficients as zeros,
+             it will converge faster if we initialize the intercepts such that
+             it follows the distribution of the labels.
+             {{{
+             P(0) = \exp(b_0) / (\sum_{k=1}^K \exp(b_k))
+             ...
+             P(K) = \exp(b_K) / (\sum_{k=1}^K \exp(b_k))
+             }}}
+             The solution to this is not identifiable, so choose the solution with minimum
+             L2 penalty (i.e. subtract the mean).
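+             (The non-identifiability is easy to see: adding a constant c to every b_k
+             multiplies the numerator and the denominator of each P(k) by \exp(c), so the
+             probabilities are unchanged.)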
Hence, + {{{ + b_k = \log{count_k / count_0} + b_k' = b_k - \frac{1}{K} \sum b_k + }}} + */ + val referenceCoef = histogram.indices.map { i => + if (histogram(i) > 0) { + math.log(histogram(i) / (histogram(0) + 1)) // add 1 for smoothing + } else { + 0.0 + } + } + val referenceMean = referenceCoef.sum / referenceCoef.length + histogram.indices.foreach { i => + initialCoefficientsWithIntercept.toArray(i * numFeaturesPlusIntercept + numFeatures) = + referenceCoef(i) - referenceMean + } + } + val states = optimizer.iterations(new CachedDiffFunction(costFun), + initialCoefficientsWithIntercept.asBreeze.toDenseVector) + + /* + Note that in Multinomial Logistic Regression, the objective history + (loss + regularization) is log-likelihood which is invariant under feature + standardization. As a result, the objective history from optimizer is the same as the + one in the original space. + */ + val arrayBuilder = mutable.ArrayBuilder.make[Double] + var state: optimizer.State = null + while (states.hasNext) { + state = states.next() + arrayBuilder += state.adjustedValue + } + if (handlePersistence) standardizedInstances.unpersist() + + if (state == null) { + val msg = s"${optimizer.getClass.getName} failed." + logError(msg) + throw new SparkException(msg) + } + + /* + The coefficients are trained in the scaled space; we're converting them back to + the original space. + Note that the intercept in scaled space and original space is the same; + as a result, no scaling is needed. + */ + var interceptSum = 0.0 + var coefSum = 0.0 + val rawCoefficients = state.x.toArray.clone() + val coefArray = Array.ofDim[Double](numFeatures * numClasses) + val interceptArray = Array.ofDim[Double](if (getFitIntercept) numClasses else 0) + (0 until numClasses).foreach { k => + var i = 0 + while (i < numFeatures) { + val rawValue = rawCoefficients(k * numFeaturesPlusIntercept + i) + val unscaledCoef = + rawValue * { if (featuresStd(i) != 0.0) 1.0 / featuresStd(i) else 0.0 } + coefArray(k * numFeatures + i) = unscaledCoef + coefSum += unscaledCoef + i += 1 + } + if (getFitIntercept) { + val intercept = rawCoefficients(k * numFeaturesPlusIntercept + numFeatures) + interceptArray(k) = intercept + interceptSum += intercept + } + } + + val _coefficients = { + /* + When no regularization is applied, the coefficients lack identifiability because + we do not use a pivot class. We can add any constant value to the coefficients and + get the same likelihood. So here, we choose the mean centered coefficients for + reproducibility. This method follows the approach in glmnet, described here: + + Friedman, et al. "Regularization Paths for Generalized Linear Models via + Coordinate Descent," https://core.ac.uk/download/files/153/6287975.pdf + */ + if ($(regParam) == 0) { + val coefficientMean = coefSum / (numClasses * numFeatures) + var i = 0 + while (i < coefArray.length) { + coefArray(i) -= coefficientMean + i += 1 + } + } + new DenseMatrix(numClasses, numFeatures, coefArray, isTransposed = true) + } + + val _intercepts = if (getFitIntercept) { + /* + The intercepts are never regularized, so we always center the mean. 
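+           Centering is safe here for the same reason the coefficients can be mean-centered
+           above: shifting every intercept by the same constant leaves the predicted
+           probabilities unchanged, so we simply pick the zero-mean representative.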
+ */ + val interceptMean = interceptSum / numClasses + var k = 0 + while (k < interceptArray.length) { + interceptArray(k) -= interceptMean + k += 1 + } + Vectors.dense(interceptArray) + } else { + Vectors.sparse(numClasses, Seq()) + } + + (_coefficients, _intercepts, arrayBuilder.result()) + } + } + + val model = copyValues( + new MultinomialLogisticRegressionModel(uid, coefficients, intercepts, numClasses)) + instr.logSuccess(model) + model + } + + @Since("2.1.0") + override def copy(extra: ParamMap): MultinomialLogisticRegression = defaultCopy(extra) +} + +@Since("2.1.0") +object MultinomialLogisticRegression extends DefaultParamsReadable[MultinomialLogisticRegression] { + + @Since("2.1.0") + override def load(path: String): MultinomialLogisticRegression = super.load(path) +} + +/** + * :: Experimental :: + * Model produced by [[MultinomialLogisticRegression]]. + */ +@Since("2.1.0") +@Experimental +class MultinomialLogisticRegressionModel private[spark] ( + @Since("2.1.0") override val uid: String, + @Since("2.1.0") val coefficients: Matrix, + @Since("2.1.0") val intercepts: Vector, + @Since("2.1.0") val numClasses: Int) + extends ProbabilisticClassificationModel[Vector, MultinomialLogisticRegressionModel] + with MultinomialLogisticRegressionParams with MLWritable { + + @Since("2.1.0") + override def setThresholds(value: Array[Double]): this.type = super.setThresholds(value) + + @Since("2.1.0") + override def getThresholds: Array[Double] = super.getThresholds + + @Since("2.1.0") + override val numFeatures: Int = coefficients.numCols + + /** Margin (rawPrediction) for each class label. */ + private val margins: Vector => Vector = (features) => { + val m = intercepts.toDense.copy + BLAS.gemv(1.0, coefficients, features, 1.0, m) + m + } + + /** Score (probability) for each class label. */ + private val scores: Vector => Vector = (features) => { + val m = margins(features).toDense + val maxMarginIndex = m.argmax + val maxMargin = m(maxMarginIndex) + + // adjust margins for overflow + val sum = { + var temp = 0.0 + if (maxMargin > 0) { + for (i <- 0 until numClasses) { + m.toArray(i) -= maxMargin + temp += math.exp(m(i)) + } + } else { + for (i <- 0 until numClasses ) { + temp += math.exp(m(i)) + } + } + temp + } + + var i = 0 + while (i < m.size) { + m.values(i) = math.exp(m.values(i)) / sum + i += 1 + } + m + } + + /** + * Predict label for the given feature vector. + * The behavior of this can be adjusted using [[thresholds]]. 
+ */ + override protected def predict(features: Vector): Double = { + if (isDefined(thresholds)) { + val thresholds: Array[Double] = getThresholds + val scaledProbability: Array[Double] = + scores(features).toArray.zip(thresholds).map { case (p, t) => + if (t == 0.0) Double.PositiveInfinity else p / t + } + Vectors.dense(scaledProbability).argmax + } else { + scores(features).argmax + } + } + + override protected def raw2probabilityInPlace(rawPrediction: Vector): Vector = { + rawPrediction match { + case dv: DenseVector => + val size = dv.size + + // get the maximum margin + val maxMarginIndex = rawPrediction.argmax + val maxMargin = rawPrediction(maxMarginIndex) + + if (maxMargin == Double.PositiveInfinity) { + for (j <- 0 until size) { + if (j == maxMarginIndex) { + dv.values(j) = 1.0 + } else { + dv.values(j) = 0.0 + } + } + } else { + val sum = { + var temp = 0.0 + if (maxMargin > 0) { + // adjust margins for overflow + for (j <- 0 until numClasses) { + dv.values(j) -= maxMargin + temp += math.exp(dv.values(j)) + } + } else { + for (j <- 0 until numClasses) { + temp += math.exp(dv.values(j)) + } + } + temp + } + + // update in place + var i = 0 + while (i < size) { + dv.values(i) = math.exp(dv.values(i)) / sum + i += 1 + } + } + dv + case sv: SparseVector => + throw new RuntimeException("Unexpected error in MultinomialLogisticRegressionModel:" + + " raw2probabilitiesInPlace encountered SparseVector") + } + } + + override protected def predictRaw(features: Vector): Vector = margins(features) + + @Since("2.1.0") + override def copy(extra: ParamMap): MultinomialLogisticRegressionModel = { + val newModel = + copyValues( + new MultinomialLogisticRegressionModel(uid, coefficients, intercepts, numClasses), extra) + newModel.setParent(parent) + } + + /** + * Returns a [[org.apache.spark.ml.util.MLWriter]] instance for this ML instance. + * + * This does not save the [[parent]] currently. 
+ */ + @Since("2.1.0") + override def write: MLWriter = + new MultinomialLogisticRegressionModel.MultinomialLogisticRegressionModelWriter(this) +} + + +@Since("2.1.0") +object MultinomialLogisticRegressionModel extends MLReadable[MultinomialLogisticRegressionModel] { + + @Since("2.1.0") + override def read: MLReader[MultinomialLogisticRegressionModel] = + new MultinomialLogisticRegressionModelReader + + @Since("2.1.0") + override def load(path: String): MultinomialLogisticRegressionModel = super.load(path) + + /** [[MLWriter]] instance for [[MultinomialLogisticRegressionModel]] */ + private[MultinomialLogisticRegressionModel] + class MultinomialLogisticRegressionModelWriter(instance: MultinomialLogisticRegressionModel) + extends MLWriter with Logging { + + private case class Data( + numClasses: Int, + numFeatures: Int, + intercept: Vector, + coefficients: Matrix) + + override protected def saveImpl(path: String): Unit = { + // Save metadata and Params + DefaultParamsWriter.saveMetadata(instance, path, sc) + // Save model data: numClasses, numFeatures, intercept, coefficients + val data = Data(instance.numClasses, instance.numFeatures, instance.intercepts, + instance.coefficients) + val dataPath = new Path(path, "data").toString + sqlContext.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath) + } + } + + private class MultinomialLogisticRegressionModelReader + extends MLReader[MultinomialLogisticRegressionModel] { + + /** Checked against metadata when loading model */ + private val className = classOf[MultinomialLogisticRegressionModel].getName + + override def load(path: String): MultinomialLogisticRegressionModel = { + val metadata = DefaultParamsReader.loadMetadata(path, sc, className) + + val dataPath = new Path(path, "data").toString + val data = sqlContext.read.format("parquet").load(dataPath) + .select("numClasses", "numFeatures", "intercept", "coefficients").head() + val numClasses = data.getInt(0) + val intercepts = data.getAs[Vector](2) + val coefficients = data.getAs[Matrix](3) + val model = + new MultinomialLogisticRegressionModel(metadata.uid, coefficients, intercepts, numClasses) + + DefaultParamsReader.getAndSetParams(model, metadata) + model + } + } +} diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala new file mode 100644 index 000000000000..19f7f29a80c1 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala @@ -0,0 +1,1001 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.ml.classification + +import scala.language.existentials + +import org.apache.spark.SparkFunSuite +import org.apache.spark.ml.attribute.NominalAttribute +import org.apache.spark.ml.classification.LogisticRegressionSuite._ +import org.apache.spark.ml.feature.LabeledPoint +import org.apache.spark.ml.linalg._ +import org.apache.spark.ml.param.ParamsSuite +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} +import org.apache.spark.ml.util.TestingUtils._ +import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.sql.{DataFrame, Dataset, Row} + +class MultinomialLogisticRegressionSuite + extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { + + @transient var dataset: Dataset[_] = _ + @transient var multinomialDataset: DataFrame = _ + private val eps: Double = 1e-5 + + override def beforeAll(): Unit = { + super.beforeAll() + + dataset = { + val nPoints = 100 + val coefficients = Array( + -0.57997, 0.912083, -0.371077, + -0.16624, -0.84355, -0.048509) + + val xMean = Array(5.843, 3.057) + val xVariance = Array(0.6856, 0.1899) + + val testData = generateMultinomialLogisticInput( + coefficients, xMean, xVariance, addIntercept = true, nPoints, 42) + + val df = spark.createDataFrame(sc.parallelize(testData, 4)) + df.cache() + df + } + + multinomialDataset = { + val nPoints = 10000 + val coefficients = Array( + -0.57997, 0.912083, -0.371077, -0.819866, 2.688191, + -0.16624, -0.84355, -0.048509, -0.301789, 4.170682) + + val xMean = Array(5.843, 3.057, 3.758, 1.199) + val xVariance = Array(0.6856, 0.1899, 3.116, 0.581) + + val testData = generateMultinomialLogisticInput( + coefficients, xMean, xVariance, addIntercept = true, nPoints, 42) + + val df = spark.createDataFrame(sc.parallelize(testData, 4)) + df.cache() + df + } + } + + /** + * Enable the ignored test to export the dataset into CSV format, + * so we can validate the training accuracy compared with R's glmnet package. 
+ */ + ignore("export test data into CSV format") { + multinomialDataset.rdd.map { case Row(label: Double, features: Vector) => + label + "," + features.toArray.mkString(",") + }.repartition(1).saveAsTextFile("target/tmp/LogisticRegressionSuite/multinomialDataset") + } + + test("params") { + ParamsSuite.checkParams(new MultinomialLogisticRegression) + val model = new MultinomialLogisticRegressionModel("mLogReg", + Matrices.dense(2, 1, Array(0.0, 0.0)), Vectors.dense(0.0, 0.0), 2) + ParamsSuite.checkParams(model) + } + + test("multinomial logistic regression: default params") { + val mlr = new MultinomialLogisticRegression + assert(mlr.getLabelCol === "label") + assert(mlr.getFeaturesCol === "features") + assert(mlr.getPredictionCol === "prediction") + assert(mlr.getRawPredictionCol === "rawPrediction") + assert(mlr.getProbabilityCol === "probability") + assert(!mlr.isDefined(mlr.weightCol)) + assert(!mlr.isDefined(mlr.thresholds)) + assert(mlr.getFitIntercept) + assert(mlr.getStandardization) + val model = mlr.fit(dataset) + model.transform(dataset) + .select("label", "probability", "prediction", "rawPrediction") + .collect() + assert(model.getFeaturesCol === "features") + assert(model.getPredictionCol === "prediction") + assert(model.getRawPredictionCol === "rawPrediction") + assert(model.getProbabilityCol === "probability") + assert(model.intercepts !== Vectors.dense(0.0, 0.0)) + assert(model.hasParent) + } + + test("multinomial logistic regression with intercept without regularization") { + + val trainer1 = (new MultinomialLogisticRegression).setFitIntercept(true) + .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(true).setMaxIter(100) + val trainer2 = (new MultinomialLogisticRegression).setFitIntercept(true) + .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(false) + + val model1 = trainer1.fit(multinomialDataset) + val model2 = trainer2.fit(multinomialDataset) + + /* + Using the following R code to load the data and train the model using glmnet package. 
+ > library("glmnet") + > data <- read.csv("path", header=FALSE) + > label = as.factor(data$V1) + > features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + > coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0, lambda = 0)) + > coefficients + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + -2.24493379 + V2 0.25096771 + V3 -0.03915938 + V4 0.14766639 + V5 0.36810817 + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + 0.3778931 + V2 -0.3327489 + V3 0.8893666 + V4 -0.2306948 + V5 -0.4442330 + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + 1.86704066 + V2 0.08178121 + V3 -0.85020722 + V4 0.08302840 + V5 0.07612480 + */ + + val coefficientsR = new DenseMatrix(3, 4, Array( + 0.2509677, -0.0391594, 0.1476664, 0.3681082, + -0.3327489, 0.8893666, -0.2306948, -0.4442330, + 0.0817812, -0.8502072, 0.0830284, 0.0761248), isTransposed = true) + val interceptsR = Vectors.dense(-2.2449338, 0.3778931, 1.8670407) + + assert(model1.coefficients ~== coefficientsR relTol 0.05) + assert(model1.intercepts ~== interceptsR relTol 0.05) + assert(model2.coefficients ~== coefficientsR relTol 0.05) + assert(model2.intercepts ~== interceptsR relTol 0.05) + } + + test("multinomial logistic regression without intercept without regularization") { + + val trainer1 = (new MultinomialLogisticRegression).setFitIntercept(false) + .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(true) + val trainer2 = (new MultinomialLogisticRegression).setFitIntercept(false) + .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(false) + + val model1 = trainer1.fit(multinomialDataset) + val model2 = trainer2.fit(multinomialDataset) + + /* + Using the following R code to load the data and train the model using glmnet package. + library("glmnet") + data <- read.csv("path", header=FALSE) + label = as.factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0, lambda = 0, + intercept=F)) + > coefficients + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 0.06992464 + V3 -0.36562784 + V4 0.12142680 + V5 0.32052211 + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 -0.3036269 + V3 0.9449630 + V4 -0.2271038 + V5 -0.4364839 + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . 
+ V2 0.2337022 + V3 -0.5793351 + V4 0.1056770 + V5 0.1159618 + */ + + val coefficientsR = new DenseMatrix(3, 4, Array( + 0.0699246, -0.3656278, 0.1214268, 0.3205221, + -0.3036269, 0.9449630, -0.2271038, -0.4364839, + 0.2337022, -0.5793351, 0.1056770, 0.1159618), isTransposed = true) + + assert(model1.coefficients ~== coefficientsR relTol 0.05) + assert(model2.coefficients ~== coefficientsR relTol 0.05) + assert(model1.intercepts.toArray === Array.fill(3)(0.0)) + assert(model2.intercepts.toArray === Array.fill(3)(0.0)) + } + + test("multinomial logistic regression with intercept with L1 regularization") { + + // use tighter constraints because OWL-QN solver takes longer to converge + val trainer1 = (new MultinomialLogisticRegression).setFitIntercept(true) + .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(true) + .setMaxIter(300).setTol(1e-10) + val trainer2 = (new MultinomialLogisticRegression).setFitIntercept(true) + .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(false) + .setMaxIter(300).setTol(1e-10) + + val model1 = trainer1.fit(multinomialDataset) + val model2 = trainer2.fit(multinomialDataset) + + /* + Use the following R code to load the data and train the model using glmnet package. + library("glmnet") + data <- read.csv("path", header=FALSE) + label = as.factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 1, + lambda = 0.05, standardization=T)) + coefficients = coef(glmnet(features, label, family="multinomial", alpha = 1, lambda = 0.05, + standardization=F)) + > coefficientsStd + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + -0.68988825 + V2 . + V3 . + V4 . + V5 0.09404023 + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + -0.2303499 + V2 -0.1232443 + V3 0.3258380 + V4 -0.1564688 + V5 -0.2053965 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + 0.9202381 + V2 . + V3 -0.4803856 + V4 . + V5 . + + > coefficients + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + -0.44893320 + V2 . + V3 . + V4 0.01933812 + V5 0.03666044 + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + 0.7376760 + V2 -0.0577182 + V3 . + V4 -0.2081718 + V5 -0.1304592 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + -0.2887428 + V2 . + V3 . + V4 . + V5 . 
+ */ + + val coefficientsRStd = new DenseMatrix(3, 4, Array( + 0.0, 0.0, 0.0, 0.09404023, + -0.1232443, 0.3258380, -0.1564688, -0.2053965, + 0.0, -0.4803856, 0.0, 0.0), isTransposed = true) + val interceptsRStd = Vectors.dense(-0.68988825, -0.2303499, 0.9202381) + + val coefficientsR = new DenseMatrix(3, 4, Array( + 0.0, 0.0, 0.01933812, 0.03666044, + -0.0577182, 0.0, -0.2081718, -0.1304592, + 0.0, 0.0, 0.0, 0.0), isTransposed = true) + val interceptsR = Vectors.dense(-0.44893320, 0.7376760, -0.2887428) + + assert(model1.coefficients ~== coefficientsRStd absTol 0.01) + assert(model1.intercepts ~== interceptsRStd relTol 0.1) + assert(model2.coefficients ~== coefficientsR absTol 0.01) + assert(model2.intercepts ~== interceptsR relTol 0.1) + } + + test("multinomial logistic regression without intercept with L1 regularization") { + val trainer1 = (new MultinomialLogisticRegression).setFitIntercept(false) + .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(true) + val trainer2 = (new MultinomialLogisticRegression).setFitIntercept(false) + .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(false) + + val model1 = trainer1.fit(multinomialDataset) + val model2 = trainer2.fit(multinomialDataset) + /* + Use the following R code to load the data and train the model using glmnet package. + library("glmnet") + data <- read.csv("path", header=FALSE) + label = as.factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 1, + lambda = 0.05, intercept=F, standardization=T)) + coefficients = coef(glmnet(features, label, family="multinomial", alpha = 1, lambda = 0.05, + intercept=F, standardization=F)) + > coefficientsStd + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 . + V3 . + V4 . + V5 0.01525105 + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 -0.1502410 + V3 0.5134658 + V4 -0.1601146 + V5 -0.2500232 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 0.003301875 + V3 . + V4 . + V5 . + + > coefficients + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 . + V3 . + V4 . + V5 . + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 . + V3 0.1943624 + V4 -0.1902577 + V5 -0.1028789 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 . + V3 . + V4 . + V5 . + */ + + val coefficientsRStd = new DenseMatrix(3, 4, Array( + 0.0, 0.0, 0.0, 0.01525105, + -0.1502410, 0.5134658, -0.1601146, -0.2500232, + 0.003301875, 0.0, 0.0, 0.0), isTransposed = true) + + val coefficientsR = new DenseMatrix(3, 4, Array( + 0.0, 0.0, 0.0, 0.0, + 0.0, 0.1943624, -0.1902577, -0.1028789, + 0.0, 0.0, 0.0, 0.0), isTransposed = true) + + assert(model1.coefficients ~== coefficientsRStd absTol 0.01) + assert(model2.coefficients ~== coefficientsR absTol 0.01) + assert(model1.intercepts.toArray === Array.fill(3)(0.0)) + assert(model2.intercepts.toArray === Array.fill(3)(0.0)) + } + + test("multinomial logistic regression with intercept with L2 regularization") { + val trainer1 = (new MultinomialLogisticRegression).setFitIntercept(true) + .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(true) + val trainer2 = (new MultinomialLogisticRegression).setFitIntercept(true) + .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(false) + + val model1 = trainer1.fit(multinomialDataset) + val model2 = trainer2.fit(multinomialDataset) + /* + Use the following R code to load the data and train the model using glmnet package. 
+ library("glmnet") + data <- read.csv("path", header=FALSE) + label = as.factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 0, + lambda = 0.1, intercept=T, standardization=T)) + coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0, + lambda = 0.1, intercept=T, standardization=F)) + > coefficientsStd + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + -1.70040424 + V2 0.17576070 + V3 0.01527894 + V4 0.10216108 + V5 0.26099531 + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + 0.2438590 + V2 -0.2238875 + V3 0.5967610 + V4 -0.1555496 + V5 -0.3010479 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + 1.45654525 + V2 0.04812679 + V3 -0.61203992 + V4 0.05338850 + V5 0.04005258 + + > coefficients + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + -1.65488543 + V2 0.15715048 + V3 0.01992903 + V4 0.12428858 + V5 0.22130317 + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + 1.1297533 + V2 -0.1974768 + V3 0.2776373 + V4 -0.1869445 + V5 -0.2510320 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + 0.52513212 + V2 0.04032627 + V3 -0.29756637 + V4 0.06265594 + V5 0.02972883 + */ + + val coefficientsRStd = new DenseMatrix(3, 4, Array( + 0.17576070, 0.01527894, 0.10216108, 0.26099531, + -0.2238875, 0.5967610, -0.1555496, -0.3010479, + 0.04812679, -0.61203992, 0.05338850, 0.04005258), isTransposed = true) + val interceptsRStd = Vectors.dense(-1.70040424, 0.2438590, 1.45654525) + + val coefficientsR = new DenseMatrix(3, 4, Array( + 0.15715048, 0.01992903, 0.12428858, 0.22130317, + -0.1974768, 0.2776373, -0.1869445, -0.2510320, + 0.04032627, -0.29756637, 0.06265594, 0.02972883), isTransposed = true) + val interceptsR = Vectors.dense(-1.65488543, 1.1297533, 0.52513212) + + assert(model1.coefficients ~== coefficientsRStd relTol 0.05) + assert(model1.intercepts ~== interceptsRStd relTol 0.05) + assert(model2.coefficients ~== coefficientsR relTol 0.05) + assert(model2.intercepts ~== interceptsR relTol 0.05) + } + test("multinomial logistic regression without intercept with L2 regularization") { + val trainer1 = (new MultinomialLogisticRegression).setFitIntercept(false) + .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(true) + val trainer2 = (new MultinomialLogisticRegression).setFitIntercept(false) + .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(false) + + val model1 = trainer1.fit(multinomialDataset) + val model2 = trainer2.fit(multinomialDataset) + /* + Use the following R code to load the data and train the model using glmnet package. + library("glmnet") + data <- read.csv("path", header=FALSE) + label = as.factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 0, + lambda = 0.1, intercept=F, standardization=T)) + coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0, + lambda = 0.1, intercept=F, standardization=F)) + > coefficientsStd + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 0.03904171 + V3 -0.23354322 + V4 0.08288096 + V5 0.22706393 + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 -0.2061848 + V3 0.6341398 + V4 -0.1530059 + V5 -0.2958455 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . 
+ V2 0.16714312 + V3 -0.40059658 + V4 0.07012496 + V5 0.06878158 + > coefficients + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 -0.005704542 + V3 -0.144466409 + V4 0.092080736 + V5 0.182927657 + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 -0.08469036 + V3 0.38996748 + V4 -0.16468436 + V5 -0.22522976 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 0.09039490 + V3 -0.24550107 + V4 0.07260362 + V5 0.04230210 + */ + val coefficientsRStd = new DenseMatrix(3, 4, Array( + 0.03904171, -0.23354322, 0.08288096, 0.2270639, + -0.2061848, 0.6341398, -0.1530059, -0.2958455, + 0.16714312, -0.40059658, 0.07012496, 0.06878158), isTransposed = true) + + val coefficientsR = new DenseMatrix(3, 4, Array( + -0.005704542, -0.144466409, 0.092080736, 0.182927657, + -0.08469036, 0.38996748, -0.16468436, -0.22522976, + 0.0903949, -0.24550107, 0.07260362, 0.0423021), isTransposed = true) + + assert(model1.coefficients ~== coefficientsRStd absTol 0.01) + assert(model1.intercepts.toArray === Array.fill(3)(0.0)) + assert(model2.coefficients ~== coefficientsR absTol 0.01) + assert(model2.intercepts.toArray === Array.fill(3)(0.0)) + } + + test("multinomial logistic regression with intercept with elasticnet regularization") { + val trainer1 = (new MultinomialLogisticRegression).setFitIntercept(true) + .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(true) + .setMaxIter(300).setTol(1e-10) + val trainer2 = (new MultinomialLogisticRegression).setFitIntercept(true) + .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(false) + .setMaxIter(300).setTol(1e-10) + + val model1 = trainer1.fit(multinomialDataset) + val model2 = trainer2.fit(multinomialDataset) + /* + Use the following R code to load the data and train the model using glmnet package. + library("glmnet") + data <- read.csv("path", header=FALSE) + label = as.factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 0.5, + lambda = 0.1, intercept=T, standardization=T)) + coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0.5, + lambda = 0.1, intercept=T, standardization=F)) + > coefficientsStd + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + -0.5521819483 + V2 0.0003092611 + V3 . + V4 . + V5 0.0913818490 + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + -0.27531989 + V2 -0.09790029 + V3 0.28502034 + V4 -0.12416487 + V5 -0.16513373 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + 0.8275018 + V2 . + V3 -0.4044859 + V4 . + V5 . + + > coefficients + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + -0.39876213 + V2 . + V3 . + V4 0.02547520 + V5 0.03893991 + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + 0.61089869 + V2 -0.04224269 + V3 . + V4 -0.18923970 + V5 -0.09104249 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + -0.2121366 + V2 . + V3 . + V4 . + V5 . 
+ */ + + val coefficientsRStd = new DenseMatrix(3, 4, Array( + 0.0003092611, 0.0, 0.0, 0.091381849, + -0.09790029, 0.28502034, -0.12416487, -0.16513373, + 0.0, -0.4044859, 0.0, 0.0), isTransposed = true) + val interceptsRStd = Vectors.dense(-0.5521819483, -0.27531989, 0.8275018) + + val coefficientsR = new DenseMatrix(3, 4, Array( + 0.0, 0.0, 0.0254752, 0.03893991, + -0.04224269, 0.0, -0.1892397, -0.09104249, + 0.0, 0.0, 0.0, 0.0), isTransposed = true) + val interceptsR = Vectors.dense(-0.39876213, 0.61089869, -0.2121366) + + assert(model1.coefficients ~== coefficientsRStd absTol 0.01) + assert(model1.intercepts ~== interceptsRStd absTol 0.01) + assert(model2.coefficients ~== coefficientsR absTol 0.01) + assert(model2.intercepts ~== interceptsR absTol 0.01) + } + test("multinomial logistic regression without intercept with elasticnet regularization") { + val trainer1 = (new MultinomialLogisticRegression).setFitIntercept(false) + .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(true) + .setMaxIter(300).setTol(1e-10) + val trainer2 = (new MultinomialLogisticRegression).setFitIntercept(false) + .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(false) + .setMaxIter(300).setTol(1e-10) + + val model1 = trainer1.fit(multinomialDataset) + val model2 = trainer2.fit(multinomialDataset) + /* + Use the following R code to load the data and train the model using glmnet package. + library("glmnet") + data <- read.csv("path", header=FALSE) + label = as.factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 0.5, + lambda = 0.1, intercept=F, standardization=T)) + coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0.5, + lambda = 0.1, intercept=F, standardization=F)) + > coefficientsStd + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 . + V3 . + V4 . + V5 0.03543706 + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 -0.1187387 + V3 0.4025482 + V4 -0.1270969 + V5 -0.1918386 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 0.00774365 + V3 . + V4 . + V5 . + + > coefficients + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 . + V3 . + V4 . + V5 . + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 . + V3 0.14666497 + V4 -0.16570638 + V5 -0.05982875 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 . + V3 . + V4 . + V5 . 
+ */ + val coefficientsRStd = new DenseMatrix(3, 4, Array( + 0.0, 0.0, 0.0, 0.03543706, + -0.1187387, 0.4025482, -0.1270969, -0.1918386, + 0.0, 0.0, 0.0, 0.00774365), isTransposed = true) + + val coefficientsR = new DenseMatrix(3, 4, Array( + 0.0, 0.0, 0.0, 0.0, + 0.0, 0.14666497, -0.16570638, -0.05982875, + 0.0, 0.0, 0.0, 0.0), isTransposed = true) + + assert(model1.coefficients ~== coefficientsRStd absTol 0.01) + assert(model1.intercepts.toArray === Array.fill(3)(0.0)) + assert(model2.coefficients ~== coefficientsR absTol 0.01) + assert(model2.intercepts.toArray === Array.fill(3)(0.0)) + } + + test("prediction") { + val model = new MultinomialLogisticRegressionModel("mLogReg", + Matrices.dense(3, 2, Array(0.0, 0.0, 0.0, 1.0, 2.0, 3.0)), + Vectors.dense(0.0, 0.0, 0.0), 3) + val overFlowData = spark.createDataFrame(Seq( + LabeledPoint(1.0, Vectors.dense(0.0, 1000.0)), + LabeledPoint(1.0, Vectors.dense(0.0, -1.0)) + )) + val results = model.transform(overFlowData).select("rawPrediction", "probability").collect() + + // probabilities are correct when margins have to be adjusted + val raw1 = results(0).getAs[Vector](0) + val prob1 = results(0).getAs[Vector](1) + assert(raw1 === Vectors.dense(1000.0, 2000.0, 3000.0)) + assert(prob1 ~== Vectors.dense(0.0, 0.0, 1.0) absTol eps) + + // probabilities are correct when margins don't have to be adjusted + val raw2 = results(1).getAs[Vector](0) + val prob2 = results(1).getAs[Vector](1) + assert(raw2 === Vectors.dense(-1.0, -2.0, -3.0)) + assert(prob2 ~== Vectors.dense(0.66524096, 0.24472847, 0.09003057) relTol eps) + } + + test("multinomial logistic regression: Predictor, Classifier methods") { + val mlr = new MultinomialLogisticRegression + + val model = mlr.fit(dataset) + assert(model.numClasses === 3) + val numFeatures = dataset.select("features").first().getAs[Vector](0).size + assert(model.numFeatures === numFeatures) + + val results = model.transform(dataset) + // check that raw prediction is coefficients dot features + intercept + results.select("rawPrediction", "features").collect().foreach { + case Row(raw: Vector, features: Vector) => + assert(raw.size === 3) + val margins = Array.tabulate(3) { k => + var margin = 0.0 + features.foreachActive { (index, value) => + margin += value * model.coefficients(k, index) + } + margin += model.intercepts(k) + margin + } + assert(raw ~== Vectors.dense(margins) relTol eps) + } + + // Compare rawPrediction with probability + results.select("rawPrediction", "probability").collect().foreach { + case Row(raw: Vector, prob: Vector) => + assert(raw.size === 3) + assert(prob.size === 3) + val max = raw.toArray.max + val subtract = if (max > 0) max else 0.0 + val sum = raw.toArray.map(x => math.exp(x - subtract)).sum + val probFromRaw0 = math.exp(raw(0) - subtract) / sum + val probFromRaw1 = math.exp(raw(1) - subtract) / sum + assert(prob(0) ~== probFromRaw0 relTol eps) + assert(prob(1) ~== probFromRaw1 relTol eps) + assert(prob(2) ~== 1.0 - probFromRaw1 - probFromRaw0 relTol eps) + } + + // Compare prediction with probability + results.select("prediction", "probability").collect().foreach { + case Row(pred: Double, prob: Vector) => + val predFromProb = prob.toArray.zipWithIndex.maxBy(_._1)._2 + assert(pred == predFromProb) + } + } + + test("multinomial logistic regression coefficients should be centered") { + val mlr = new MultinomialLogisticRegression().setMaxIter(1) + val model = mlr.fit(dataset) + assert(model.intercepts.toArray.sum ~== 0.0 absTol 1e-6) + assert(model.coefficients.toArray.sum ~== 0.0 absTol 
1e-6) + } + + test("numClasses specified in metadata/inferred") { + val mlr = new MultinomialLogisticRegression().setMaxIter(1) + + // specify more classes than unique label values + val labelMeta = NominalAttribute.defaultAttr.withName("label").withNumValues(4).toMetadata() + val df = dataset.select(dataset("label").as("label", labelMeta), dataset("features")) + val model1 = mlr.fit(df) + assert(model1.numClasses === 4) + assert(model1.intercepts.size === 4) + + // specify two classes when there are really three + val labelMeta1 = NominalAttribute.defaultAttr.withName("label").withNumValues(2).toMetadata() + val df1 = dataset.select(dataset("label").as("label", labelMeta1), dataset("features")) + val thrown = intercept[IllegalArgumentException] { + mlr.fit(df1) + } + assert(thrown.getMessage.contains( + "less than the number of unique labels")) + + // mlr should infer the number of classes if not specified + val model3 = mlr.fit(dataset) + assert(model3.numClasses === 3) + } + + test("all labels the same") { + val constantData = spark.createDataFrame(Seq( + LabeledPoint(4.0, Vectors.dense(0.0)), + LabeledPoint(4.0, Vectors.dense(1.0)), + LabeledPoint(4.0, Vectors.dense(2.0))) + ) + val mlr = new MultinomialLogisticRegression + val model = mlr.fit(constantData) + val results = model.transform(constantData) + results.select("rawPrediction", "probability", "prediction").collect().foreach { + case Row(raw: Vector, prob: Vector, pred: Double) => + assert(raw === Vectors.dense(Array(0.0, 0.0, 0.0, 0.0, Double.PositiveInfinity))) + assert(prob === Vectors.dense(Array(0.0, 0.0, 0.0, 0.0, 1.0))) + assert(pred === 4.0) + } + // TODO: check num iters is zero when it become available in the model + } + + test("weighted data") { + val numClasses = 5 + val numPoints = 40 + val outlierData = MLTestingUtils.genClassificationInstancesWithWeightedOutliers(spark, + numClasses, numPoints) + val testData = spark.createDataFrame(Array.tabulate[LabeledPoint](numClasses) { i => + LabeledPoint(i.toDouble, Vectors.dense(i.toDouble)) + }) + val mlr = new MultinomialLogisticRegression().setWeightCol("weight") + val model = mlr.fit(outlierData) + val results = model.transform(testData).select("label", "prediction").collect() + + // check that the predictions are the one to one mapping + results.foreach { case Row(label: Double, pred: Double) => + assert(label === pred) + } + val (overSampledData, weightedData) = + MLTestingUtils.genEquivalentOversampledAndWeightedInstances(outlierData, "label", "features", + 42L) + val weightedModel = mlr.fit(weightedData) + val overSampledModel = mlr.setWeightCol("").fit(overSampledData) + assert(weightedModel.coefficients ~== overSampledModel.coefficients relTol 0.01) + } + + test("thresholds prediction") { + val mlr = new MultinomialLogisticRegression + val model = mlr.fit(dataset) + val basePredictions = model.transform(dataset).select("prediction").collect() + + // should predict all zeros + model.setThresholds(Array(1, 1000, 1000)) + val zeroPredictions = model.transform(dataset).select("prediction").collect() + assert(zeroPredictions.forall(_.getDouble(0) === 0.0)) + + // should predict all ones + model.setThresholds(Array(1000, 1, 1000)) + val onePredictions = model.transform(dataset).select("prediction").collect() + assert(onePredictions.forall(_.getDouble(0) === 1.0)) + + // should predict all twos + model.setThresholds(Array(1000, 1000, 1)) + val twoPredictions = model.transform(dataset).select("prediction").collect() + assert(twoPredictions.forall(_.getDouble(0) === 2.0)) 
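+
+    // For reference: the rule exercised above (from ProbabilisticClassificationModel)
+    // predicts the class k that maximizes probability(k) / threshold(k). For instance,
+    // probability = [0.2, 0.3, 0.5] with thresholds = [1, 1000, 1000] scales to
+    // [0.2, 0.0003, 0.0005], so class 0 is predicted despite its small probability.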
+ + // constant threshold scaling is the same as no thresholds + model.setThresholds(Array(1000, 1000, 1000)) + val scaledPredictions = model.transform(dataset).select("prediction").collect() + assert(scaledPredictions.zip(basePredictions).forall { case (scaled, base) => + scaled.getDouble(0) === base.getDouble(0) + }) + + } + + test("read/write") { + def checkModelData( + model: MultinomialLogisticRegressionModel, + model2: MultinomialLogisticRegressionModel): Unit = { + assert(model.intercepts === model2.intercepts) + assert(model.coefficients.toArray === model2.coefficients.toArray) + assert(model.numClasses === model2.numClasses) + assert(model.numFeatures === model2.numFeatures) + } + val mlr = new MultinomialLogisticRegression() + testEstimatorAndModelReadWrite(mlr, dataset, + MultinomialLogisticRegressionSuite.allParamSettings, + checkModelData) + } + + test("should support all NumericType labels and not support other types") { + val mlr = new MultinomialLogisticRegression().setMaxIter(1) + MLTestingUtils + .checkNumericTypes[MultinomialLogisticRegressionModel, MultinomialLogisticRegression]( + mlr, spark) { (expected, actual) => + assert(expected.intercepts === actual.intercepts) + assert(expected.coefficients.toArray === actual.coefficients.toArray) + } + } +} + +object MultinomialLogisticRegressionSuite { + + /** + * Mapping from all Params to valid settings which differ from the defaults. + * This is useful for tests which need to exercise all Params, such as save/load. + * This excludes input columns to simplify some tests. + */ + val allParamSettings: Map[String, Any] = ProbabilisticClassifierSuite.allParamSettings ++ Map( + "probabilityCol" -> "myProbability", + "thresholds" -> Array(0.4, 0.6), + "regParam" -> 0.01, + "elasticNetParam" -> 0.1, + "maxIter" -> 2, // intentionally small + "fitIntercept" -> true, + "tol" -> 0.8, + "standardization" -> false + ) +} diff --git a/mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala b/mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala index 80b976914cbd..472a5af06e7a 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala @@ -19,12 +19,14 @@ package org.apache.spark.ml.util import org.apache.spark.SparkFunSuite import org.apache.spark.ml.{Estimator, Model} +import org.apache.spark.ml.attribute.NominalAttribute import org.apache.spark.ml.evaluation.Evaluator -import org.apache.spark.ml.linalg.Vectors +import org.apache.spark.ml.feature.Instance +import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.recommendation.{ALS, ALSModel} import org.apache.spark.ml.tree.impl.TreeTests -import org.apache.spark.sql.{DataFrame, SparkSession} +import org.apache.spark.sql.{DataFrame, Row, SparkSession} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ @@ -179,4 +181,47 @@ object MLTestingUtils extends SparkFunSuite { .map(t => t -> df.select(col(labelColName).cast(t), col(predictionColName))) .toMap } + + def genClassificationInstancesWithWeightedOutliers( + spark: SparkSession, + numClasses: Int, + numInstances: Int): DataFrame = { + val data = Array.tabulate[Instance](numInstances) { i => + val feature = i % numClasses + if (i < numInstances / 3) { + // give large weights to minority of data with 1 to 1 mapping feature to label + Instance(feature, 1.0, Vectors.dense(feature)) + } else { + // give small weights to majority of data 
points with reverse mapping + Instance(numClasses - feature - 1, 0.01, Vectors.dense(feature)) + } + } + val labelMeta = + NominalAttribute.defaultAttr.withName("label").withNumValues(numClasses).toMetadata() + spark.createDataFrame(data).select(col("label").as("label", labelMeta), col("weight"), + col("features")) + } + + def genEquivalentOversampledAndWeightedInstances( + data: DataFrame, + labelCol: String, + featuresCol: String, + seed: Long): (DataFrame, DataFrame) = { + import data.sparkSession.implicits._ + val rng = scala.util.Random + rng.setSeed(seed) + val sample: () => Int = () => rng.nextInt(10) + 1 + val sampleUDF = udf(sample) + val rawData = data.select(labelCol, featuresCol).withColumn("samples", sampleUDF()) + val overSampledData = rawData.rdd.flatMap { + case Row(label: Double, features: Vector, n: Int) => + Iterator.fill(n)(Instance(label, 1.0, features)) + }.toDF() + rng.setSeed(seed) + val weightedData = rawData.rdd.map { + case Row(label: Double, features: Vector, n: Int) => + Instance(label, n.toDouble, features) + }.toDF() + (overSampledData, weightedData) + } } From 349ac7ca85744da31722c91c3539a198cfdd0c29 Mon Sep 17 00:00:00 2001 From: sethah Date: Mon, 20 Jun 2016 20:03:10 -0700 Subject: [PATCH 02/14] std check --- .../org/apache/spark/ml/classification/LogisticRegression.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 3c65228351dd..47dc3f705902 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -1080,7 +1080,7 @@ private class LogisticAggregator( if (label == i) 1.0 else 0.0 } features.foreachActive { (index, value) => - if (value != 0.0) { + if (featuresStd(index) != 0.0 && value != 0.0) { val x = if (standardize) value / featuresStd(index) else value gradient(i * numFeaturesPlusIntercept + index) += weight * multiplier * x } From 595feef30d56dec3fd5b2ef72f7bee5e38b7f0c5 Mon Sep 17 00:00:00 2001 From: sethah Date: Mon, 20 Jun 2016 20:42:17 -0700 Subject: [PATCH 03/14] fix small naming bug --- .../ml/classification/MultinomialLogisticRegression.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala index a46639d66d47..c8ffed876a1b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala @@ -280,7 +280,7 @@ class MultinomialLogisticRegression @Since("2.1.0") ( val standardizationParam = $(standardization) def regParamL1Fun = (index: Int) => { // Remove the L1 penalization on the intercept - val isIntercept = $(fitIntercept) && ((index + 1) % coefWithInterceptLength == 0) + val isIntercept = $(fitIntercept) && ((index + 1) % numFeaturesPlusIntercept == 0) if (isIntercept) { 0.0 } else { @@ -308,7 +308,7 @@ class MultinomialLogisticRegression @Since("2.1.0") ( new BreezeOWLQN[Int, BDV[Double]]($(maxIter), 10, regParamL1Fun, $(tol)) } - val initialCoefficientsWithIntercept = Vectors.zeros(numClasses * coefWithInterceptLength) + val initialCoefficientsWithIntercept = Vectors.zeros(numClasses * 
numFeaturesPlusIntercept) if ($(fitIntercept)) { /* From 823889bdca534ac85be0e77253e78ae6e1a705e7 Mon Sep 17 00:00:00 2001 From: sethah Date: Fri, 12 Aug 2016 12:25:43 -0700 Subject: [PATCH 04/14] address some comments --- .../spark/ml/classification/LogisticRegression.scala | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 47dc3f705902..6a094a8be4fd 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -883,8 +883,6 @@ class BinaryLogisticRegressionSummary private[classification] ( * * Note: This ignores instance weights (setting all to 1.0) from `LogisticRegression.weightCol`. * This will change in later Spark versions. - * - * @see http://en.wikipedia.org/wiki/Receiver_operating_characteristic */ @Since("1.5.0") @transient lazy val roc: DataFrame = binaryMetrics.roc().toDF("FPR", "TPR") @@ -947,8 +945,8 @@ class BinaryLogisticRegressionSummary private[classification] ( } /** - * LogisticAggregator computes the gradient and loss for binary logistic loss function, as used - * in binary classification for instances in sparse or dense vector in an online fashion. + * LogisticAggregator computes the gradient and loss for binary or multinomial logistic loss + * function, as used in classification for instances in sparse or dense vector in an online fashion. * * Two LogisticAggregator can be merged together to have a summary of loss and gradient of * the corresponding joint dataset. @@ -973,12 +971,11 @@ private class LogisticAggregator( private val totalCoefficientLength = { val cols = if (fitIntercept) numFeatures + 1 else numFeatures - val rows = if (multinomial) numClasses else math.max(1, numClasses - 1) + val rows = if (multinomial) numClasses else 1 rows * cols } - private val gradientSumArray = - Array.ofDim[Double](totalCoefficientLength) + private val gradientSumArray = Array.ofDim[Double](totalCoefficientLength) /** Update gradient and loss using binary loss function. 
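    * A sketch of the update (an illustration, assuming standardized features): for one
    * instance, margin = -(sum_j coefficients(j) * x_j / featuresStd(j) + intercept) and
    * multiplier = weight * (1 / (1 + exp(margin)) - label); each active feature j then
    * accumulates multiplier * x_j / featuresStd(j) into the gradient.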
*/ private def binaryUpdateInPlace( From 2f11bd737f4286fe2dac4058416aa1d0e3bab2f4 Mon Sep 17 00:00:00 2001 From: sethah Date: Mon, 15 Aug 2016 13:11:26 -0700 Subject: [PATCH 05/14] Merging master and addressing some comments --- .../classification/LogisticRegression.scala | 39 ++++++++++--------- .../MultinomialLogisticRegression.scala | 4 +- 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 6a094a8be4fd..dd68f8e8a9f1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -359,7 +359,7 @@ class LogisticRegression @Since("1.2.0") ( val bcFeaturesStd = instances.context.broadcast(featuresStd) val costFun = new LogisticCostFun(instances, numClasses, $(fitIntercept), - $(standardization), featuresStd, regParamL2, multinomial = false, standardize = true) + $(standardization), bcFeaturesStd, regParamL2, multinomial = false, standardize = true) val optimizer = if ($(elasticNetParam) == 0.0 || $(regParam) == 0.0) { new BreezeLBFGS[BDV[Double]]($(maxIter), 10, $(tol)) @@ -966,6 +966,18 @@ private class LogisticAggregator( multinomial: Boolean, standardize: Boolean) extends Serializable { + private val numFeaturesPlusIntercept = if (fitIntercept) numFeatures + 1 else numFeatures + private val coefficientSize = bcCoefficients.value.size + if (multinomial) { + require(numClasses == coefficientSize / numFeaturesPlusIntercept, s"The number of " + + s"coefficients should be ${numClasses * numFeaturesPlusIntercept} but was $coefficientSize") + } else { + require(coefficientSize == numFeaturesPlusIntercept, s"Expected $numFeaturesPlusIntercept " + + s"coefficients but got $coefficientSize") + require(numClasses <= 2, s"Binary logistic aggregator requires numClasses in {1, 2}" + + s" but found $numClasses.") + } + private var weightSum = 0.0 private var lossSum = 0.0 @@ -1105,20 +1117,9 @@ private class LogisticAggregator( */ def add(instance: Instance): this.type = { instance match { case Instance(label, weight, features) => - val size = coefficients.size require(numFeatures == features.size, s"Dimension mismatch when adding new instance." 
+ s" Expecting $numFeatures but got ${features.size}") require(weight >= 0.0, s"instance weight, $weight has to be >= 0.0") - if (multinomial) { - require(numClasses == size / numFeaturesPlusIntercept, s"The number" + - s" of coefficients should be ${numClasses * numFeaturesPlusIntercept} but " + - s"was $size") - } else { - require(size == numFeaturesPlusIntercept, s"Expected " + - s"$numFeaturesPlusIntercept coefficients but got $size") - require(numClasses <= 2, s"Binary logistic aggregator requires numClasses in {1, 2}" + - s" but found $numClasses.") - } if (weight == 0.0) return this @@ -1129,14 +1130,13 @@ private class LogisticAggregator( "coefficients only supports dense vector" + s"but got type ${bcCoefficients.value.getClass}.") } - val localGradientSumArray = gradientSumArray if (multinomial) { - multinomialUpdateInPlace(features, weight, label, coefficientsArray, localGradientSumArray, - featuresStd, numFeaturesPlusIntercept, standardize) + multinomialUpdateInPlace(features, weight, label, coefficientsArray, gradientSumArray, + bcFeaturesStd.value, numFeaturesPlusIntercept, standardize) } else { - binaryUpdateInPlace(features, weight, label, coefficientsArray, localGradientSumArray, - featuresStd, numFeaturesPlusIntercept, standardize) + binaryUpdateInPlace(features, weight, label, coefficientsArray, gradientSumArray, + bcFeaturesStd.value, numFeaturesPlusIntercept, standardize) } weightSum += weight this @@ -1213,11 +1213,12 @@ private class LogisticCostFun( val logisticAggregator = { val seqOp = (c: LogisticAggregator, instance: Instance) => - c.add(instance, bcCoeffs.value, localFeaturesStd) + c.add(instance) val combOp = (c1: LogisticAggregator, c2: LogisticAggregator) => c1.merge(c2) instances.treeAggregate( - new LogisticAggregator(numFeatures, numClasses, fitIntercept, multinomial, standardize) + new LogisticAggregator(bcCoeffs, bcFeaturesStd, numFeatures, numClasses, fitIntercept, + multinomial, standardize) )(seqOp, combOp) } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala index c8ffed876a1b..43d081294917 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala @@ -271,8 +271,9 @@ class MultinomialLogisticRegression @Since("2.1.0") ( val regParamL1 = $(elasticNetParam) * $(regParam) val regParamL2 = (1.0 - $(elasticNetParam)) * $(regParam) + val bcFeaturesStd = instances.context.broadcast(featuresStd) val costFun = new LogisticCostFun(standardizedInstances, numClasses, $(fitIntercept), - $(standardization), featuresStd, regParamL2, multinomial = true, standardize = false) + $(standardization), bcFeaturesStd, regParamL2, multinomial = true, standardize = false) val optimizer = if ($(elasticNetParam) == 0.0 || $(regParam) == 0.0) { new BreezeLBFGS[BDV[Double]]($(maxIter), 10, $(tol)) @@ -362,6 +363,7 @@ class MultinomialLogisticRegression @Since("2.1.0") ( logError(msg) throw new SparkException(msg) } + bcFeaturesStd.destroy(blocking = false) /* The coefficients are trained in the scaled space; we're converting them back to From 0d72c6fb974919e99022fc3f1fa6b47424d20d11 Mon Sep 17 00:00:00 2001 From: sethah Date: Mon, 15 Aug 2016 13:47:12 -0700 Subject: [PATCH 06/14] addressing more review comments --- .../classification/LogisticRegression.scala | 43 ++++++++++++++----- 
 .../MultinomialLogisticRegressionSuite.scala |  1 +
 2 files changed, 34 insertions(+), 10 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index dd68f8e8a9f1..d436ff04e0a4 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -692,6 +692,7 @@ object LogisticRegressionModel extends MLReadable[LogisticRegressionModel] {
     val data = sparkSession.read.format("parquet").load(dataPath)

     // We will need numClasses, numFeatures in the future for multinomial logreg support.
+    // TODO: remove numClasses and numFeatures fields?
     val Row(numClasses: Int, numFeatures: Int, intercept: Double, coefficients: Vector) =
       MLUtils.convertVectorColumnsToML(data, "coefficients")
         .select("numClasses", "numFeatures", "intercept", "coefficients")
@@ -964,7 +965,7 @@ private class LogisticAggregator(
     numClasses: Int,
     fitIntercept: Boolean,
     multinomial: Boolean,
-    standardize: Boolean) extends Serializable {
+    standardize: Boolean) extends Serializable with Logging {

   private val numFeaturesPlusIntercept = if (fitIntercept) numFeatures + 1 else numFeatures
   private val coefficientSize = bcCoefficients.value.size
@@ -989,6 +990,13 @@

   private val gradientSumArray = Array.ofDim[Double](totalCoefficientLength)

+  if (multinomial && numClasses < 2) {
+    logInfo(s"Multinomial logistic regression for binary classification yields separate " +
+      s"coefficients for positive and negative classes. When no regularization is applied, the " +
+      s"result will be effectively the same as binary logistic regression. When regularization " +
+      s"is applied, multinomial loss will produce a result different from binary loss.")
+  }
+
   /** Update gradient and loss using binary loss function. */
   private def binaryUpdateInPlace(
       features: Vector,
@@ -1033,7 +1041,7 @@
     }
   }

-  /** Update gradient and loss using multinomial loss function. */
+  /** Update gradient and loss using multinomial (softmax) loss function. */
   private def multinomialUpdateInPlace(
       features: Vector,
       weight: Double,
@@ -1043,11 +1051,14 @@
       featuresStd: Array[Double],
       numFeaturesPlusIntercept: Int,
       standardize: Boolean): Unit = {
+    // TODO: use level 2 BLAS operations
     /*
       Note: this can still be used when numClasses = 2 for binary
      logistic regression without pivoting.
     */
-    var marginY = 0.0
+
+    // marginOfLabel is margins(label) in the formula
+    var marginOfLabel = 0.0
     var maxMargin = Double.NegativeInfinity

     val margins = Array.tabulate(numClasses) { i =>
@@ -1062,13 +1073,18 @@
       if (fitIntercept) {
         margin += coefficients(i * numFeaturesPlusIntercept + features.size)
       }
-      if (i == label.toInt) marginY = margin
+      if (i == label.toInt) marginOfLabel = margin
       if (margin > maxMargin) {
         maxMargin = margin
       }
       margin
     }

+    /**
+     * When maxMargin > 0, the original formula could cause overflow.
+     * We address this by subtracting maxMargin from all the margins, so it's guaranteed
+     * that all of the new margins will be smaller than zero to prevent arithmetic overflow.
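+     * For example, margins of (2.0, 1000.0, 3.0) would overflow if exponentiated
+     * directly, since e^1000 is infinite, but after subtracting maxMargin = 1000.0 the
+     * exponents (-998.0, 0.0, -997.0) are all representable and the resulting
+     * probabilities are unchanged.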
+ */ val sum = { var temp = 0.0 if (maxMargin > 0) { @@ -1101,9 +1117,9 @@ private class LogisticAggregator( } val loss = if (maxMargin > 0) { - math.log(sum) - marginY + maxMargin + math.log(sum) - marginOfLabel + maxMargin } else { - math.log(sum) - marginY + math.log(sum) - marginOfLabel } lossSum += weight * loss } @@ -1117,8 +1133,8 @@ private class LogisticAggregator( */ def add(instance: Instance): this.type = { instance match { case Instance(label, weight, features) => - require(numFeatures == features.size, s"Dimension mismatch when adding new instance." + - s" Expecting $numFeatures but got ${features.size}") + require(numFeatures == features.size, s"Dimensions mismatch when adding new instance." + + s" Expecting $numFeatures but got ${features.size}.") require(weight >= 0.0, s"instance weight, $weight has to be >= 0.0") if (weight == 0.0) return this @@ -1187,8 +1203,8 @@ private class LogisticAggregator( } /** - * LogisticCostFun implements Breeze's DiffFunction[T] for a multinomial logistic loss function, - * as used in multi-class classification (it is also used in binary logistic regression). + * LogisticCostFun implements Breeze's DiffFunction[T] for a multinomial (softmax) logistic loss + * function, as used in multi-class classification (it is also used in binary logistic regression). * It returns the loss and gradient with L2 regularization at a particular point (coefficients). * It's used in Breeze's convex optimization routines. */ @@ -1232,6 +1248,8 @@ private class LogisticCostFun( (0 until K).foreach { k => var j = 0 while (j < numFeatures) { + // The following code will compute the loss of the regularization; also + // the gradient of the regularization, and add back to totalGradientArray. val value = coeffs(k * numFeaturesPlusIntercept + j) sum += { if (standardization) { @@ -1239,6 +1257,11 @@ private class LogisticCostFun( value * value } else { if (featuresStd(j) != 0.0) { + // If `standardization` is false, we still standardize the data + // to improve the rate of convergence; as a result, we have to + // perform this reverse standardization by penalizing each component + // differently to get effectively the same objective function when + // the training dataset is not standardized. 
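+              // In effect, the penalty term used here is
+              // 0.5 * regParamL2 * (value / featuresStd(j))^2, whose gradient,
+              // regParamL2 * value / featuresStd(j)^2, is the `temp` term added below.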
val temp = value / (featuresStd(j) * featuresStd(j)) totalGradientArray(k * numFeaturesPlusIntercept + j) += regParamL2 * temp value * temp diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala index 19f7f29a80c1..c7fa78f9e971 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala @@ -508,6 +508,7 @@ class MultinomialLogisticRegressionSuite assert(model2.coefficients ~== coefficientsR relTol 0.05) assert(model2.intercepts ~== interceptsR relTol 0.05) } + test("multinomial logistic regression without intercept with L2 regularization") { val trainer1 = (new MultinomialLogisticRegression).setFitIntercept(false) .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(true) From 51f51e67077b7b177dc09611b2f1146609660d15 Mon Sep 17 00:00:00 2001 From: sethah Date: Mon, 15 Aug 2016 14:55:22 -0700 Subject: [PATCH 07/14] reverting feature scaling --- .../classification/LogisticRegression.scala | 33 +++++++---------- .../MultinomialLogisticRegression.scala | 37 +++---------------- 2 files changed, 18 insertions(+), 52 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index d436ff04e0a4..fadf8630eb35 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -359,7 +359,7 @@ class LogisticRegression @Since("1.2.0") ( val bcFeaturesStd = instances.context.broadcast(featuresStd) val costFun = new LogisticCostFun(instances, numClasses, $(fitIntercept), - $(standardization), bcFeaturesStd, regParamL2, multinomial = false, standardize = true) + $(standardization), bcFeaturesStd, regParamL2, multinomial = false) val optimizer = if ($(elasticNetParam) == 0.0 || $(regParam) == 0.0) { new BreezeLBFGS[BDV[Double]]($(maxIter), 10, $(tol)) @@ -964,8 +964,7 @@ private class LogisticAggregator( private val numFeatures: Int, numClasses: Int, fitIntercept: Boolean, - multinomial: Boolean, - standardize: Boolean) extends Serializable with Logging { + multinomial: Boolean) extends Serializable with Logging { private val numFeaturesPlusIntercept = if (fitIntercept) numFeatures + 1 else numFeatures private val coefficientSize = bcCoefficients.value.size @@ -1005,14 +1004,12 @@ private class LogisticAggregator( coefficients: Array[Double], gradient: Array[Double], featuresStd: Array[Double], - numFeaturesPlusIntercept: Int, - standardize: Boolean): Unit = { + numFeaturesPlusIntercept: Int): Unit = { val margin = - { var sum = 0.0 features.foreachActive { (index, value) => if (featuresStd(index) != 0.0 && value != 0.0) { - val x = if (standardize) value / featuresStd(index) else value - sum += coefficients(index) * x + sum += coefficients(index) * value / featuresStd(index) } } sum + { @@ -1024,8 +1021,7 @@ private class LogisticAggregator( features.foreachActive { (index, value) => if (featuresStd(index) != 0.0 && value != 0.0) { - val x = if (standardize) value / featuresStd(index) else value - gradient(index) += multiplier * x + gradient(index) += multiplier * value / featuresStd(index) } } @@ -1049,8 +1045,7 @@ private class LogisticAggregator( 
coefficients: Array[Double], gradient: Array[Double], featuresStd: Array[Double], - numFeaturesPlusIntercept: Int, - standardize: Boolean): Unit = { + numFeaturesPlusIntercept: Int): Unit = { // TODO: use level 2 BLAS operations /* Note: this can still be used when numClasses = 2 for binary @@ -1065,8 +1060,7 @@ private class LogisticAggregator( var margin = 0.0 features.foreachActive { (index, value) => if (featuresStd(index) != 0.0 && value != 0.0) { - val x = if (standardize) value / featuresStd(index) else value - margin += coefficients(i * numFeaturesPlusIntercept + index) * x + margin += coefficients(i * numFeaturesPlusIntercept + index) * value / featuresStd(index) } } @@ -1106,8 +1100,8 @@ private class LogisticAggregator( } features.foreachActive { (index, value) => if (featuresStd(index) != 0.0 && value != 0.0) { - val x = if (standardize) value / featuresStd(index) else value - gradient(i * numFeaturesPlusIntercept + index) += weight * multiplier * x + gradient(i * numFeaturesPlusIntercept + index) += + weight * multiplier * value / featuresStd(index) } } if (fitIntercept) { @@ -1149,10 +1143,10 @@ private class LogisticAggregator( if (multinomial) { multinomialUpdateInPlace(features, weight, label, coefficientsArray, gradientSumArray, - bcFeaturesStd.value, numFeaturesPlusIntercept, standardize) + bcFeaturesStd.value, numFeaturesPlusIntercept) } else { binaryUpdateInPlace(features, weight, label, coefficientsArray, gradientSumArray, - bcFeaturesStd.value, numFeaturesPlusIntercept, standardize) + bcFeaturesStd.value, numFeaturesPlusIntercept) } weightSum += weight this @@ -1215,8 +1209,7 @@ private class LogisticCostFun( standardization: Boolean, bcFeaturesStd: Broadcast[Array[Double]], regParamL2: Double, - multinomial: Boolean, - standardize: Boolean) extends DiffFunction[BDV[Double]] { + multinomial: Boolean) extends DiffFunction[BDV[Double]] { val featuresStd = bcFeaturesStd.value @@ -1234,7 +1227,7 @@ private class LogisticCostFun( instances.treeAggregate( new LogisticAggregator(bcCoeffs, bcFeaturesStd, numFeatures, numClasses, fitIntercept, - multinomial, standardize) + multinomial) )(seqOp, combOp) } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala index 43d081294917..878baecc9de1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala @@ -184,6 +184,8 @@ class MultinomialLogisticRegression @Since("2.1.0") ( Instance(label, weight, features) } + if (handlePersistence) instances.persist(StorageLevel.MEMORY_AND_DISK) + val instr = Instrumentation.create(this, instances) instr.logParams(regParam, elasticNetParam, standardization, thresholds, maxIter, tol, fitIntercept) @@ -238,42 +240,13 @@ class MultinomialLogisticRegression @Since("2.1.0") ( } val featuresStd = summarizer.variance.toArray.map(math.sqrt) - val standardizedInstances = instances.map { case Instance(label, weight, features) => - val f = features match { - case DenseVector(vs) => - val values = vs.clone() - val size = values.length - var i = 0 - while (i < size) { - values(i) *= (if (featuresStd(i) != 0.0) 1.0 / featuresStd(i) else 0.0) - i += 1 - } - Vectors.dense(values) - case SparseVector(size, indices, vs) => - val values = vs.clone() - val nnz = values.length - var i = 0 - while (i < nnz) { - values(i) *= (if 
(featuresStd(indices(i)) != 0.0) { - 1.0 / featuresStd(indices(i)) - } else { - 0.0 - }) - i += 1 - } - Vectors.sparse(size, indices, values) - case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass) - } - Instance(label, weight, f) - } - if (handlePersistence) standardizedInstances.persist(StorageLevel.MEMORY_AND_DISK) val regParamL1 = $(elasticNetParam) * $(regParam) val regParamL2 = (1.0 - $(elasticNetParam)) * $(regParam) val bcFeaturesStd = instances.context.broadcast(featuresStd) - val costFun = new LogisticCostFun(standardizedInstances, numClasses, $(fitIntercept), - $(standardization), bcFeaturesStd, regParamL2, multinomial = true, standardize = false) + val costFun = new LogisticCostFun(instances, numClasses, $(fitIntercept), + $(standardization), bcFeaturesStd, regParamL2, multinomial = true) val optimizer = if ($(elasticNetParam) == 0.0 || $(regParam) == 0.0) { new BreezeLBFGS[BDV[Double]]($(maxIter), 10, $(tol)) @@ -356,7 +329,6 @@ class MultinomialLogisticRegression @Since("2.1.0") ( state = states.next() arrayBuilder += state.adjustedValue } - if (handlePersistence) standardizedInstances.unpersist() if (state == null) { val msg = s"${optimizer.getClass.getName} failed." @@ -432,6 +404,7 @@ class MultinomialLogisticRegression @Since("2.1.0") ( (_coefficients, _intercepts, arrayBuilder.result()) } } + if (handlePersistence) instances.unpersist() val model = copyValues( new MultinomialLogisticRegressionModel(uid, coefficients, intercepts, numClasses)) From 96c52a5b2a4fa034bedc92613e88993ce37414b5 Mon Sep 17 00:00:00 2001 From: sethah Date: Tue, 16 Aug 2016 08:58:02 -0700 Subject: [PATCH 08/14] adding derivation to LogisticAggregator doc --- .../classification/LogisticRegression.scala | 164 ++++++++++++++++-- 1 file changed, 150 insertions(+), 14 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index fadf8630eb35..34bc114437ad 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -946,17 +946,160 @@ class BinaryLogisticRegressionSummary private[classification] ( } /** - * LogisticAggregator computes the gradient and loss for binary or multinomial logistic loss - * function, as used in classification for instances in sparse or dense vector in an online fashion. + * LogisticAggregator computes the gradient and loss for binary or multinomial logistic (softmax) + * loss function, as used in classification for instances in sparse or dense vector in an online + * fashion. * - * Two LogisticAggregator can be merged together to have a summary of loss and gradient of + * Two LogisticAggregators can be merged together to have a summary of loss and gradient of * the corresponding joint dataset. * + * For improving the convergence rate during the optimization process and also to prevent against + * features with very large variances exerting an overly large influence during model training, + * packages like R's GLMNET perform the scaling to unit variance and remove the mean in order to + * reduce the condition number. The model is then trained in this scaled space, but returns the + * coefficients in the original scale. 
See page 9 in
+ * http://cran.r-project.org/web/packages/glmnet/glmnet.pdf
+ *
+ * However, we don't want to apply the [[org.apache.spark.ml.feature.StandardScaler]] on the
+ * training dataset and then cache the standardized dataset, since that would create a lot of
+ * overhead. As a result, we perform the scaling implicitly when we compute the objective
+ * function (though we do not subtract the mean).
+ *
+ * Note that there is a difference between multinomial (softmax) and binary loss. The binary case
+ * uses one outcome class as a "pivot" and regresses the other class against the pivot. In the
+ * multinomial case, the softmax loss function is used to model each class probability
+ * independently. Using softmax loss produces `K` sets of coefficients, while using a pivot class
+ * produces `K - 1` sets of coefficients (a single coefficient vector in the binary case). In the
+ * binary case, we can say that the coefficients are shared between the positive and negative
+ * classes. When regularization is applied, multinomial (softmax) loss will produce a result
+ * different from binary loss, since the positive and negative classes do not share coefficients
+ * in the multinomial case, while binary regression does share them.
+ *
+ * The following is a mathematical derivation for the multinomial (softmax) loss.
+ *
+ * The probability of the multinomial outcome $y$ taking on any of the K possible outcomes is:
+ *
+ * <blockquote>
+ * $$
+ * P(y_i=0|\vec{x}_i, \beta) = \frac{e^{\vec{x}_i^T \vec{\beta}_0}}{\sum_{k=0}^{K-1}
+ * e^{\vec{x}_i^T \vec{\beta}_k}} \\
+ * P(y_i=1|\vec{x}_i, \beta) = \frac{e^{\vec{x}_i^T \vec{\beta}_1}}{\sum_{k=0}^{K-1}
+ * e^{\vec{x}_i^T \vec{\beta}_k}}\\
+ * ... \\
+ * P(y_i=K-1|\vec{x}_i, \beta) = \frac{e^{\vec{x}_i^T \vec{\beta}_{K-1}}}{\sum_{k=0}^{K-1}
+ * e^{\vec{x}_i^T \vec{\beta}_k}}\\
+ * $$
+ * </blockquote>
+ *
+ * The model coefficients $\beta = (\beta_1, \beta_2, ..., \beta_{K-1})^T$ become a matrix
+ * which has dimension of $K \times (N+1)$ if the intercepts are added. If the intercepts are not
+ * added, the dimension will be $K \times N$.
+ *
+ * Note that the coefficients in the model above lack identifiability. That is, any constant scalar
+ * can be added to all of the coefficients and the probabilities remain the same.
+ *
+ * <blockquote>
+ * $$
+ * \begin{align}
+ * \frac{e^{\vec{x}_i^T \left(\vec{\beta}_0 + \vec{c}\right)}}{\sum_{k=0}^{K-1}
+ * e^{\vec{x}_i^T \left(\vec{\beta}_k + \vec{c}\right)}}
+ * = \frac{e^{\vec{x}_i^T \vec{\beta}_0}e^{\vec{x}_i^T \vec{c}}}{e^{\vec{x}_i^T \vec{c}}
+ * \sum_{k=0}^{K-1} e^{\vec{x}_i^T \vec{\beta}_k}}
+ * = \frac{e^{\vec{x}_i^T \vec{\beta}_0}}{\sum_{k=0}^{K-1} e^{\vec{x}_i^T \vec{\beta}_k}}
+ * \end{align}
+ * $$
+ * </blockquote>
+ *
+ * However, when regularization is added to the loss function, the coefficients are indeed
+ * identifiable because there is only one set of coefficients which minimizes the regularization
+ * term. When no regularization is applied, we choose the coefficients with the minimum L2
+ * penalty for consistency and reproducibility. For further discussion see:
+ *
+ * Friedman, et al. "Regularization Paths for Generalized Linear Models via Coordinate Descent"
+ *
+ * The loss of the objective function for a single instance of data (we do not include the
+ * regularization term here for simplicity) can be written as
+ *
+ * <blockquote>
+ * $$
+ * \begin{align}
+ * \ell\left(\beta, x_i\right) &= -log{P\left(y_i \middle| \vec{x}_i, \beta\right)} \\
+ * &= log\left(\sum_{k=0}^{K-1}e^{\vec{x}_i^T \vec{\beta}_k}\right) - \vec{x}_i^T \vec{\beta}_y\\
+ * &= log\left(\sum_{k=0}^{K-1} e^{margins_k}\right) - margins_y
+ * \end{align}
+ * $$
+ * </blockquote>
+ *
+ * where ${margins}_k = \vec{x}_i^T \vec{\beta}_k$.
+ *
+ * For optimization, we have to calculate the first derivative of the loss function, and a simple
+ * calculation shows that
+ *
+ * <blockquote>
+ * $$
+ * \begin{align}
+ * \frac{\partial \ell(\beta, \vec{x}_i, w_i)}{\partial \beta_{j, k}} &=
+ * x_{i,j} \cdot w_i \cdot \left(\frac{e^{\vec{x}_i \cdot \vec{\beta}_k}}{\sum_{k'=0}^{K-1}
+ * e^{\vec{x}_i \cdot \vec{\beta}_{k'}}} - I_{y=k}\right) \\
+ * &= x_{i, j} \cdot w_i \cdot multiplier_k
+ * \end{align}
+ * $$
+ * </blockquote>
+ *
+ * where $w_i$ is the sample weight and $I_{y=k}$ is an indicator function
+ *
+ * <blockquote>
+ * $$
+ * I_{y=k} = \begin{cases}
+ *           1 & y = k \\
+ *           0 & else
+ *           \end{cases}
+ * $$
+ * </blockquote>
+ *
+ * and
+ *
+ * <blockquote>
+ * $$
+ * multiplier_k = \left(\frac{e^{\vec{x}_i \cdot \vec{\beta}_y}}{\sum_{k=0}^{K-1}
+ * e^{\vec{x}_i \cdot \vec{\beta}_k}} - I_{y=k}\right)
+ * $$
+ * </blockquote>
+ *
+ * If any of the margins is larger than 709.78, the numerical computation of the multiplier and
+ * the loss function will suffer from arithmetic overflow. This issue occurs when there are
+ * outliers in the data which are far away from the hyperplane, and it will cause training to
+ * fail once infinity is introduced. Note that this is only a concern when max(margins) > 0.
+ *
+ * Fortunately, when max(margins) = maxMargin > 0, the loss function and the multiplier can easily
+ * be rewritten into the following equivalent numerically stable formula.
+ *
+ * <blockquote>
+ * $$
+ * \ell\left(\beta, x\right) = log\left(\sum_{k=0}^{K-1} e^{margins_k - maxMargin}\right) -
+ * margins_{y} + maxMargin
+ * $$
+ * </blockquote>
+ *
+ * Note that each term $(margins_k - maxMargin)$ in the exponential is no greater than zero; as a
+ * result, overflow will not happen with this formula.
+ *
+ * For $multiplier$, a similar trick can be applied as follows:
+ *
+ * <blockquote>
+ * $$
+ * multiplier_k = \left(\frac{e^{\vec{x}_i \cdot \vec{\beta}_k - maxMargin}}{\sum_{k'=0}^{K-1}
+ * e^{\vec{x}_i \cdot \vec{\beta}_{k'} - maxMargin}} - I_{y=k}\right)
+ * $$
+ * </blockquote>
+ * * @param bcCoefficients The broadcast coefficients corresponding to the features. * @param bcFeaturesStd The broadcast standard deviation values of the features. + * @param numFeatures The number of features for the input data. * @param numClasses the number of possible outcomes for k classes classification problem in * Multinomial Logistic Regression. * @param fitIntercept Whether to fit an intercept term. + * @param multinomial Whether to use multinomial or binary loss */ private class LogisticAggregator( val bcCoefficients: Broadcast[Vector], @@ -981,15 +1124,9 @@ private class LogisticAggregator( private var weightSum = 0.0 private var lossSum = 0.0 - private val totalCoefficientLength = { - val cols = if (fitIntercept) numFeatures + 1 else numFeatures - val rows = if (multinomial) numClasses else 1 - rows * cols - } - - private val gradientSumArray = Array.ofDim[Double](totalCoefficientLength) + private val gradientSumArray = Array.ofDim[Double](coefficientSize) - if (multinomial && numClasses < 2) { + if (multinomial && numClasses <= 2) { logInfo(s"Multinomial logistic regression for binary classification yields separate " + s"coefficients for positive and negative classes. When no regularization is applied, the" + s"result will be effectively the same as binary logistic regression. When regularization" + @@ -1012,9 +1149,8 @@ private class LogisticAggregator( sum += coefficients(index) * value / featuresStd(index) } } - sum + { - if (fitIntercept) coefficients(numFeaturesPlusIntercept - 1) else 0.0 - } + if (fitIntercept) sum += coefficients(numFeaturesPlusIntercept - 1) + sum } val multiplier = weight * (1.0 / (1.0 + math.exp(margin)) - label) From b4203a51f798b909029ff42ff7cbc6f6dc284422 Mon Sep 17 00:00:00 2001 From: sethah Date: Tue, 16 Aug 2016 15:06:37 -0700 Subject: [PATCH 09/14] fixing latex rendering --- .../classification/LogisticRegression.scala | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 34bc114437ad..9bee190463f0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -982,16 +982,15 @@ class BinaryLogisticRegressionSummary private[classification] ( *

* $$ * P(y_i=0|\vec{x}_i, \beta) = \frac{e^{\vec{x}_i^T \vec{\beta}_0}}{\sum_{k=0}^{K-1} - * e^{\vec{x}_i^T \vec{\beta}_k}} \\ + * e^{\vec{x}_i^T \vec{\beta}_k}} \\ * P(y_i=1|\vec{x}_i, \beta) = \frac{e^{\vec{x}_i^T \vec{\beta}_1}}{\sum_{k=0}^{K-1} - * e^{\vec{x}_i^T \vec{\beta}_k}}\\ - * ... \\ - * P(y_i=K-1|\vec{x}_i, \beta) = \frac{e^{\vec{x}_i^T \vec{\beta}_{K-1}}}{\sum_{k=0}^{K-1} - * e^{\vec{x}_i^T \vec{\beta}_k}}\\ + * e^{\vec{x}_i^T \vec{\beta}_k}}\\ + * P(y_i=K-1|\vec{x}_i, \beta) = \frac{e^{\vec{x}_i^T \vec{\beta}_{K-1}}\,}{\sum_{k=0}^{K-1} + * e^{\vec{x}_i^T \vec{\beta}_k}} * $$ *

* - * The model coefficients $\beta = (\beta_1, \beta_2, ..., \beta_{K-1})^T$ become a matrix + * The model coefficients $\beta = (\beta_1, \beta_2, ..., \beta_{K-1})$ become a matrix * which has dimension of $K \times (N+1)$ if the intercepts are added. If the intercepts are not * added, the dimension will be $K \times N$. * @@ -1003,7 +1002,7 @@ class BinaryLogisticRegressionSummary private[classification] ( * \begin{align} * \frac{e^{\vec{x}_i^T \left(\vec{\beta}_0 + \vec{c}\right)}}{\sum_{k=0}^{K-1} * e^{\vec{x}_i^T \left(\vec{\beta}_k + \vec{c}\right)}} - * = \frac{e^{\vec{x}_i^T \vec{\beta}_0}e^{\vec{x}_i^T \vec{c}}}{e^{\vec{x}_i^T \vec{c}} + * = \frac{e^{\vec{x}_i^T \vec{\beta}_0}e^{\vec{x}_i^T \vec{c}}\,}{e^{\vec{x}_i^T \vec{c}} * \sum_{k=0}^{K-1} e^{\vec{x}_i^T \vec{\beta}_k}} * = \frac{e^{\vec{x}_i^T \vec{\beta}_0}}{\sum_{k=0}^{K-1} e^{\vec{x}_i^T \vec{\beta}_k}} * \end{align} @@ -1038,9 +1037,9 @@ class BinaryLogisticRegressionSummary private[classification] ( *

* $$ * \begin{align} - * \frac{\partial \ell(\beta, \vec{x}_i, w_i)}{\partial \beta_{j, k}} &= - * x_{i,j} \cdot w_i \cdot \left(\frac{e^{\vec{x}_i \cdot \vec{\beta}_k}}{\sum_{k'=0}^{K-1} - * e^{\vec{x}_i \cdot \vec{\beta}_{k'}}} - I_{y=k}\right) \\ + * \frac{\partial \ell(\beta, \vec{x}_i, w_i)}{\partial \beta_{j, k}} + * &= x_{i,j} \cdot w_i \cdot \left(\frac{e^{\vec{x}_i \cdot \vec{\beta}_k}}{\sum_{k'=0}^{K-1} + * e^{\vec{x}_i \cdot \vec{\beta}_{k'}}\,} - I_{y=k}\right) \\ * &= x_{i, j} \cdot w_i \cdot multiplier_k * \end{align} * $$ @@ -1061,7 +1060,7 @@ class BinaryLogisticRegressionSummary private[classification] ( * *

* $$ - * multiplier_k = \left(\frac{e^{\vec{x}_i \cdot \vec{\beta}_y}}{\sum_{k=0}^{K-1} + * multiplier_k = \left(\frac{e^{\vec{x}_i \cdot \vec{\beta}_k}}{\sum_{k=0}^{K-1} * e^{\vec{x}_i \cdot \vec{\beta}_k}} - I_{y=k}\right) * $$ *

From 61521e6fc9fef06fd0688634024a2c377855f089 Mon Sep 17 00:00:00 2001 From: sethah Date: Wed, 17 Aug 2016 13:47:25 -0700 Subject: [PATCH 10/14] do not assume coefficients are dense --- .../classification/LogisticRegression.scala | 36 +++---- .../MultinomialLogisticRegression.scala | 100 +++++++++--------- 2 files changed, 66 insertions(+), 70 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 9bee190463f0..56c43f9a7467 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -1101,8 +1101,8 @@ class BinaryLogisticRegressionSummary private[classification] ( * @param multinomial Whether to use multinomial or binary loss */ private class LogisticAggregator( - val bcCoefficients: Broadcast[Vector], - val bcFeaturesStd: Broadcast[Array[Double]], + bcCoefficients: Broadcast[Vector], + bcFeaturesStd: Broadcast[Array[Double]], private val numFeatures: Int, numClasses: Int, fitIntercept: Boolean, @@ -1346,18 +1346,15 @@ private class LogisticCostFun( regParamL2: Double, multinomial: Boolean) extends DiffFunction[BDV[Double]] { - val featuresStd = bcFeaturesStd.value override def calculate(coefficients: BDV[Double]): (Double, BDV[Double]) = { val coeffs = Vectors.fromBreeze(coefficients) val bcCoeffs = instances.context.broadcast(coeffs) - val localFeaturesStd = featuresStd - val numFeatures = localFeaturesStd.length - val numFeaturesPlusIntercept = if (fitIntercept) numFeatures + 1 else numFeatures + val featuresStd = bcFeaturesStd.value + val numFeatures = featuresStd.length val logisticAggregator = { - val seqOp = (c: LogisticAggregator, instance: Instance) => - c.add(instance) + val seqOp = (c: LogisticAggregator, instance: Instance) => c.add(instance) val combOp = (c1: LogisticAggregator, c2: LogisticAggregator) => c1.merge(c2) instances.treeAggregate( @@ -1371,34 +1368,37 @@ private class LogisticCostFun( val regVal = if (regParamL2 == 0.0) { 0.0 } else { - val K = if (multinomial) numClasses else numClasses - 1 var sum = 0.0 - (0 until K).foreach { k => - var j = 0 - while (j < numFeatures) { + coeffs.foreachActive { case (index, value) => + // We do not apply regularization to the intercepts + val isIntercept = fitIntercept && ((index + 1) % (numFeatures + 1) == 0) + if (!isIntercept) { // The following code will compute the loss of the regularization; also // the gradient of the regularization, and add back to totalGradientArray. - val value = coeffs(k * numFeaturesPlusIntercept + j) sum += { if (standardization) { - totalGradientArray(k * numFeaturesPlusIntercept + j) += regParamL2 * value + totalGradientArray(index) += regParamL2 * value value * value } else { - if (featuresStd(j) != 0.0) { + val featureIndex = if (fitIntercept) { + index % (numFeatures + 1) + } else { + index % numFeatures + } + if (featuresStd(featureIndex) != 0.0) { // If `standardization` is false, we still standardize the data // to improve the rate of convergence; as a result, we have to // perform this reverse standardization by penalizing each component // differently to get effectively the same objective function when // the training dataset is not standardized. 
- val temp = value / (featuresStd(j) * featuresStd(j)) - totalGradientArray(k * numFeaturesPlusIntercept + j) += regParamL2 * temp + val temp = value / (featuresStd(featureIndex) * featuresStd(featureIndex)) + totalGradientArray(index) += regParamL2 * temp value * temp } else { 0.0 } } } - j += 1 } } 0.5 * regParamL2 * sum diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala index 878baecc9de1..9f81600c311d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala @@ -345,63 +345,59 @@ class MultinomialLogisticRegression @Since("2.1.0") ( */ var interceptSum = 0.0 var coefSum = 0.0 - val rawCoefficients = state.x.toArray.clone() - val coefArray = Array.ofDim[Double](numFeatures * numClasses) - val interceptArray = Array.ofDim[Double](if (getFitIntercept) numClasses else 0) - (0 until numClasses).foreach { k => - var i = 0 - while (i < numFeatures) { - val rawValue = rawCoefficients(k * numFeaturesPlusIntercept + i) - val unscaledCoef = - rawValue * { if (featuresStd(i) != 0.0) 1.0 / featuresStd(i) else 0.0 } - coefArray(k * numFeatures + i) = unscaledCoef - coefSum += unscaledCoef - i += 1 - } - if (getFitIntercept) { - val intercept = rawCoefficients(k * numFeaturesPlusIntercept + numFeatures) - interceptArray(k) = intercept - interceptSum += intercept - } + val rawCoefficients = Vectors.fromBreeze(state.x) + val (coefMatrix, interceptVector) = rawCoefficients match { + case dv: DenseVector => + val coefArray = Array.tabulate(numClasses * numFeatures) { i => + val flatIndex = if ($(fitIntercept)) i + i / numFeatures else i + val featureIndex = i % numFeatures + val unscaledCoef = if (featuresStd(featureIndex) != 0.0) { + dv(flatIndex) / featuresStd(featureIndex) + } else { + 0.0 + } + coefSum += unscaledCoef + unscaledCoef + } + val interceptVector = if ($(fitIntercept)) { + Vectors.dense(Array.tabulate(numClasses) { i => + val coefIndex = (i + 1) * numFeaturesPlusIntercept - 1 + val intercept = dv(coefIndex) + interceptSum += intercept + intercept + }) + } else { + Vectors.sparse(numClasses, Seq()) + } + (new DenseMatrix(numClasses, numFeatures, coefArray, isTransposed = true), + interceptVector) + case sv: SparseVector => + throw new IllegalArgumentException("SparseVector is not supported for coefficients") } - val _coefficients = { - /* - When no regularization is applied, the coefficients lack identifiability because - we do not use a pivot class. We can add any constant value to the coefficients and - get the same likelihood. So here, we choose the mean centered coefficients for - reproducibility. This method follows the approach in glmnet, described here: + /* + When no regularization is applied, the coefficients lack identifiability because + we do not use a pivot class. We can add any constant value to the coefficients and + get the same likelihood. So here, we choose the mean centered coefficients for + reproducibility. This method follows the approach in glmnet, described here: - Friedman, et al. 
"Regularization Paths for Generalized Linear Models via - Coordinate Descent," https://core.ac.uk/download/files/153/6287975.pdf - */ - if ($(regParam) == 0) { - val coefficientMean = coefSum / (numClasses * numFeatures) - var i = 0 - while (i < coefArray.length) { - coefArray(i) -= coefficientMean - i += 1 - } - } - new DenseMatrix(numClasses, numFeatures, coefArray, isTransposed = true) + Friedman, et al. "Regularization Paths for Generalized Linear Models via + Coordinate Descent," https://core.ac.uk/download/files/153/6287975.pdf + */ + if ($(regParam) == 0.0) { + val coefficientMean = coefSum / (numClasses * numFeatures) + coefMatrix.update(_ - coefficientMean) } - - val _intercepts = if (getFitIntercept) { - /* - The intercepts are never regularized, so we always center the mean. - */ - val interceptMean = interceptSum / numClasses - var k = 0 - while (k < interceptArray.length) { - interceptArray(k) -= interceptMean - k += 1 - } - Vectors.dense(interceptArray) - } else { - Vectors.sparse(numClasses, Seq()) + /* + The intercepts are never regularized, so we always center the mean. + */ + val interceptMean = interceptSum / numClasses + interceptVector match { + case dv: DenseVector => (0 until dv.size).foreach { i => dv.toArray(i) -= interceptMean } + case sv: SparseVector => + (0 until sv.numNonzeros).foreach { i => sv.values(i) -= interceptMean } } - - (_coefficients, _intercepts, arrayBuilder.result()) + (coefMatrix, interceptVector, arrayBuilder.result()) } } if (handlePersistence) instances.unpersist() From f02b226e1170c8cc1c2d1c6acade950e0be44962 Mon Sep 17 00:00:00 2001 From: sethah Date: Wed, 17 Aug 2016 17:08:40 -0700 Subject: [PATCH 11/14] address some review comments --- .../classification/LogisticRegression.scala | 84 ++++++++----------- .../MultinomialLogisticRegression.scala | 25 ++---- 2 files changed, 43 insertions(+), 66 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 56c43f9a7467..a571caec6bd7 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -155,8 +155,8 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas /** * Logistic regression. - * Currently, this class only supports binary classification. It will support multiclass - * in the future. + * Currently, this class only supports binary classification. For multiclass classification, + * use [[MultinomialLogisticRegression]] */ @Since("1.2.0") class LogisticRegression @Since("1.2.0") ( @@ -241,8 +241,8 @@ class LogisticRegression @Since("1.2.0") ( override def getThreshold: Double = super.getThreshold /** - * Whether to over-/under-sample training instances according to the given weights in weightCol. - * If not set or empty String, all instances are treated equally (weight 1.0). + * Sets the value of param [[weightCol]]. + * If this is not set or empty, we treat all instance weights as 1.0. * Default is not set, so all instances have weight one. * * @group setParam @@ -320,7 +320,7 @@ class LogisticRegression @Since("1.2.0") ( } if (numClasses > 2) { - val msg = s"Currently, LogisticRegression with ElasticNet in ML package only supports " + + val msg = s"LogisticRegression with ElasticNet in ML package only supports " + s"binary classification. Found $numClasses in the input dataset. 
Consider using " + s"MultinomialLogisticRegression instead." logError(msg) @@ -990,7 +990,7 @@ class BinaryLogisticRegressionSummary private[classification] ( * $$ *
 *
* - * The model coefficients $\beta = (\beta_1, \beta_2, ..., \beta_{K-1})$ become a matrix + * The model coefficients $\beta = (\beta_0, \beta_1, \beta_2, ..., \beta_{K-1})$ become a matrix * which has dimension of $K \times (N+1)$ if the intercepts are added. If the intercepts are not * added, the dimension will be $K \times N$. * @@ -1094,20 +1094,19 @@ class BinaryLogisticRegressionSummary private[classification] ( * * @param bcCoefficients The broadcast coefficients corresponding to the features. * @param bcFeaturesStd The broadcast standard deviation values of the features. - * @param numFeatures The number of features for the input data. * @param numClasses the number of possible outcomes for k classes classification problem in * Multinomial Logistic Regression. * @param fitIntercept Whether to fit an intercept term. - * @param multinomial Whether to use multinomial or binary loss + * @param multinomial Whether to use multinomial (softmax) or binary loss */ private class LogisticAggregator( bcCoefficients: Broadcast[Vector], bcFeaturesStd: Broadcast[Array[Double]], - private val numFeatures: Int, numClasses: Int, fitIntercept: Boolean, multinomial: Boolean) extends Serializable with Logging { + private val numFeatures = bcFeaturesStd.value.length private val numFeaturesPlusIntercept = if (fitIntercept) numFeatures + 1 else numFeatures private val coefficientSize = bcCoefficients.value.size if (multinomial) { @@ -1116,8 +1115,8 @@ private class LogisticAggregator( } else { require(coefficientSize == numFeaturesPlusIntercept, s"Expected $numFeaturesPlusIntercept " + s"coefficients but got $coefficientSize") - require(numClasses <= 2, s"Binary logistic aggregator requires numClasses in {1, 2}" + - s" but found $numClasses.") + require(numClasses == 1 || numClasses == 2, s"Binary logistic aggregator requires numClasses " + + s"in {1, 2} but found $numClasses.") } private var weightSum = 0.0 @@ -1136,32 +1135,32 @@ private class LogisticAggregator( private def binaryUpdateInPlace( features: Vector, weight: Double, - label: Double, - coefficients: Array[Double], - gradient: Array[Double], - featuresStd: Array[Double], - numFeaturesPlusIntercept: Int): Unit = { + label: Double): Unit = { + + val localFeaturesStd = bcFeaturesStd.value + val localCoefficients = bcCoefficients.value + val localGradientArray = gradientSumArray val margin = - { var sum = 0.0 features.foreachActive { (index, value) => - if (featuresStd(index) != 0.0 && value != 0.0) { - sum += coefficients(index) * value / featuresStd(index) + if (localFeaturesStd(index) != 0.0 && value != 0.0) { + sum += localCoefficients(index) * value / localFeaturesStd(index) } } - if (fitIntercept) sum += coefficients(numFeaturesPlusIntercept - 1) + if (fitIntercept) sum += localCoefficients(numFeaturesPlusIntercept - 1) sum } val multiplier = weight * (1.0 / (1.0 + math.exp(margin)) - label) features.foreachActive { (index, value) => - if (featuresStd(index) != 0.0 && value != 0.0) { - gradient(index) += multiplier * value / featuresStd(index) + if (localFeaturesStd(index) != 0.0 && value != 0.0) { + localGradientArray(index) += multiplier * value / localFeaturesStd(index) } } if (fitIntercept) { - gradient(numFeaturesPlusIntercept - 1) += multiplier + localGradientArray(numFeaturesPlusIntercept - 1) += multiplier } if (label > 0) { @@ -1176,16 +1175,15 @@ private class LogisticAggregator( private def multinomialUpdateInPlace( features: Vector, weight: Double, - label: Double, - coefficients: Array[Double], - gradient: Array[Double], - 
featuresStd: Array[Double], - numFeaturesPlusIntercept: Int): Unit = { + label: Double): Unit = { // TODO: use level 2 BLAS operations /* Note: this can still be used when numClasses = 2 for binary logistic regression without pivoting. */ + val localFeaturesStd = bcFeaturesStd.value + val localCoefficients = bcCoefficients.value + val localGradientArray = gradientSumArray // marginOfLabel is margins(label) in the formula var marginOfLabel = 0.0 @@ -1194,13 +1192,14 @@ private class LogisticAggregator( val margins = Array.tabulate(numClasses) { i => var margin = 0.0 features.foreachActive { (index, value) => - if (featuresStd(index) != 0.0 && value != 0.0) { - margin += coefficients(i * numFeaturesPlusIntercept + index) * value / featuresStd(index) + if (localFeaturesStd(index) != 0.0 && value != 0.0) { + margin += localCoefficients(i * numFeaturesPlusIntercept + index) * + value / localFeaturesStd(index) } } if (fitIntercept) { - margin += coefficients(i * numFeaturesPlusIntercept + features.size) + margin += localCoefficients(i * numFeaturesPlusIntercept + numFeatures) } if (i == label.toInt) marginOfLabel = margin if (margin > maxMargin) { @@ -1234,14 +1233,13 @@ private class LogisticAggregator( if (label == i) 1.0 else 0.0 } features.foreachActive { (index, value) => - if (featuresStd(index) != 0.0 && value != 0.0) { - gradient(i * numFeaturesPlusIntercept + index) += - weight * multiplier * value / featuresStd(index) + if (localFeaturesStd(index) != 0.0 && value != 0.0) { + localGradientArray(i * numFeaturesPlusIntercept + index) += + weight * multiplier * value / localFeaturesStd(index) } } if (fitIntercept) { - gradient(i * numFeaturesPlusIntercept + features.size) += - weight * multiplier + localGradientArray(i * numFeaturesPlusIntercept + numFeatures) += weight * multiplier } } @@ -1268,20 +1266,10 @@ private class LogisticAggregator( if (weight == 0.0) return this - val coefficientsArray = bcCoefficients.value match { - case dv: DenseVector => dv.values - case _ => - throw new IllegalArgumentException( - "coefficients only supports dense vector" + - s"but got type ${bcCoefficients.value.getClass}.") - } - if (multinomial) { - multinomialUpdateInPlace(features, weight, label, coefficientsArray, gradientSumArray, - bcFeaturesStd.value, numFeaturesPlusIntercept) + multinomialUpdateInPlace(features, weight, label) } else { - binaryUpdateInPlace(features, weight, label, coefficientsArray, gradientSumArray, - bcFeaturesStd.value, numFeaturesPlusIntercept) + binaryUpdateInPlace(features, weight, label) } weightSum += weight this @@ -1358,7 +1346,7 @@ private class LogisticCostFun( val combOp = (c1: LogisticAggregator, c2: LogisticAggregator) => c1.merge(c2) instances.treeAggregate( - new LogisticAggregator(bcCoeffs, bcFeaturesStd, numFeatures, numClasses, fitIntercept, + new LogisticAggregator(bcCoeffs, bcFeaturesStd, numClasses, fitIntercept, multinomial) )(seqOp, combOp) } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala index 9f81600c311d..1eb6c1802e35 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala @@ -40,7 +40,7 @@ import org.apache.spark.sql.types.DoubleType import org.apache.spark.storage.StorageLevel /** - * Params for multinomial logistic regression. 
+ * Params for multinomial logistic (softmax) regression. */ private[classification] trait MultinomialLogisticRegressionParams extends ProbabilisticClassifierParams with HasRegParam with HasElasticNetParam with HasMaxIter @@ -70,7 +70,7 @@ private[classification] trait MultinomialLogisticRegressionParams /** * :: Experimental :: - * Multinomial Logistic regression. + * Multinomial Logistic (softmax) regression. */ @Since("2.1.0") @Experimental @@ -91,7 +91,6 @@ class MultinomialLogisticRegression @Since("2.1.0") ( */ @Since("2.1.0") def setRegParam(value: Double): this.type = set(regParam, value) - setDefault(regParam -> 0.0) /** @@ -104,7 +103,6 @@ class MultinomialLogisticRegression @Since("2.1.0") ( */ @Since("2.1.0") def setElasticNetParam(value: Double): this.type = set(elasticNetParam, value) - setDefault(elasticNetParam -> 0.0) /** @@ -115,7 +113,6 @@ class MultinomialLogisticRegression @Since("2.1.0") ( */ @Since("2.1.0") def setMaxIter(value: Int): this.type = set(maxIter, value) - setDefault(maxIter -> 100) /** @@ -127,7 +124,6 @@ class MultinomialLogisticRegression @Since("2.1.0") ( */ @Since("2.1.0") def setTol(value: Double): this.type = set(tol, value) - setDefault(tol -> 1E-6) /** @@ -138,7 +134,6 @@ class MultinomialLogisticRegression @Since("2.1.0") ( */ @Since("2.1.0") def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value) - setDefault(fitIntercept -> true) /** @@ -153,7 +148,6 @@ class MultinomialLogisticRegression @Since("2.1.0") ( */ @Since("2.1.0") def setStandardization(value: Boolean): this.type = set(standardization, value) - setDefault(standardization -> true) /** @@ -170,13 +164,6 @@ class MultinomialLogisticRegression @Since("2.1.0") ( override def setThresholds(value: Array[Double]): this.type = super.setThresholds(value) override protected[spark] def train(dataset: Dataset[_]): MultinomialLogisticRegressionModel = { - val handlePersistence = dataset.rdd.getStorageLevel == StorageLevel.NONE - train(dataset, handlePersistence) - } - - protected[spark] def train( - dataset: Dataset[_], - handlePersistence: Boolean): MultinomialLogisticRegressionModel = { val w = if (!isDefined(weightCol) || $(weightCol).isEmpty) lit(1.0) else col($(weightCol)) val instances: RDD[Instance] = dataset.select(col($(labelCol)).cast(DoubleType), w, col($(featuresCol))).rdd.map { @@ -184,6 +171,7 @@ class MultinomialLogisticRegression @Since("2.1.0") ( Instance(label, weight, features) } + val handlePersistence = dataset.rdd.getStorageLevel == StorageLevel.NONE if (handlePersistence) instances.persist(StorageLevel.MEMORY_AND_DISK) val instr = Instrumentation.create(this, instances) @@ -192,12 +180,12 @@ class MultinomialLogisticRegression @Since("2.1.0") ( val (summarizer, labelSummarizer) = { val seqOp = (c: (MultivariateOnlineSummarizer, MultiClassSummarizer), - instance: Instance) => + instance: Instance) => (c._1.add(instance.features, instance.weight), c._2.add(instance.label, instance.weight)) val combOp = (c1: (MultivariateOnlineSummarizer, MultiClassSummarizer), - c2: (MultivariateOnlineSummarizer, MultiClassSummarizer)) => - (c1._1.merge(c2._1), c1._2.merge(c2._2)) + c2: (MultivariateOnlineSummarizer, MultiClassSummarizer)) => + (c1._1.merge(c2._1), c1._2.merge(c2._2)) instances.treeAggregate( new MultivariateOnlineSummarizer, new MultiClassSummarizer)(seqOp, combOp) @@ -207,6 +195,7 @@ class MultinomialLogisticRegression @Since("2.1.0") ( val numInvalid = labelSummarizer.countInvalid val numFeatures = summarizer.mean.size val numFeaturesPlusIntercept = if 
(getFitIntercept) numFeatures + 1 else numFeatures + val numClasses = MetadataUtils.getNumClasses(dataset.schema($(labelCol))) match { case Some(n: Int) => require(n >= histogram.length, s"Specified number of classes $n was " + From 0c851d7548f6fdc6589544e8d3ba7db4a6dafbf9 Mon Sep 17 00:00:00 2001 From: sethah Date: Thu, 18 Aug 2016 10:18:36 -0700 Subject: [PATCH 12/14] performance speedups in prediction and other review --- .../classification/LogisticRegression.scala | 13 +- .../MultinomialLogisticRegression.scala | 216 +++++++++--------- .../MultinomialLogisticRegressionSuite.scala | 78 ++++--- 3 files changed, 163 insertions(+), 144 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index a571caec6bd7..ea31c68e4c94 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -319,13 +319,15 @@ class LogisticRegression @Since("1.2.0") ( throw new SparkException(msg) } + val isConstantLabel = histogram.count(_ != 0) == 1 + if (numClasses > 2) { val msg = s"LogisticRegression with ElasticNet in ML package only supports " + s"binary classification. Found $numClasses in the input dataset. Consider using " + s"MultinomialLogisticRegression instead." logError(msg) throw new SparkException(msg) - } else if ($(fitIntercept) && numClasses == 2 && histogram(0) == 0.0) { + } else if ($(fitIntercept) && numClasses == 2 && isConstantLabel) { logWarning(s"All labels are one and fitIntercept=true, so the coefficients will be " + s"zeros and the intercept will be positive infinity; as a result, " + s"training is not needed.") @@ -336,12 +338,9 @@ class LogisticRegression @Since("1.2.0") ( s"training is not needed.") (Vectors.sparse(numFeatures, Seq()), Double.NegativeInfinity, Array.empty[Double]) } else { - if (!$(fitIntercept) && numClasses == 2 && histogram(0) == 0.0) { - logWarning(s"All labels are one and fitIntercept=false. It's a dangerous ground, " + - s"so the algorithm may not converge.") - } else if (!$(fitIntercept) && numClasses == 1) { - logWarning(s"All labels are zero and fitIntercept=false. It's a dangerous ground, " + - s"so the algorithm may not converge.") + if (!$(fitIntercept) && isConstantLabel) { + logWarning(s"All labels belong to a single class and fitIntercept=false. 
It's a " + + s"dangerous ground, so the algorithm may not converge.") } val featuresMean = summarizer.mean.toArray diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala index 1eb6c1802e35..ba54554418ba 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala @@ -215,20 +215,29 @@ class MultinomialLogisticRegression @Since("2.1.0") ( throw new SparkException(msg) } - val labelIsConstant = histogram.count(_ != 0) == 1 + val isConstantLabel = histogram.count(_ != 0) == 1 - if ($(fitIntercept) && labelIsConstant) { - // we want to produce a model that will always predict the constant label + if ($(fitIntercept) && isConstantLabel) { + // we want to produce a model that will always predict the constant label so all the + // coefficients will be zero, and the constant label class intercept will be +inf + val constantLabelIndex = Vectors.dense(histogram).argmax (Matrices.sparse(numClasses, numFeatures, Array.fill(numFeatures + 1)(0), Array(), Array()), - Vectors.sparse(numClasses, Seq((numClasses - 1, Double.PositiveInfinity))), + Vectors.sparse(numClasses, Seq((constantLabelIndex, Double.PositiveInfinity))), Array.empty[Double]) } else { - if (!$(fitIntercept) && labelIsConstant) { + if (!$(fitIntercept) && isConstantLabel) { logWarning(s"All labels belong to a single class and fitIntercept=false. It's" + s"a dangerous ground, so the algorithm may not converge.") } val featuresStd = summarizer.variance.toArray.map(math.sqrt) + val featuresMean = summarizer.mean.toArray + if (!$(fitIntercept) && (0 until numFeatures).exists { i => + featuresStd(i) == 0.0 && featuresMean(i) != 0.0 }) { + logWarning("Fitting MultinomialLogisticRegressionModel without intercept on dataset " + + "with bconstant nonzero column, Spark MLlib outputs zero coefficients for constant " + + "nonzero columns. This behavior is the same as R glmnet but different from LIBSVM.") + } val regParamL1 = $(elasticNetParam) * $(regParam) val regParamL2 = (1.0 - $(elasticNetParam)) * $(regParam) @@ -279,28 +288,28 @@ class MultinomialLogisticRegression @Since("2.1.0") ( it will converge faster if we initialize the intercepts such that it follows the distribution of the labels. {{{ - P(0) = \exp(b_0) / (\sum_{k=1}^K \exp(b_k)) + P(1) = \exp(b_1) / Z ... - P(K) = \exp(b_K) / (\sum_{k=1}^K \exp(b_k)) + P(K) = \exp(b_K) / Z + }}} + Where Z is a normalizing constant. Hence, + {{{ + b_k = \log(P(k)) + \log(Z) + = \log(count_k) - \log(count) + \log(Z) + = \log(count_k) + \lambda }}} - The solution to this is not identifiable, so choose the solution with minimum - L2 penalty (i.e. subtract the mean). Hence, + The solution to this is not identifiable, so choose the phase \lambda such that the + mean is centered. 
This yields {{{ - b_k = \log{count_k / count_0} - b_k' = b_k - \frac{1}{K} \sum b_k + b_k = \log(count_k) + b_k' = b_k - \mean(b_k) }}} */ - val referenceCoef = histogram.indices.map { i => - if (histogram(i) > 0) { - math.log(histogram(i) / (histogram(0) + 1)) // add 1 for smoothing - } else { - 0.0 - } - } - val referenceMean = referenceCoef.sum / referenceCoef.length - histogram.indices.foreach { i => + val rawIntercepts = histogram.map(c => math.log(c + 1)) // add 1 for smoothing + val rawMean = rawIntercepts.sum / rawIntercepts.length + rawIntercepts.indices.foreach { i => initialCoefficientsWithIntercept.toArray(i * numFeaturesPlusIntercept + numFeatures) = - referenceCoef(i) - referenceMean + rawIntercepts(i) - rawMean } } val states = optimizer.iterations(new CachedDiffFunction(costFun), @@ -332,38 +341,29 @@ class MultinomialLogisticRegression @Since("2.1.0") ( Note that the intercept in scaled space and original space is the same; as a result, no scaling is needed. */ - var interceptSum = 0.0 - var coefSum = 0.0 - val rawCoefficients = Vectors.fromBreeze(state.x) - val (coefMatrix, interceptVector) = rawCoefficients match { - case dv: DenseVector => - val coefArray = Array.tabulate(numClasses * numFeatures) { i => - val flatIndex = if ($(fitIntercept)) i + i / numFeatures else i - val featureIndex = i % numFeatures - val unscaledCoef = if (featuresStd(featureIndex) != 0.0) { - dv(flatIndex) / featuresStd(featureIndex) - } else { - 0.0 - } - coefSum += unscaledCoef - unscaledCoef - } - val interceptVector = if ($(fitIntercept)) { - Vectors.dense(Array.tabulate(numClasses) { i => - val coefIndex = (i + 1) * numFeaturesPlusIntercept - 1 - val intercept = dv(coefIndex) - interceptSum += intercept - intercept - }) - } else { - Vectors.sparse(numClasses, Seq()) - } - (new DenseMatrix(numClasses, numFeatures, coefArray, isTransposed = true), - interceptVector) - case sv: SparseVector => - throw new IllegalArgumentException("SparseVector is not supported for coefficients") + val rawCoefficients = state.x.toArray + val interceptsArray: Array[Double] = if ($(fitIntercept)) { + Array.tabulate(numClasses) { i => + val coefIndex = (i + 1) * numFeaturesPlusIntercept - 1 + rawCoefficients(coefIndex) + } + } else { + Array[Double]() } + val coefficientArray: Array[Double] = Array.tabulate(numClasses * numFeatures) { i => + // flatIndex will loop though rawCoefficients, and skip the intercept terms. + val flatIndex = if ($(fitIntercept)) i + i / numFeatures else i + val featureIndex = i % numFeatures + if (featuresStd(featureIndex) != 0.0) { + rawCoefficients(flatIndex) / featuresStd(featureIndex) + } else { + 0.0 + } + } + val coefficientMatrix = + new DenseMatrix(numClasses, numFeatures, coefficientArray, isTransposed = true) + /* When no regularization is applied, the coefficients lack identifiability because we do not use a pivot class. We can add any constant value to the coefficients and @@ -374,21 +374,24 @@ class MultinomialLogisticRegression @Since("2.1.0") ( Coordinate Descent," https://core.ac.uk/download/files/153/6287975.pdf */ if ($(regParam) == 0.0) { - val coefficientMean = coefSum / (numClasses * numFeatures) - coefMatrix.update(_ - coefficientMean) + val coefficientMean = coefficientMatrix.values.sum / (numClasses * numFeatures) + coefficientMatrix.update(_ - coefficientMean) } /* The intercepts are never regularized, so we always center the mean. 
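       For example, fitted intercepts (-1.0, 0.0, 4.0) have mean 1.0 and center to
       {{{
         (-1.0, 0.0, 4.0) - 1.0 = (-2.0, -1.0, 3.0)
       }}}
       Adding the same constant to every margin leaves the softmax probabilities
       unchanged, so the centered intercepts make exactly the same predictions.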
*/ - val interceptMean = interceptSum / numClasses - interceptVector match { - case dv: DenseVector => (0 until dv.size).foreach { i => dv.toArray(i) -= interceptMean } - case sv: SparseVector => - (0 until sv.numNonzeros).foreach { i => sv.values(i) -= interceptMean } + val interceptVector = if (interceptsArray.nonEmpty) { + val interceptMean = interceptsArray.sum / numClasses + interceptsArray.indices.foreach { i => interceptsArray(i) -= interceptMean } + Vectors.dense(interceptsArray) + } else { + Vectors.sparse(numClasses, Seq()) } - (coefMatrix, interceptVector, arrayBuilder.result()) + + (coefficientMatrix, interceptVector, arrayBuilder.result()) } } + if (handlePersistence) instances.unpersist() val model = copyValues( @@ -440,32 +443,30 @@ class MultinomialLogisticRegressionModel private[spark] ( /** Score (probability) for each class label. */ private val scores: Vector => Vector = (features) => { - val m = margins(features).toDense + val m = margins(features) val maxMarginIndex = m.argmax val maxMargin = m(maxMarginIndex) + val marginArray = m.toArray // adjust margins for overflow val sum = { var temp = 0.0 - if (maxMargin > 0) { - for (i <- 0 until numClasses) { - m.toArray(i) -= maxMargin - temp += math.exp(m(i)) - } - } else { - for (i <- 0 until numClasses ) { - temp += math.exp(m(i)) + var k = 0 + while (k < numClasses) { + marginArray(k) = if (maxMargin > 0) { + math.exp(marginArray(k) - maxMargin) + } else { + math.exp(marginArray(k)) } + temp += marginArray(k) + k += 1 } temp } - var i = 0 - while (i < m.size) { - m.values(i) = math.exp(m.values(i)) / sum - i += 1 - } - m + val scores = Vectors.dense(marginArray) + BLAS.scal(1 / sum, scores) + scores } /** @@ -475,11 +476,24 @@ class MultinomialLogisticRegressionModel private[spark] ( override protected def predict(features: Vector): Double = { if (isDefined(thresholds)) { val thresholds: Array[Double] = getThresholds - val scaledProbability: Array[Double] = - scores(features).toArray.zip(thresholds).map { case (p, t) => - if (t == 0.0) Double.PositiveInfinity else p / t + val probabilities = scores(features).toArray + var argMax = 0 + var max = Double.NegativeInfinity + var i = 0 + while (i < numClasses) { + if (thresholds(i) == 0.0) { + max = Double.PositiveInfinity + argMax = i + } else { + val scaled = probabilities(i) / thresholds(i) + if (scaled > max) { + max = scaled + argMax = i + } } - Vectors.dense(scaledProbability).argmax + i += 1 + } + argMax } else { scores(features).argmax } @@ -489,42 +503,34 @@ class MultinomialLogisticRegressionModel private[spark] ( rawPrediction match { case dv: DenseVector => val size = dv.size + val values = dv.values // get the maximum margin val maxMarginIndex = rawPrediction.argmax val maxMargin = rawPrediction(maxMarginIndex) if (maxMargin == Double.PositiveInfinity) { - for (j <- 0 until size) { - if (j == maxMarginIndex) { - dv.values(j) = 1.0 - } else { - dv.values(j) = 0.0 - } + var k = 0 + while (k < size) { + values(k) = if (k == maxMarginIndex) 1.0 else 0.0 + k += 1 } } else { val sum = { var temp = 0.0 - if (maxMargin > 0) { - // adjust margins for overflow - for (j <- 0 until numClasses) { - dv.values(j) -= maxMargin - temp += math.exp(dv.values(j)) - } - } else { - for (j <- 0 until numClasses) { - temp += math.exp(dv.values(j)) + var k = 0 + while (k < numClasses) { + values(k) = if (maxMargin > 0) { + math.exp(values(k) - maxMargin) + } else { + math.exp(values(k)) } + temp += values(k) + k += 1 } temp } - - // update in place - var i = 0 - while (i < size) { - 
dv.values(i) = math.exp(dv.values(i)) / sum - i += 1 - } + BLAS.scal(1 / sum, dv) } dv case sv: SparseVector => @@ -572,7 +578,7 @@ object MultinomialLogisticRegressionModel extends MLReadable[MultinomialLogistic private case class Data( numClasses: Int, numFeatures: Int, - intercept: Vector, + intercepts: Vector, coefficients: Matrix) override protected def saveImpl(path: String): Unit = { @@ -597,10 +603,10 @@ object MultinomialLogisticRegressionModel extends MLReadable[MultinomialLogistic val dataPath = new Path(path, "data").toString val data = sqlContext.read.format("parquet").load(dataPath) - .select("numClasses", "numFeatures", "intercept", "coefficients").head() - val numClasses = data.getInt(0) - val intercepts = data.getAs[Vector](2) - val coefficients = data.getAs[Matrix](3) + .select("numClasses", "numFeatures", "intercepts", "coefficients").head() + val numClasses = data.getAs[Int](data.fieldIndex("numClasses")) + val intercepts = data.getAs[Vector](data.fieldIndex("intercepts")) + val coefficients = data.getAs[Matrix](data.fieldIndex("coefficients")) val model = new MultinomialLogisticRegressionModel(metadata.uid, coefficients, intercepts, numClasses) diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala index c7fa78f9e971..e9d4fe95f640 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala @@ -80,40 +80,41 @@ class MultinomialLogisticRegressionSuite * so we can validate the training accuracy compared with R's glmnet package. */ ignore("export test data into CSV format") { - multinomialDataset.rdd.map { case Row(label: Double, features: Vector) => + val rdd = multinomialDataset.rdd.map { case Row(label: Double, features: Vector) => label + "," + features.toArray.mkString(",") - }.repartition(1).saveAsTextFile("target/tmp/LogisticRegressionSuite/multinomialDataset") + }.repartition(1) + rdd.saveAsTextFile("target/tmp/MultinomialLogisticRegressionSuite/multinomialDataset") } - test("params") { - ParamsSuite.checkParams(new MultinomialLogisticRegression) - val model = new MultinomialLogisticRegressionModel("mLogReg", - Matrices.dense(2, 1, Array(0.0, 0.0)), Vectors.dense(0.0, 0.0), 2) - ParamsSuite.checkParams(model) - } + test("params") { + ParamsSuite.checkParams(new MultinomialLogisticRegression) + val model = new MultinomialLogisticRegressionModel("mLogReg", + Matrices.dense(2, 1, Array(0.0, 0.0)), Vectors.dense(0.0, 0.0), 2) + ParamsSuite.checkParams(model) + } - test("multinomial logistic regression: default params") { - val mlr = new MultinomialLogisticRegression - assert(mlr.getLabelCol === "label") - assert(mlr.getFeaturesCol === "features") - assert(mlr.getPredictionCol === "prediction") - assert(mlr.getRawPredictionCol === "rawPrediction") - assert(mlr.getProbabilityCol === "probability") - assert(!mlr.isDefined(mlr.weightCol)) - assert(!mlr.isDefined(mlr.thresholds)) - assert(mlr.getFitIntercept) - assert(mlr.getStandardization) - val model = mlr.fit(dataset) - model.transform(dataset) - .select("label", "probability", "prediction", "rawPrediction") - .collect() - assert(model.getFeaturesCol === "features") - assert(model.getPredictionCol === "prediction") - assert(model.getRawPredictionCol === "rawPrediction") - assert(model.getProbabilityCol === 
"probability") - assert(model.intercepts !== Vectors.dense(0.0, 0.0)) - assert(model.hasParent) - } + test("multinomial logistic regression: default params") { + val mlr = new MultinomialLogisticRegression + assert(mlr.getLabelCol === "label") + assert(mlr.getFeaturesCol === "features") + assert(mlr.getPredictionCol === "prediction") + assert(mlr.getRawPredictionCol === "rawPrediction") + assert(mlr.getProbabilityCol === "probability") + assert(!mlr.isDefined(mlr.weightCol)) + assert(!mlr.isDefined(mlr.thresholds)) + assert(mlr.getFitIntercept) + assert(mlr.getStandardization) + val model = mlr.fit(dataset) + model.transform(dataset) + .select("label", "probability", "prediction", "rawPrediction") + .collect() + assert(model.getFeaturesCol === "features") + assert(model.getPredictionCol === "prediction") + assert(model.getRawPredictionCol === "rawPrediction") + assert(model.getProbabilityCol === "probability") + assert(model.intercepts !== Vectors.dense(0.0, 0.0)) + assert(model.hasParent) + } test("multinomial logistic regression with intercept without regularization") { @@ -319,9 +320,9 @@ class MultinomialLogisticRegressionSuite 0.0, 0.0, 0.0, 0.0), isTransposed = true) val interceptsR = Vectors.dense(-0.44893320, 0.7376760, -0.2887428) - assert(model1.coefficients ~== coefficientsRStd absTol 0.01) + assert(model1.coefficients ~== coefficientsRStd absTol 0.02) assert(model1.intercepts ~== interceptsRStd relTol 0.1) - assert(model2.coefficients ~== coefficientsR absTol 0.01) + assert(model2.coefficients ~== coefficientsR absTol 0.02) assert(model2.intercepts ~== interceptsR relTol 0.1) } @@ -891,6 +892,9 @@ class MultinomialLogisticRegressionSuite LabeledPoint(4.0, Vectors.dense(1.0)), LabeledPoint(4.0, Vectors.dense(2.0))) ) + val labelMeta = NominalAttribute.defaultAttr.withName("label").withNumValues(6).toMetadata() + val constantDataWithMetadata = constantData + .select(constantData("label").as("label", labelMeta), constantData("features")) val mlr = new MultinomialLogisticRegression val model = mlr.fit(constantData) val results = model.transform(constantData) @@ -900,6 +904,16 @@ class MultinomialLogisticRegressionSuite assert(prob === Vectors.dense(Array(0.0, 0.0, 0.0, 0.0, 1.0))) assert(pred === 4.0) } + + // ensure that the correct value is predicted when numClasses passed through metadata + val modelWithMetadata = mlr.fit(constantDataWithMetadata) + val resultsWithMetadata = modelWithMetadata.transform(constantDataWithMetadata) + resultsWithMetadata.select("rawPrediction", "probability", "prediction").collect().foreach { + case Row(raw: Vector, prob: Vector, pred: Double) => + assert(raw === Vectors.dense(Array(0.0, 0.0, 0.0, 0.0, Double.PositiveInfinity, 0.0))) + assert(prob === Vectors.dense(Array(0.0, 0.0, 0.0, 0.0, 1.0, 0.0))) + assert(pred === 4.0) + } // TODO: check num iters is zero when it become available in the model } From ffc64d4999be23872302201d47ab9bf4c9ce1bc6 Mon Sep 17 00:00:00 2001 From: sethah Date: Thu, 18 Aug 2016 12:58:42 -0700 Subject: [PATCH 13/14] small further review changes --- .../ml/classification/MultinomialLogisticRegression.scala | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala index ba54554418ba..db3e34bec051 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala +++ 
b/mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala
@@ -292,14 +292,15 @@ class MultinomialLogisticRegression @Since("2.1.0") (
          P(1) = \exp(b_1) / Z
            ...
          P(K) = \exp(b_K) / Z
+         where Z = \sum_{k=1}^{K} \exp(b_k)
        }}}
-       Where Z is a normalizing constant. Hence,
+       Since this doesn't have a unique solution, one of the solutions that satisfies the
+       above equations is
        {{{
-         b_k = \log(P(k)) + \log(Z)
-             = \log(count_k) - \log(count) + \log(Z)
-             = \log(count_k) + \lambda
+         \exp(b_k) = count_k * \exp(\lambda)
+         b_k = \log(count_k) + \lambda
        }}}
-       The solution to this is not identifiable, so choose the phase \lambda such that the
+       \lambda is a free parameter, so choose the phase \lambda such that the
       mean is centered.
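       (Any common shift of the b_k cancels in the softmax, so every choice of \lambda
       yields the same initial probability estimates; centering merely fixes one
       representative solution.)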
This yields {{{ b_k = \log(count_k) @@ -447,8 +448,8 @@ class MultinomialLogisticRegressionModel private[spark] ( private val scores: Vector => Vector = (features) => { val m = margins(features) val maxMarginIndex = m.argmax - val maxMargin = m(maxMarginIndex) val marginArray = m.toArray + val maxMargin = marginArray(maxMarginIndex) // adjust margins for overflow val sum = { diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala index e9d4fe95f640..0913fe559c56 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala @@ -167,9 +167,13 @@ class MultinomialLogisticRegressionSuite val interceptsR = Vectors.dense(-2.2449338, 0.3778931, 1.8670407) assert(model1.coefficients ~== coefficientsR relTol 0.05) + assert(model1.coefficients.toArray.sum ~== 0.0 absTol eps) assert(model1.intercepts ~== interceptsR relTol 0.05) + assert(model1.intercepts.toArray.sum ~== 0.0 absTol eps) assert(model2.coefficients ~== coefficientsR relTol 0.05) + assert(model2.coefficients.toArray.sum ~== 0.0 absTol eps) assert(model2.intercepts ~== interceptsR relTol 0.05) + assert(model2.intercepts.toArray.sum ~== 0.0 absTol eps) } test("multinomial logistic regression without intercept without regularization") { @@ -223,9 +227,13 @@ class MultinomialLogisticRegressionSuite 0.2337022, -0.5793351, 0.1056770, 0.1159618), isTransposed = true) assert(model1.coefficients ~== coefficientsR relTol 0.05) - assert(model2.coefficients ~== coefficientsR relTol 0.05) + assert(model1.coefficients.toArray.sum ~== 0.0 absTol eps) assert(model1.intercepts.toArray === Array.fill(3)(0.0)) + assert(model1.intercepts.toArray.sum ~== 0.0 absTol eps) + assert(model2.coefficients ~== coefficientsR relTol 0.05) + assert(model2.coefficients.toArray.sum ~== 0.0 absTol eps) assert(model2.intercepts.toArray === Array.fill(3)(0.0)) + assert(model2.intercepts.toArray.sum ~== 0.0 absTol eps) } test("multinomial logistic regression with intercept with L1 regularization") { @@ -322,8 +330,10 @@ class MultinomialLogisticRegressionSuite assert(model1.coefficients ~== coefficientsRStd absTol 0.02) assert(model1.intercepts ~== interceptsRStd relTol 0.1) + assert(model1.intercepts.toArray.sum ~== 0.0 absTol eps) assert(model2.coefficients ~== coefficientsR absTol 0.02) assert(model2.intercepts ~== interceptsR relTol 0.1) + assert(model2.intercepts.toArray.sum ~== 0.0 absTol eps) } test("multinomial logistic regression without intercept with L1 regularization") { @@ -412,9 +422,11 @@ class MultinomialLogisticRegressionSuite 0.0, 0.0, 0.0, 0.0), isTransposed = true) assert(model1.coefficients ~== coefficientsRStd absTol 0.01) - assert(model2.coefficients ~== coefficientsR absTol 0.01) assert(model1.intercepts.toArray === Array.fill(3)(0.0)) + assert(model1.intercepts.toArray.sum ~== 0.0 absTol eps) + assert(model2.coefficients ~== coefficientsR absTol 0.01) assert(model2.intercepts.toArray === Array.fill(3)(0.0)) + assert(model2.intercepts.toArray.sum ~== 0.0 absTol eps) } test("multinomial logistic regression with intercept with L2 regularization") { @@ -506,8 +518,10 @@ class MultinomialLogisticRegressionSuite assert(model1.coefficients ~== coefficientsRStd relTol 0.05) assert(model1.intercepts ~== interceptsRStd relTol 0.05) + 
assert(model1.intercepts.toArray.sum ~== 0.0 absTol eps) assert(model2.coefficients ~== coefficientsR relTol 0.05) assert(model2.intercepts ~== interceptsR relTol 0.05) + assert(model2.intercepts.toArray.sum ~== 0.0 absTol eps) } test("multinomial logistic regression without intercept with L2 regularization") { @@ -595,8 +609,10 @@ class MultinomialLogisticRegressionSuite assert(model1.coefficients ~== coefficientsRStd absTol 0.01) assert(model1.intercepts.toArray === Array.fill(3)(0.0)) + assert(model1.intercepts.toArray.sum ~== 0.0 absTol eps) assert(model2.coefficients ~== coefficientsR absTol 0.01) assert(model2.intercepts.toArray === Array.fill(3)(0.0)) + assert(model2.intercepts.toArray.sum ~== 0.0 absTol eps) } test("multinomial logistic regression with intercept with elasticnet regularization") { @@ -690,9 +706,12 @@ class MultinomialLogisticRegressionSuite assert(model1.coefficients ~== coefficientsRStd absTol 0.01) assert(model1.intercepts ~== interceptsRStd absTol 0.01) + assert(model1.intercepts.toArray.sum ~== 0.0 absTol eps) assert(model2.coefficients ~== coefficientsR absTol 0.01) assert(model2.intercepts ~== interceptsR absTol 0.01) + assert(model2.intercepts.toArray.sum ~== 0.0 absTol eps) } + test("multinomial logistic regression without intercept with elasticnet regularization") { val trainer1 = (new MultinomialLogisticRegression).setFitIntercept(false) .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(true) @@ -781,10 +800,19 @@ class MultinomialLogisticRegressionSuite assert(model1.coefficients ~== coefficientsRStd absTol 0.01) assert(model1.intercepts.toArray === Array.fill(3)(0.0)) + assert(model1.intercepts.toArray.sum ~== 0.0 absTol eps) assert(model2.coefficients ~== coefficientsR absTol 0.01) assert(model2.intercepts.toArray === Array.fill(3)(0.0)) + assert(model2.intercepts.toArray.sum ~== 0.0 absTol eps) } + /* + test("multinomial logistic regression with intercept with strong L1 regularization") { + // TODO: implement this test to check that the priors on the intercepts are correct + // TODO: when initial model becomes available + } + */ + test("prediction") { val model = new MultinomialLogisticRegressionModel("mLogReg", Matrices.dense(3, 2, Array(0.0, 0.0, 0.0, 1.0, 2.0, 3.0)), @@ -878,8 +906,7 @@ class MultinomialLogisticRegressionSuite val thrown = intercept[IllegalArgumentException] { mlr.fit(df1) } - assert(thrown.getMessage.contains( - "less than the number of unique labels")) + assert(thrown.getMessage.contains("less than the number of unique labels")) // mlr should infer the number of classes if not specified val model3 = mlr.fit(dataset) @@ -892,9 +919,6 @@ class MultinomialLogisticRegressionSuite LabeledPoint(4.0, Vectors.dense(1.0)), LabeledPoint(4.0, Vectors.dense(2.0))) ) - val labelMeta = NominalAttribute.defaultAttr.withName("label").withNumValues(6).toMetadata() - val constantDataWithMetadata = constantData - .select(constantData("label").as("label", labelMeta), constantData("features")) val mlr = new MultinomialLogisticRegression val model = mlr.fit(constantData) val results = model.transform(constantData) @@ -905,8 +929,25 @@ class MultinomialLogisticRegressionSuite assert(pred === 4.0) } + // force the model to be trained with only one class + val constantZeroData = spark.createDataFrame(Seq( + LabeledPoint(0.0, Vectors.dense(0.0)), + LabeledPoint(0.0, Vectors.dense(1.0)), + LabeledPoint(0.0, Vectors.dense(2.0))) + ) + val modelZeroLabel = mlr.setFitIntercept(false).fit(constantZeroData) + val resultsZero = 
modelZeroLabel.transform(constantZeroData) + resultsZero.select("rawPrediction", "probability", "prediction").collect().foreach { + case Row(raw: Vector, prob: Vector, pred: Double) => + assert(prob === Vectors.dense(Array(1.0))) + assert(pred === 0.0) + } + // ensure that the correct value is predicted when numClasses passed through metadata - val modelWithMetadata = mlr.fit(constantDataWithMetadata) + val labelMeta = NominalAttribute.defaultAttr.withName("label").withNumValues(6).toMetadata() + val constantDataWithMetadata = constantData + .select(constantData("label").as("label", labelMeta), constantData("features")) + val modelWithMetadata = mlr.setFitIntercept(true).fit(constantDataWithMetadata) val resultsWithMetadata = modelWithMetadata.transform(constantDataWithMetadata) resultsWithMetadata.select("rawPrediction", "probability", "prediction").collect().foreach { case Row(raw: Vector, prob: Vector, pred: Double) => @@ -967,7 +1008,6 @@ class MultinomialLogisticRegressionSuite assert(scaledPredictions.zip(basePredictions).forall { case (scaled, base) => scaled.getDouble(0) === base.getDouble(0) }) - } test("read/write") {
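    // persistence round-trip: save the fitted model and load it back for comparison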