[SPARK-17792][ML] L-BFGS solver for linear regression does not accept general numeric label column types

sethah · yanboliang · commit 594a2cf6f7c7 · 2016-10-06T21:14:44.000-07:00
## What changes were proposed in this pull request? Before, we computed `instances` in LinearRegression in two spots, even though they did the same thing. One of them did not cast the label column to `DoubleType`. This patch consolidates the computation and always casts the label column to `DoubleType`. ## How was this patch tested? Added a unit test to check all solvers. This test failed before this patch. Author: sethah <seth.hendrickson16@gmail.com> Closes #15364 from sethah/linreg_numeric_type. (cherry picked from commit 3713bb1) Signed-off-by: Yanbo Liang <ybliang8@gmail.com>
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -163,17 +163,18 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String
     val numFeatures = dataset.select(col($(featuresCol))).first().getAs[Vector](0).size
     val w = if (!isDefined(weightCol) || $(weightCol).isEmpty) lit(1.0) else col($(weightCol))
 
+    val instances: RDD[Instance] = dataset.select(
+      col($(labelCol)).cast(DoubleType), w, col($(featuresCol))).rdd.map {
+      case Row(label: Double, weight: Double, features: Vector) =>
+        Instance(label, weight, features)
+    }
+
     if (($(solver) == "auto" && $(elasticNetParam) == 0.0 &&
       numFeatures <= WeightedLeastSquares.MAX_NUM_FEATURES) || $(solver) == "normal") {
       require($(elasticNetParam) == 0.0, "Only L2 regularization can be used when normal " +
         "solver is used.'")
       // For low dimensional data, WeightedLeastSquares is more efficiently since the
       // training algorithm only requires one pass through the data. (SPARK-10668)
-      val instances: RDD[Instance] = dataset.select(
-        col($(labelCol)).cast(DoubleType), w, col($(featuresCol))).rdd.map {
-          case Row(label: Double, weight: Double, features: Vector) =>
-            Instance(label, weight, features)
-      }
 
       val optimizer = new WeightedLeastSquares($(fitIntercept), $(regParam),
         $(standardization), true)
@@ -196,12 +197,6 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String
       return lrModel.setSummary(trainingSummary)
     }
 
-    val instances: RDD[Instance] =
-      dataset.select(col($(labelCol)), w, col($(featuresCol))).rdd.map {
-        case Row(label: Double, weight: Double, features: Vector) =>
-          Instance(label, weight, features)
-      }
-
     val handlePersistence = dataset.rdd.getStorageLevel == StorageLevel.NONE
     if (handlePersistence) instances.persist(StorageLevel.MEMORY_AND_DISK)
 
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
@@ -1019,12 +1019,14 @@ class LinearRegressionSuite
   }
 
   test("should support all NumericType labels and not support other types") {
-    val lr = new LinearRegression().setMaxIter(1)
-    MLTestingUtils.checkNumericTypes[LinearRegressionModel, LinearRegression](
-      lr, spark, isClassification = false) { (expected, actual) =>
+    for (solver <- Seq("auto", "l-bfgs", "normal")) {
+      val lr = new LinearRegression().setMaxIter(1).setSolver(solver)
+      MLTestingUtils.checkNumericTypes[LinearRegressionModel, LinearRegression](
+        lr, spark, isClassification = false) { (expected, actual) =>
         assert(expected.intercept === actual.intercept)
         assert(expected.coefficients === actual.coefficients)
       }
+    }
   }
 }
 

Original file line number	Diff line number	Diff line change
`@@ -1019,12 +1019,14 @@ class LinearRegressionSuite`
`1019`	`1019`	`}`
`1020`	`1020`
`1021`	`1021`	`test("should support all NumericType labels and not support other types") {`
`1022`		`- val lr = new LinearRegression().setMaxIter(1)`
`1023`		`- MLTestingUtils.checkNumericTypes[LinearRegressionModel, LinearRegression](`
`1024`		`- lr, spark, isClassification = false) { (expected, actual) =>`
	`1022`	`+ for (solver <- Seq("auto", "l-bfgs", "normal")) {`
	`1023`	`+ val lr = new LinearRegression().setMaxIter(1).setSolver(solver)`
	`1024`	`+ MLTestingUtils.checkNumericTypes[LinearRegressionModel, LinearRegression](`
	`1025`	`+ lr, spark, isClassification = false) { (expected, actual) =>`
`1025`	`1026`	`assert(expected.intercept === actual.intercept)`
`1026`	`1027`	`assert(expected.coefficients === actual.coefficients)`
`1027`	`1028`	`}`
	`1029`	`+ }`
`1028`	`1030`	`}`
`1029`	`1031`	`}`
`1030`	`1032`