Commit bb19708

Address comments.

1 parent 1889995 commit bb19708

File tree: 5 files changed, +146 −59 lines changed


examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala

Lines changed: 35 additions & 1 deletion

@@ -22,7 +22,13 @@ package org.apache.spark.examples.ml
 import org.apache.spark.ml.feature.Bucketizer
 // $example off$
 import org.apache.spark.sql.SparkSession
-
+/**
+ * An example for Bucketizer.
+ * Run with
+ * {{{
+ * bin/run-example ml.BucketizerExample
+ * }}}
+ */
 object BucketizerExample {
   def main(args: Array[String]): Unit = {
     val spark = SparkSession
@@ -48,6 +54,34 @@ object BucketizerExample {
     bucketedData.show()
     // $example off$
 
+    // $example on$
+    val splitsArray = Array(
+      Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity),
+      Array(Double.NegativeInfinity, -0.3, 0.0, 0.3, Double.PositiveInfinity))
+
+    val data2 = Array(
+      (-999.9, -999.9),
+      (-0.5, -0.2),
+      (-0.3, -0.1),
+      (0.0, 0.0),
+      (0.2, 0.4),
+      (999.9, 999.9))
+    val dataFrame2 = spark.createDataFrame(data2).toDF("features1", "features2")
+
+    val bucketizer2 = new Bucketizer()
+      .setInputCols(Array("features1", "features2"))
+      .setOutputCols(Array("bucketedFeatures1", "bucketedFeatures2"))
+      .setSplitsArray(splitsArray)
+
+    // Transform original data into its bucket index.
+    val bucketedData2 = bucketizer2.transform(dataFrame2)
+
+    println(s"Bucketizer output with [" +
+      s"${bucketizer2.getSplitsArray(0).length-1}, " +
+      s"${bucketizer2.getSplitsArray(1).length-1}] buckets for each input column")
+    bucketedData2.show()
+    // $example off$
+
     spark.stop()
   }
 }
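For intuition about the indices this example prints: with splits [-inf, -0.5, 0.0, 0.5, +inf], a value falls into one of four half-open buckets [splits(i), splits(i+1)). The sketch below mimics that assignment with a plain binary search; it is a simplified illustration of the documented bucket semantics (ignoring handleInvalid), not Spark's internal implementation.

// Simplified sketch of bucket assignment over sorted split points.
// Bucket i covers [splits(i), splits(i+1)); the last bucket also
// includes its upper bound. Illustrative only, not Spark's code.
object BucketIndexSketch {
  def bucketIndex(value: Double, splits: Array[Double]): Int = {
    val hit = java.util.Arrays.binarySearch(splits, value)
    if (hit >= 0) {
      // Exact split point: it opens bucket `hit`, except the final
      // split, which closes the last bucket instead.
      math.min(hit, splits.length - 2)
    } else {
      val firstGreater = -hit - 1 // binarySearch's insertion point
      require(firstGreater > 0 && firstGreater < splits.length,
        s"$value lies outside the given splits")
      firstGreater - 1
    }
  }

  def main(args: Array[String]): Unit = {
    val splits = Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity)
    // Prints buckets 0, 1, 2, 3 for the four sample values below.
    Seq(-999.9, -0.5, 0.2, 999.9).foreach { v =>
      println(s"$v -> bucket ${bucketIndex(v, splits)}")
    }
  }
}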

mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala

Lines changed: 14 additions & 23 deletions

@@ -24,7 +24,7 @@ import org.apache.spark.annotation.Since
 import org.apache.spark.ml.Model
 import org.apache.spark.ml.attribute.NominalAttribute
 import org.apache.spark.ml.param._
-import org.apache.spark.ml.param.shared.{HasHandleInvalid, HasInputCol, HasInputCols, HasOutputCol}
+import org.apache.spark.ml.param.shared.{HasHandleInvalid, HasInputCol, HasInputCols, HasOutputCol, HasOutputCols}
 import org.apache.spark.ml.util._
 import org.apache.spark.sql._
 import org.apache.spark.sql.expressions.UserDefinedFunction
@@ -33,14 +33,15 @@ import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
 
 /**
  * `Bucketizer` maps a column of continuous features to a column of feature buckets. Since 2.3.0,
- * `Bucketizer` can also map multiple columns at once. Whether it goes to map a column or multiple
- * columns, it depends on which parameter of `inputCol` and `inputCols` is set. When both are set,
- * a log warning will be printed and by default it chooses `inputCol`.
+ * `Bucketizer` can map multiple columns at once by setting the `inputCols` parameter. Note that
+ * when both the `inputCol` and `inputCols` parameters are set, a log warning will be printed and
+ * only `inputCol` will take effect, while `inputCols` will be ignored. The `splits` parameter is
+ * only used for single column usage, and `splitsArray` is for multiple columns.
  */
 @Since("1.4.0")
 final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String)
   extends Model[Bucketizer] with HasHandleInvalid with HasInputCol with HasOutputCol
-    with HasInputCols with DefaultParamsWritable {
+    with HasInputCols with HasOutputCols with DefaultParamsWritable {
 
   @Since("1.4.0")
   def this() = this(Identifiable.randomUID("bucketizer"))
@@ -84,7 +85,9 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String
   /**
    * Param for how to handle invalid entries. Options are 'skip' (filter out rows with
    * invalid values), 'error' (throw an error), or 'keep' (keep invalid values in a special
-   * additional bucket).
+   * additional bucket). Note that in the multiple column case, the invalid handling is applied
+   * to all columns. That said for 'error' it will throw an error if any invalids are found in
+   * any column, for 'skip' it will skip rows with any invalids in any columns, etc.
    * Default: "error"
    * @group param
    */
@@ -115,22 +118,10 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String
     "specified will be treated as errors.",
     Bucketizer.checkSplitsArray)
 
-  /**
-   * Param for output column names.
-   * @group param
-   */
-  @Since("2.3.0")
-  final val outputCols: StringArrayParam = new StringArrayParam(this, "outputCols",
-    "output column names")
-
   /** @group getParam */
   @Since("2.3.0")
   def getSplitsArray: Array[Array[Double]] = $(splitsArray)
 
-  /** @group getParam */
-  @Since("2.3.0")
-  final def getOutputCols: Array[String] = $(outputCols)
-
   /** @group setParam */
   @Since("2.3.0")
   def setSplitsArray(value: Array[Array[Double]]): this.type = set(splitsArray, value)
@@ -148,7 +139,7 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String
    * `inputCols` is set, it will map multiple columns. Otherwise, it just maps a column specified
    * by `inputCol`. A warning will be printed if both are set.
    */
-  private[ml] def isBucketizeMultipleColumns(): Boolean = {
+  private[feature] def isBucketizeMultipleColumns(): Boolean = {
     if (isSet(inputCols) && isSet(inputCol)) {
       logWarning("Both `inputCol` and `inputCols` are set, we ignore `inputCols` and this " +
         "`Bucketizer` only map one column specified by `inputCol`")
@@ -162,7 +153,7 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String
 
   @Since("2.0.0")
   override def transform(dataset: Dataset[_]): DataFrame = {
-    transformSchema(dataset.schema)
+    val transformedSchema = transformSchema(dataset.schema)
 
     val (filteredDataset, keepInvalid) = {
       if (getHandleInvalid == Bucketizer.SKIP_INVALID) {
@@ -193,10 +184,10 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String
       val newCols = inputColumns.zipWithIndex.map { case (inputCol, idx) =>
        bucketizers(idx)(filteredDataset(inputCol).cast(DoubleType))
      }
-      val newFields = outputColumns.zipWithIndex.map { case (outputCol, idx) =>
-        prepOutputField(seqOfSplits(idx), outputCol)
+      val metadata = outputColumns.map { col =>
+        transformedSchema(col).metadata
       }
-      filteredDataset.withColumns(outputColumns, newCols, newFields.map(_.metadata))
+      filteredDataset.withColumns(outputColumns, newCols, metadata)
     }
 
   private def prepOutputField(splits: Array[Double], outputCol: String): StructField = {
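To make the documented single- versus multi-column modes concrete, here is a short usage sketch. The column names and split values are illustrative; the setters are those shown in this diff and in the example above.

import org.apache.spark.ml.feature.Bucketizer

// Single-column mode: `splits` pairs with `inputCol`/`outputCol`.
val single = new Bucketizer()
  .setInputCol("hour")
  .setOutputCol("hourBucket")
  .setSplits(Array(Double.NegativeInfinity, 6.0, 12.0, 18.0, Double.PositiveInfinity))

// Multi-column mode (since 2.3.0): `splitsArray` pairs with `inputCols`/`outputCols`.
val multi = new Bucketizer()
  .setInputCols(Array("hour", "temperature"))
  .setOutputCols(Array("hourBucket", "tempBucket"))
  .setSplitsArray(Array(
    Array(Double.NegativeInfinity, 6.0, 12.0, 18.0, Double.PositiveInfinity),
    Array(Double.NegativeInfinity, 0.0, 20.0, Double.PositiveInfinity)))

// Per the scaladoc above: if both inputCol and inputCols were set on the
// same instance, a warning is logged and only inputCol takes effect.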

mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala

Lines changed: 1 addition & 0 deletions

@@ -60,6 +60,7 @@ private[shared] object SharedParamsCodeGen {
     ParamDesc[String]("inputCol", "input column name"),
     ParamDesc[Array[String]]("inputCols", "input column names"),
     ParamDesc[String]("outputCol", "output column name", Some("uid + \"__output\"")),
+    ParamDesc[Array[String]]("outputCols", "output column names"),
     ParamDesc[Int]("checkpointInterval", "set checkpoint interval (>= 1) or " +
       "disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed " +
       "every 10 iterations", isValid = "(interval: Int) => interval == -1 || interval >= 1"),

mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala

Lines changed: 15 additions & 0 deletions

@@ -230,6 +230,21 @@ private[ml] trait HasOutputCol extends Params {
   final def getOutputCol: String = $(outputCol)
 }
 
+/**
+ * Trait for shared param outputCols.
+ */
+private[ml] trait HasOutputCols extends Params {
+
+  /**
+   * Param for output column names.
+   * @group param
+   */
+  final val outputCols: StringArrayParam = new StringArrayParam(this, "outputCols", "output column names")
+
+  /** @group getParam */
+  final def getOutputCols: Array[String] = $(outputCols)
+}
+
 /**
  * Trait for shared param checkpointInterval.
 */

mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala

Lines changed: 81 additions & 35 deletions

@@ -200,7 +200,7 @@ class BucketizerSuite extends SparkFunSuite with MLlibTestSparkContext with Defa
     val data = (0 until validData1.length).map { idx =>
       (validData1(idx), validData2(idx), expectedBuckets1(idx), expectedBuckets2(idx))
     }
-    val dataFrame: DataFrame = data.toSeq.toDF("feature1", "feature2", "expected1", "expected2")
+    val dataFrame: DataFrame = data.toDF("feature1", "feature2", "expected1", "expected2")
 
     val bucketizer1: Bucketizer = new Bucketizer()
       .setInputCols(Array("feature1", "feature2"))
@@ -210,16 +210,12 @@ class BucketizerSuite extends SparkFunSuite with MLlibTestSparkContext with Defa
     assert(bucketizer1.isBucketizeMultipleColumns())
 
     bucketizer1.transform(dataFrame).select("result1", "expected1", "result2", "expected2")
-      .collect().foreach {
-        case Row(r1: Double, e1: Double, r2: Double, e2: Double) =>
-          assert(r1 === e1,
-            s"The feature value is not correct after bucketing. Expected $e1 but found $r1")
-          assert(r2 === e2,
-            s"The feature value is not correct after bucketing. Expected $e2 but found $r2")
-      }
+    BucketizerSuite.checkBucketResults(bucketizer1.transform(dataFrame),
+      Seq("result1", "result2"),
+      Seq("expected1", "expected2"))
 
     // Check for exceptions when using a set of invalid feature values.
-    val invalidData1: Array[Double] = Array(-0.9) ++ validData1
+    val invalidData1 = Array(-0.9) ++ validData1
     val invalidData2 = Array(0.51) ++ validData1
     val badDF1 = invalidData1.zipWithIndex.toSeq.toDF("feature", "idx")
@@ -256,7 +252,7 @@ class BucketizerSuite extends SparkFunSuite with MLlibTestSparkContext with Defa
     val data = (0 until validData1.length).map { idx =>
       (validData1(idx), validData2(idx), expectedBuckets1(idx), expectedBuckets2(idx))
     }
-    val dataFrame: DataFrame = data.toSeq.toDF("feature1", "feature2", "expected1", "expected2")
+    val dataFrame: DataFrame = data.toDF("feature1", "feature2", "expected1", "expected2")
 
     val bucketizer: Bucketizer = new Bucketizer()
       .setInputCols(Array("feature1", "feature2"))
@@ -265,14 +261,9 @@ class BucketizerSuite extends SparkFunSuite with MLlibTestSparkContext with Defa
 
     assert(bucketizer.isBucketizeMultipleColumns())
 
-    bucketizer.transform(dataFrame).select("result1", "expected1", "result2", "expected2")
-      .collect().foreach {
-        case Row(r1: Double, e1: Double, r2: Double, e2: Double) =>
-          assert(r1 === e1,
-            s"The feature value is not correct after bucketing. Expected $e1 but found $r1")
-          assert(r2 === e2,
-            s"The feature value is not correct after bucketing. Expected $e2 but found $r2")
-      }
+    BucketizerSuite.checkBucketResults(bucketizer.transform(dataFrame),
+      Seq("result1", "result2"),
+      Seq("expected1", "expected2"))
   }
 
   test("multiple columns: Bucket continuous features, with NaN data but non-NaN splits") {
@@ -288,7 +279,7 @@ class BucketizerSuite extends SparkFunSuite with MLlibTestSparkContext with Defa
     val data = (0 until validData1.length).map { idx =>
       (validData1(idx), validData2(idx), expectedBuckets1(idx), expectedBuckets2(idx))
     }
-    val dataFrame: DataFrame = data.toSeq.toDF("feature1", "feature2", "expected1", "expected2")
+    val dataFrame: DataFrame = data.toDF("feature1", "feature2", "expected1", "expected2")
 
     val bucketizer: Bucketizer = new Bucketizer()
       .setInputCols(Array("feature1", "feature2"))
@@ -298,14 +289,9 @@ class BucketizerSuite extends SparkFunSuite with MLlibTestSparkContext with Defa
     assert(bucketizer.isBucketizeMultipleColumns())
 
     bucketizer.setHandleInvalid("keep")
-    bucketizer.transform(dataFrame).select("result1", "expected1", "result2", "expected2")
-      .collect().foreach {
-        case Row(r1: Double, e1: Double, r2: Double, e2: Double) =>
-          assert(r1 === e1,
-            s"The feature value is not correct after bucketing. Expected $e1 but found $r1")
-          assert(r2 === e2,
-            s"The feature value is not correct after bucketing. Expected $e2 but found $r2")
-      }
+    BucketizerSuite.checkBucketResults(bucketizer.transform(dataFrame),
+      Seq("result1", "result2"),
+      Seq("expected1", "expected2"))
 
     bucketizer.setHandleInvalid("skip")
     val skipResults1: Array[Double] = bucketizer.transform(dataFrame)
@@ -335,7 +321,7 @@ class BucketizerSuite extends SparkFunSuite with MLlibTestSparkContext with Defa
     }
   }
 
-  test("multiple columns:: read/write") {
+  test("multiple columns: read/write") {
     val t = new Bucketizer()
       .setInputCols(Array("myInputCol"))
       .setOutputCols(Array("myOutputCol"))
@@ -359,13 +345,51 @@ class BucketizerSuite extends SparkFunSuite with MLlibTestSparkContext with Defa
       .setStages(Array(bucket))
       .fit(df)
     pl.transform(df).select("result1", "expected1", "result2", "expected2")
-      .collect().foreach {
-        case Row(r1: Double, e1: Double, r2: Double, e2: Double) =>
-          assert(r1 === e1,
-            s"The feature value is not correct after bucketing. Expected $e1 but found $r1")
-          assert(r2 === e2,
-            s"The feature value is not correct after bucketing. Expected $e2 but found $r2")
-      }
+
+    BucketizerSuite.checkBucketResults(pl.transform(df),
+      Seq("result1", "result2"), Seq("expected1", "expected2"))
+  }
+
+  test("Compare single/multiple column(s) Bucketizer in pipeline") {
+    val df = Seq((0.5, 0.3, 1.0, 1.0), (0.5, -0.4, 1.0, 0.0))
+      .toDF("feature1", "feature2", "expected1", "expected2")
+
+    val multiColsBucket = new Bucketizer()
+      .setInputCols(Array("feature1", "feature2"))
+      .setOutputCols(Array("result1", "result2"))
+      .setSplitsArray(Array(Array(-0.5, 0.0, 0.5), Array(-0.5, 0.0, 0.5)))
+
+    val plForMultiCols = new Pipeline()
+      .setStages(Array(multiColsBucket))
+      .fit(df)
+
+    val bucketForCol1 = new Bucketizer()
+      .setInputCol("feature1")
+      .setOutputCol("result1")
+      .setSplits(Array(-0.5, 0.0, 0.5))
+    val bucketForCol2 = new Bucketizer()
+      .setInputCol("feature2")
+      .setOutputCol("result2")
+      .setSplits(Array(-0.5, 0.0, 0.5))
+
+    val plForSingleCol = new Pipeline()
+      .setStages(Array(bucketForCol1, bucketForCol2))
+      .fit(df)
+
+    val resultForSingleCol = plForSingleCol.transform(df)
+      .select("result1", "expected1", "result2", "expected2")
+      .collect()
+    val resultForMultiCols = plForMultiCols.transform(df)
+      .select("result1", "expected1", "result2", "expected2")
+      .collect()
+
+    resultForSingleCol.zip(resultForMultiCols).foreach {
+      case (rowForSingle, rowForMultiCols) =>
+        assert(rowForSingle.getDouble(0) == rowForMultiCols.getDouble(0) &&
+          rowForSingle.getDouble(1) == rowForMultiCols.getDouble(1) &&
+          rowForSingle.getDouble(2) == rowForMultiCols.getDouble(2) &&
+          rowForSingle.getDouble(3) == rowForMultiCols.getDouble(3))
+    }
   }
 
   test("Both inputCol and inputCols are set") {
@@ -411,4 +435,26 @@ private object BucketizerSuite extends SparkFunSuite {
       i += 1
     }
   }
+
+  /** Checks if bucketized results match expected ones. */
+  def checkBucketResults(
+      bucketResult: DataFrame,
+      resultColumns: Seq[String],
+      expectedColumns: Seq[String]): Unit = {
+    assert(resultColumns.length == expectedColumns.length,
+      s"Given ${resultColumns.length} result columns doesn't match " +
+        s"${expectedColumns.length} expected columns.")
+    assert(resultColumns.length > 0, "At least one result and expected columns are needed.")
+
+    val allColumns = resultColumns ++ expectedColumns
+    bucketResult.select(allColumns.head, allColumns.tail: _*).collect().foreach {
+      case row =>
+        for (idx <- 0 until row.length / 2) {
+          val result = row.getDouble(idx)
+          val expected = row.getDouble(idx + row.length / 2)
+          assert(result === expected, "The feature value is not correct after bucketing. " +
+            s"Expected $expected but found $result.")
+        }
+    }
+  }
 }
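A note on the new checkBucketResults helper: it selects the result columns followed by the expected columns, so in each collected row the value at index idx pairs with the one at idx + row.length / 2. A minimal plain-Scala illustration of that pairing (no Spark needed; the sample values are made up):

// Results occupy the first half of the row, expectations the second,
// mirroring select(resultColumns ++ expectedColumns) in the helper.
val row = Array(1.0, 2.0, /* expected: */ 1.0, 2.0)
for (idx <- 0 until row.length / 2) {
  val (result, expected) = (row(idx), row(idx + row.length / 2))
  assert(result == expected, s"Expected $expected but found $result.")
}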
