Commit e8f5d89 (parent 2eaf4f3)

Add a Bucketizer that can bin multiple columns.

6 files changed: 291 additions & 6 deletions

mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala

Lines changed: 147 additions & 1 deletion
@@ -24,7 +24,7 @@ import org.apache.spark.annotation.Since
 import org.apache.spark.ml.Model
 import org.apache.spark.ml.attribute.NominalAttribute
 import org.apache.spark.ml.param._
-import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
+import org.apache.spark.ml.param.shared.{HasInputCol, HasInputCols, HasOutputCol}
 import org.apache.spark.ml.util._
 import org.apache.spark.sql._
 import org.apache.spark.sql.expressions.UserDefinedFunction
@@ -140,6 +140,139 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String
   }
 }
 
+/**
+ * `MultipleBucketizer` maps columns of continuous features to columns of feature buckets.
+ */
+@Since("2.3.0")
+final class MultipleBucketizer @Since("2.3.0") (@Since("2.3.0") override val uid: String)
+  extends Model[MultipleBucketizer] with HasInputCols with DefaultParamsWritable {
+
+  @Since("2.3.0")
+  def this() = this(Identifiable.randomUID("multipleBucketizer"))
+
+  /**
+   * Parameter for mapping continuous features into buckets. With n+1 splits, there are n buckets.
+   * A bucket defined by splits x,y holds values in the range [x,y) except the last bucket, which
+   * also includes y. Splits should be of length greater than or equal to 3 and strictly increasing.
+   * Values at -inf, inf must be explicitly provided to cover all Double values;
+   * otherwise, values outside the splits specified will be treated as errors.
+   *
+   * See also [[handleInvalid]], which can optionally create an additional bucket for NaN values.
+   *
+   * @group param
+   */
+  @Since("2.3.0")
+  val splitsArray: DoubleArrayArrayParam = new DoubleArrayArrayParam(this, "splitsArray",
+    "The array of split points for mapping continuous features into buckets for multiple " +
+      "columns. For each input column, with n+1 splits, there are n buckets. A bucket defined by " +
+      "splits x,y holds values in the range [x,y) except the last bucket, which also includes y. " +
+      "The splits should be of length >= 3 and strictly increasing. Values at -inf, inf must be " +
+      "explicitly provided to cover all Double values; otherwise, values outside the splits " +
+      "specified will be treated as errors.",
+    Bucketizer.checkSplitsArray)
+
+  /**
+   * Param for output column names.
+   * @group param
+   */
+  @Since("2.3.0")
+  final val outputCols: StringArrayParam = new StringArrayParam(this, "outputCols",
+    "output column names")
+
+  /** @group getParam */
+  @Since("2.3.0")
+  def getSplitsArray: Array[Array[Double]] = $(splitsArray)
+
+  /** @group getParam */
+  @Since("2.3.0")
+  final def getOutputCols: Array[String] = $(outputCols)
+
+  /** @group setParam */
+  @Since("2.3.0")
+  def setSplitsArray(value: Array[Array[Double]]): this.type = set(splitsArray, value)
+
+  /** @group setParam */
+  @Since("2.3.0")
+  def setInputCols(value: Array[String]): this.type = set(inputCols, value)
+
+  /** @group setParam */
+  @Since("2.3.0")
+  def setOutputCols(value: Array[String]): this.type = set(outputCols, value)
+
+  /**
+   * Param for how to handle invalid entries. Options are 'skip' (filter out rows with
+   * invalid values), 'error' (throw an error), or 'keep' (keep invalid values in a special
+   * additional bucket).
+   * Default: "error"
+   * @group param
+   */
+  // TODO: Make MultipleBucketizer inherit from HasHandleInvalid.
+  @Since("2.3.0")
+  val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "how to handle " +
+    "invalid entries. Options are skip (filter out rows with invalid values), " +
+    "error (throw an error), or keep (keep invalid values in a special additional bucket).",
+    ParamValidators.inArray(Bucketizer.supportedHandleInvalids))
+
+  /** @group getParam */
+  @Since("2.3.0")
+  def getHandleInvalid: String = $(handleInvalid)
+
+  /** @group setParam */
+  @Since("2.3.0")
+  def setHandleInvalid(value: String): this.type = set(handleInvalid, value)
+  setDefault(handleInvalid, Bucketizer.ERROR_INVALID)
+
+  @Since("2.3.0")
+  override def transform(dataset: Dataset[_]): DataFrame = {
+    transformSchema(dataset.schema)
+    val (filteredDataset, keepInvalid) = {
+      if (getHandleInvalid == Bucketizer.SKIP_INVALID) {
+        // "skip" NaN option is set, will filter out NaN values in the dataset
+        (dataset.na.drop().toDF(), false)
+      } else {
+        (dataset.toDF(), getHandleInvalid == Bucketizer.KEEP_INVALID)
+      }
+    }
+
+    val bucketizers: Seq[UserDefinedFunction] = $(splitsArray).map { splits =>
+      udf { (feature: Double) =>
+        Bucketizer.binarySearchForBuckets(splits, feature, keepInvalid)
+      }
+    }
+
+    val newCols = $(inputCols).zipWithIndex.map { case (inputCol, idx) =>
+      bucketizers(idx)(filteredDataset(inputCol))
+    }
+    val newFields = $(outputCols).zipWithIndex.map { case (outputCol, idx) =>
+      prepOutputField(idx, outputCol)
+    }
+    filteredDataset.withColumns($(outputCols), newCols, newFields.map(_.metadata))
+  }
+
+  private def prepOutputField(idx: Int, outputCol: String): StructField = {
+    val buckets = $(splitsArray)(idx).sliding(2).map(bucket => bucket.mkString(", ")).toArray
+    val attr = new NominalAttribute(name = Some(outputCol), isOrdinal = Some(true),
+      values = Some(buckets))
+    attr.toStructField()
+  }
+
+  @Since("2.3.0")
+  override def transformSchema(schema: StructType): StructType = {
+    var transformedSchema = schema
+    $(inputCols).zip($(outputCols)).zipWithIndex.map { case ((inputCol, outputCol), idx) =>
+      SchemaUtils.checkColumnType(transformedSchema, inputCol, DoubleType)
+      transformedSchema = SchemaUtils.appendColumn(transformedSchema,
+        prepOutputField(idx, outputCol))
+    }
+    transformedSchema
+  }
+
+  @Since("2.3.0")
+  override def copy(extra: ParamMap): MultipleBucketizer = {
+    defaultCopy[MultipleBucketizer](extra).setParent(parent)
+  }
+}
+
 @Since("1.6.0")
 object Bucketizer extends DefaultParamsReadable[Bucketizer] {
 
@@ -167,6 +300,13 @@ object Bucketizer extends DefaultParamsReadable[Bucketizer] {
     }
   }
 
+  /**
+   * Check each splits in the splits array.
+   */
+  private[feature] def checkSplitsArray(splitsArray: Array[Array[Double]]): Boolean = {
+    splitsArray.forall(checkSplits(_))
+  }
+
   /**
   * Binary searching in several buckets to place each data point.
   * @param splits array of split points
@@ -211,3 +351,9 @@ object Bucketizer extends DefaultParamsReadable[Bucketizer] {
   @Since("1.6.0")
   override def load(path: String): Bucketizer = super.load(path)
 }
+
+@Since("2.3.0")
+object MultipleBucketizer extends DefaultParamsReadable[MultipleBucketizer] {
+  @Since("2.3.0")
+  override def load(path: String): MultipleBucketizer = super.load(path)
+}
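As a rough usage sketch of the new transformer (not part of this patch; the DataFrame `df`, the column names, and the split points below are illustrative), each input column gets its own splits array, and with n+1 split points there are n buckets:

import org.apache.spark.ml.feature.MultipleBucketizer

// Illustrative input: `df` is assumed to be an existing DataFrame with
// Double columns "hour" and "temp".
val bucketizer = new MultipleBucketizer()
  .setInputCols(Array("hour", "temp"))
  .setOutputCols(Array("hourBucket", "tempBucket"))
  .setSplitsArray(Array(
    // 5 split points -> 4 buckets; -inf/inf must be included to cover all Double values
    Array(Double.NegativeInfinity, 6.0, 12.0, 18.0, Double.PositiveInfinity),
    // 4 split points -> 3 buckets
    Array(Double.NegativeInfinity, 0.0, 25.0, Double.PositiveInfinity)))
  .setHandleInvalid("keep") // NaN values go to an extra bucket instead of raising an error

val bucketed = bucketizer.transform(df) // appends "hourBucket" and "tempBucket" columns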

mllib/src/main/scala/org/apache/spark/ml/param/params.scala

Lines changed: 39 additions & 0 deletions
@@ -490,6 +490,45 @@ class DoubleArrayParam(parent: Params, name: String, doc: String, isValid: Array
   }
 }
 
+/**
+ * :: DeveloperApi ::
+ * Specialized version of `Param[Array[Array[Double]]]` for Java.
+ */
+@DeveloperApi
+class DoubleArrayArrayParam(
+    parent: Params,
+    name: String,
+    doc: String,
+    isValid: Array[Array[Double]] => Boolean)
+  extends Param[Array[Array[Double]]](parent, name, doc, isValid) {
+
+  def this(parent: Params, name: String, doc: String) =
+    this(parent, name, doc, ParamValidators.alwaysTrue)
+
+  /** Creates a param pair with a `java.util.List` of values (for Java and Python). */
+  def w(value: java.util.List[java.util.List[java.lang.Double]]): ParamPair[Array[Array[Double]]] =
+    w(value.asScala.map(_.asScala.map(_.asInstanceOf[Double]).toArray).toArray)
+
+  override def jsonEncode(value: Array[Array[Double]]): String = {
+    import org.json4s.JsonDSL._
+    compact(render(value.toSeq.map(_.toSeq.map(DoubleParam.jValueEncode))))
+  }
+
+  override def jsonDecode(json: String): Array[Array[Double]] = {
+    parse(json) match {
+      case JArray(values) =>
+        values.map {
+          case JArray(values) =>
+            values.map(DoubleParam.jValueDecode).toArray
+          case _ =>
+            throw new IllegalArgumentException(s"Cannot decode $json to Array[Array[Double]].")
+        }.toArray
+      case _ =>
+        throw new IllegalArgumentException(s"Cannot decode $json to Array[Array[Double]].")
+    }
+  }
+}
+
 /**
  * :: DeveloperApi ::
  * Specialized version of `Param[Array[Int]]` for Java.
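A rough sketch of how the new param could be exercised on its own (the `dummy` owner and the values are illustrative; the exact JSON text depends on DoubleParam.jValueEncode):

// `dummy` stands in for any Params instance that owns the param, as in the test suite below.
val param = new DoubleArrayArrayParam(dummy, "splitsArray", "doc")
val original = Array(Array(Double.NegativeInfinity, 0.0, Double.PositiveInfinity), Array(1.0, 2.0))

val json = param.jsonEncode(original)  // nested JSON array of the split values
val decoded = param.jsonDecode(json)   // Array[Array[Double]] equal element-wise to `original`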

mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ import scala.util.Random
 import org.apache.spark.{SparkException, SparkFunSuite}
 import org.apache.spark.ml.linalg.Vectors
 import org.apache.spark.ml.param.ParamsSuite
-import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
+import org.apache.spark.ml.util.DefaultReadWriteTest
 import org.apache.spark.ml.util.TestingUtils._
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.sql.{DataFrame, Row}

mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala

Lines changed: 34 additions & 4 deletions
@@ -121,10 +121,10 @@ class ParamsSuite extends SparkFunSuite {
   { // DoubleArrayParam
     val param = new DoubleArrayParam(dummy, "name", "doc")
     val values: Seq[Array[Double]] = Seq(
-        Array(),
-        Array(1.0),
-        Array(Double.NaN, Double.NegativeInfinity, Double.MinValue, -1.0, 0.0,
-          Double.MinPositiveValue, 1.0, Double.MaxValue, Double.PositiveInfinity))
+      Array(),
+      Array(1.0),
+      Array(Double.NaN, Double.NegativeInfinity, Double.MinValue, -1.0, 0.0,
+        Double.MinPositiveValue, 1.0, Double.MaxValue, Double.PositiveInfinity))
     for (value <- values) {
       val json = param.jsonEncode(value)
       val decoded = param.jsonDecode(json)
@@ -139,6 +139,36 @@ class ParamsSuite extends SparkFunSuite {
     }
   }
 
+  { // DoubleArrayArrayParam
+    val param = new DoubleArrayArrayParam(dummy, "name", "doc")
+    val values: Seq[Array[Array[Double]]] = Seq(
+      Array(Array()),
+      Array(Array(1.0)),
+      Array(Array(1.0), Array(2.0)),
+      Array(
+        Array(Double.NaN, Double.NegativeInfinity, Double.MinValue, -1.0, 0.0,
+          Double.MinPositiveValue, 1.0, Double.MaxValue, Double.PositiveInfinity),
+        Array(Double.MaxValue, Double.PositiveInfinity, Double.MinPositiveValue, 1.0,
+          Double.NaN, Double.NegativeInfinity, Double.MinValue, -1.0, 0.0)
+      ))
+
+    for (value <- values) {
+      val json = param.jsonEncode(value)
+      val decoded = param.jsonDecode(json)
+      assert(decoded.length === value.length)
+      decoded.zip(value).foreach { case (actualArray, expectedArray) =>
+        assert(actualArray.length === expectedArray.length)
+        actualArray.zip(expectedArray).foreach { case (actual, expected) =>
+          if (expected.isNaN) {
+            assert(actual.isNaN)
+          } else {
+            assert(actual === expected)
+          }
+        }
+      }
+    }
+  }
+
   { // StringArrayParam
     val param = new StringArrayParam(dummy, "name", "doc")
     val values: Seq[Array[String]] = Seq(

sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala

Lines changed: 50 additions & 0 deletions
@@ -1882,6 +1882,56 @@ class Dataset[T] private[sql](
     }
   }
 
+  /**
+   * Returns a new Dataset by adding columns or replacing the existing columns that has
+   * the same names.
+   *
+   * @group untypedrel
+   * @since 2.3.0
+   */
+  def withColumns(colNames: Seq[String], cols: Seq[Column]): DataFrame = {
+    assert(colNames.size == cols.size,
+      s"The size of column names: ${colNames.size} isn't equal to " +
+        s"the size of columns: ${cols.size}")
+
+    val resolver = sparkSession.sessionState.analyzer.resolver
+    val output = queryExecution.analyzed.output
+
+    val columnMap = colNames.zip(cols).toMap
+
+    val replacedAndExistingColumns = output.map { field =>
+      val dupColumn = columnMap.find { case (colName, col) =>
+        resolver(field.name, colName)
+      }
+      if (dupColumn.isDefined) {
+        val colName = dupColumn.get._1
+        val col = dupColumn.get._2
+        col.as(colName)
+      } else {
+        Column(field)
+      }
+    }
+
+    val newColumns = columnMap.filter { case (colName, col) =>
+      !output.exists(f => resolver(f.name, colName))
+    }.map { case (colName, col) => col.as(colName) }
+
+    select(replacedAndExistingColumns ++ newColumns : _*)
+  }
+
+  /**
+   * Returns a new Dataset by adding columns with metadata.
+   */
+  private[spark] def withColumns(
+      colNames: Seq[String],
+      cols: Seq[Column],
+      metadata: Seq[Metadata]): DataFrame = {
+    val newCols = colNames.zip(cols).zip(metadata).map { case ((colName, col), metadata) =>
+      col.as(colName, metadata)
+    }
+    withColumns(colNames, newCols)
+  }
+
   /**
   * Returns a new Dataset by adding a column with metadata.
   */
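A short sketch of the intended semantics of the public overload (the DataFrame and column names here are illustrative): names that already exist in the schema are replaced in place, and the remaining names are appended at the end.

import org.apache.spark.sql.functions.col

// Illustrative: `people` is assumed to be an existing DataFrame with columns "name" and "age".
val updated = people.withColumns(
  Seq("age", "ageInMonths"),
  Seq(col("age") + 1, col("age") * 12))
// "age" is replaced in its original position; "ageInMonths" is appended,
// so the resulting schema is: name, age, ageInMonths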

sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala

Lines changed: 20 additions & 0 deletions
@@ -555,6 +555,17 @@ class DataFrameSuite extends QueryTest with SharedSQLContext {
     assert(df.schema.map(_.name) === Seq("key", "value", "newCol"))
   }
 
+  test("withColumns") {
+    val df = testData.toDF().withColumns(Seq("newCol1", "newCol2"),
+      Seq(col("key") + 1, col("key") + 2))
+    checkAnswer(
+      df,
+      testData.collect().map { case Row(key: Int, value: String) =>
+        Row(key, value, key + 1, key + 2)
+      }.toSeq)
+    assert(df.schema.map(_.name) === Seq("key", "value", "newCol1", "newCol2"))
+  }
+
   test("replace column using withColumn") {
     val df2 = sparkContext.parallelize(Array(1, 2, 3)).toDF("x")
     val df3 = df2.withColumn("x", df2("x") + 1)
@@ -563,6 +574,15 @@ class DataFrameSuite extends QueryTest with SharedSQLContext {
       Row(2) :: Row(3) :: Row(4) :: Nil)
   }
 
+  test("replace column using withColumns") {
+    val df2 = sparkContext.parallelize(Array((1, 2), (2, 3), (3, 4))).toDF("x", "y")
+    val df3 = df2.withColumns(Seq("x", "newCol1", "newCol2"),
+      Seq(df2("x") + 1, df2("y"), df2("y") + 1))
+    checkAnswer(
+      df3.select("x", "newCol1", "newCol2"),
+      Row(2, 2, 3) :: Row(3, 3, 4) :: Row(4, 4, 5) :: Nil)
+  }
+
   test("drop column using drop") {
     val df = testData.drop("key")
     checkAnswer(
