apache · viirya · Oct 18, 2017 · Oct 19, 2017 · Oct 20, 2017 · Oct 20, 2017
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala
@@ -41,8 +41,12 @@ import org.apache.spark.sql.types.{DoubleType, NumericType, StructType}
  * The output vectors are sparse.
  *
  * @see `StringIndexer` for converting categorical values into category indices
+ * @deprecated `OneHotEncoderEstimator` will be renamed `OneHotEncoder` and this `OneHotEncoder`
+ * will be removed in 3.0.0.
  */
 @Since("1.4.0")
+@deprecated("`OneHotEncoderEstimator` will be renamed `OneHotEncoder` and this `OneHotEncoder`" +
+  " will be removed in 3.0.0.", "2.3.0")
 class OneHotEncoder @Since("1.4.0") (@Since("1.4.0") override val uid: String) extends Transformer
   with HasInputCol with HasOutputCol with DefaultParamsWritable {
 
@@ -78,56 +82,16 @@ class OneHotEncoder @Since("1.4.0") (@Since("1.4.0") override val uid: String) e
   override def transformSchema(schema: StructType): StructType = {
     val inputColName = $(inputCol)
     val outputColName = $(outputCol)
+    val inputFields = schema.fields
 
     require(schema(inputColName).dataType.isInstanceOf[NumericType],
       s"Input column must be of type NumericType but got ${schema(inputColName).dataType}")
-    val inputFields = schema.fields
     require(!inputFields.exists(_.name == outputColName),
       s"Output column $outputColName already exists.")
 
-    val inputAttr = Attribute.fromStructField(schema(inputColName))
-    val outputAttrNames: Option[Array[String]] = inputAttr match {
-      case nominal: NominalAttribute =>
-        if (nominal.values.isDefined) {
-          nominal.values
-        } else if (nominal.numValues.isDefined) {
-          nominal.numValues.map(n => Array.tabulate(n)(_.toString))
-        } else {
-          None
-        }
-      case binary: BinaryAttribute =>
-        if (binary.values.isDefined) {
-          binary.values
-        } else {
-          Some(Array.tabulate(2)(_.toString))
-        }
-      case _: NumericAttribute =>
-        throw new RuntimeException(
-          s"The input column $inputColName cannot be numeric.")
-      case _ =>
-        None // optimistic about unknown attributes
-    }
-
-    val filteredOutputAttrNames = outputAttrNames.map { names =>
-      if ($(dropLast)) {
-        require(names.length > 1,
-          s"The input column $inputColName should have at least two distinct values.")
-        names.dropRight(1)
-      } else {
-        names
-      }
-    }
-
-    val outputAttrGroup = if (filteredOutputAttrNames.isDefined) {
-      val attrs: Array[Attribute] = filteredOutputAttrNames.get.map { name =>
-        BinaryAttribute.defaultAttr.withName(name)
-      }
-      new AttributeGroup($(outputCol), attrs)
-    } else {
-      new AttributeGroup($(outputCol))
-    }
-
-    val outputFields = inputFields :+ outputAttrGroup.toStructField()
+    val outputField = OneHotEncoderCommon.transformOutputColumnSchema(
+      schema(inputColName), outputColName, $(dropLast))
+    val outputFields = inputFields :+ outputField
     StructType(outputFields)
   }
 
@@ -136,30 +100,17 @@ class OneHotEncoder @Since("1.4.0") (@Since("1.4.0") override val uid: String) e
     // schema transformation
     val inputColName = $(inputCol)
     val outputColName = $(outputCol)
-    val shouldDropLast = $(dropLast)
-    var outputAttrGroup = AttributeGroup.fromStructField(
+
+    val outputAttrGroupFromSchema = AttributeGroup.fromStructField(
       transformSchema(dataset.schema)(outputColName))
-    if (outputAttrGroup.size < 0) {
-      // If the number of attributes is unknown, we check the values from the input column.
-      val numAttrs = dataset.select(col(inputColName).cast(DoubleType)).rdd.map(_.getDouble(0))
-        .treeAggregate(0.0)(
-          (m, x) => {
-            assert(x <= Int.MaxValue,
-              s"OneHotEncoder only supports up to ${Int.MaxValue} indices, but got $x")
-            assert(x >= 0.0 && x == x.toInt,
-              s"Values from column $inputColName must be indices, but got $x.")
-            math.max(m, x)
-          },
-          (m0, m1) => {
-            math.max(m0, m1)
-          }
-        ).toInt + 1
-      val outputAttrNames = Array.tabulate(numAttrs)(_.toString)
-      val filtered = if (shouldDropLast) outputAttrNames.dropRight(1) else outputAttrNames
-      val outputAttrs: Array[Attribute] =
-        filtered.map(name => BinaryAttribute.defaultAttr.withName(name))
-      outputAttrGroup = new AttributeGroup(outputColName, outputAttrs)
+
+    val outputAttrGroup = if (outputAttrGroupFromSchema.size < 0) {
+      OneHotEncoderCommon.getOutputAttrGroupFromData(
+        dataset, Seq(inputColName), Seq(outputColName), $(dropLast))(0)
+    } else {
+      outputAttrGroupFromSchema
     }
+
     val metadata = outputAttrGroup.toMetadata()
 
     // data transformation