[SPARK-8049] [MLLIB] drop tmp col from OneVsRest output

mengxr · jeanlyn · commit 53fc7dbb9520 · 2015-06-12T13:18:07.000+08:00
The temporary column should be dropped after we get the prediction column. harsha2010

Author: Xiangrui Meng <meng@databricks.com>

Closes apache#6592 from mengxr/SPARK-8049 and squashes the following commits:

1d89107 [Xiangrui Meng] use SparkFunSuite
6ee70de [Xiangrui Meng] drop tmp col from OneVsRest output
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala
@@ -131,6 +131,7 @@ final class OneVsRestModel private[ml] (
     // output label and label metadata as prediction
     val labelUdf = callUDF(label, DoubleType, col(accColName))
     aggregatedDataset.withColumn($(predictionCol), labelUdf.as($(predictionCol), labelMetadata))
+      .drop(accColName)
   }
 }
 
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
@@ -93,6 +93,15 @@ class OneVsRestSuite extends SparkFunSuite with MLlibTestSparkContext {
     val datasetWithLabelMetadata = dataset.select(labelWithMetadata, features)
     ova.fit(datasetWithLabelMetadata)
   }
+
+  test("SPARK-8049: OneVsRest shouldn't output temp columns") {
+    val logReg = new LogisticRegression()
+      .setMaxIter(1)
+    val ovr = new OneVsRest()
+      .setClassifier(logReg)
+    val output = ovr.fit(dataset).transform(dataset)
+    assert(output.schema.fieldNames.toSet === Set("label", "features", "prediction"))
+  }
 }
 
 private class MockLogisticRegression(uid: String) extends LogisticRegression(uid) {

Original file line number	Diff line number	Diff line change
`@@ -131,6 +131,7 @@ final class OneVsRestModel private[ml] (`
`131`	`131`	`// output label and label metadata as prediction`
`132`	`132`	`val labelUdf = callUDF(label, DoubleType, col(accColName))`
`133`	`133`	`aggregatedDataset.withColumn($(predictionCol), labelUdf.as($(predictionCol), labelMetadata))`
	`134`	`+ .drop(accColName)`
`134`	`135`	`}`
`135`	`136`	`}`
`136`	`137`