From cae52921a1491f2dee9d130aaeadc21b7e9cb168 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Wed, 9 Dec 2015 15:25:20 +0900 Subject: [PATCH 1/2] Support to drop multiple columns by reference --- .../org/apache/spark/sql/DataFrame.scala | 41 ++++++++++++++----- .../org/apache/spark/sql/DataFrameSuite.scala | 7 ++++ 2 files changed, 37 insertions(+), 11 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index 243a8c853f90e..f4af6c3768b0a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -1261,7 +1261,7 @@ class DataFrame private[sql]( * @since 1.4.0 */ def drop(colName: String): DataFrame = { - drop(Seq(colName) : _*) + drop(colName, Seq() : _*) } /** @@ -1271,10 +1271,11 @@ class DataFrame private[sql]( * @since 1.6.0 */ @scala.annotation.varargs - def drop(colNames: String*): DataFrame = { + def drop(colName: String, colNames: String*): DataFrame = { val resolver = sqlContext.analyzer.resolver - val remainingCols = - schema.filter(f => colNames.forall(n => !resolver(f.name, n))).map(f => Column(f.name)) + val remainingCols = schema.filter { f => + (colName +: colNames).forall(n => !resolver(f.name, n)) + }.map(f => Column(f.name)) if (remainingCols.size == this.schema.size) { this } else { @@ -1291,16 +1292,34 @@ class DataFrame private[sql]( * @since 1.4.1 */ def drop(col: Column): DataFrame = { - val expression = col match { + drop(Seq(col) : _*) + } + + /** + * Returns a new [[DataFrame]] with a column dropped. + * This version of drop accepts Column(s) rather than name(s). + * This is a no-op if the DataFrame doesn't have column(s) + * with an equivalent expression. + * @group dfops + * @since 1.6.0 + */ + @scala.annotation.varargs + def drop(cols: Column*): DataFrame = { + val resolver = sqlContext.analyzer.resolver + val attrs = this.logicalPlan.output + val expressions = cols.map { case Column(u: UnresolvedAttribute) => - queryExecution.analyzed.resolveQuoted(u.name, sqlContext.analyzer.resolver).getOrElse(u) + queryExecution.analyzed.resolveQuoted(u.name, resolver).getOrElse(u) case Column(expr: Expression) => expr } - val attrs = this.logicalPlan.output - val colsAfterDrop = attrs.filter { attr => - attr != expression - }.map(attr => Column(attr)) - select(colsAfterDrop : _*) + val remainingCols = attrs.filter { attr => + !expressions.contains(attr) + }.map(attr => Column(attr)) + if (remainingCols.size == this.schema.size) { + this + } else { + this.select(remainingCols: _*) + } } /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 605a6549dd686..6272dbf66eb67 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -402,6 +402,13 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { assert(df.schema.map(_.name) === Seq("value")) } + test("drop column using drop with column references") { + val src = Seq((0, 2, 3)).toDF("a", "b", "c") + val df = src.drop(src("a"), src("b")) + checkAnswer(df, Row(3)) + assert(df.schema.map(_.name) === Seq("c")) + } + test("drop unknown column (no-op) with column reference") { val col = Column("random") val df = testData.drop(col) From 22235269995f5c525da0d9aef948803883ca91ae Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Wed, 9 Dec 2015 15:36:58 +0900 Subject: [PATCH 2/2] Change comments --- sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index f4af6c3768b0a..d86afddd7eee3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -1296,10 +1296,10 @@ class DataFrame private[sql]( } /** - * Returns a new [[DataFrame]] with a column dropped. + * Returns a new [[DataFrame]] with columns dropped. * This version of drop accepts Column(s) rather than name(s). * This is a no-op if the DataFrame doesn't have column(s) - * with an equivalent expression. + * with equivalent expression(s). * @group dfops * @since 1.6.0 */