From 443538df8a4d0d1ea4026fcecc32aa32b1de5f34 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Tue, 6 Jan 2015 21:07:22 +0800 Subject: [PATCH 1/4] Trims aliases when resolving and checking aggregate expressions --- .../apache/spark/sql/catalyst/analysis/Analyzer.scala | 8 ++++---- .../apache/spark/sql/catalyst/planning/patterns.scala | 8 ++++---- .../scala/org/apache/spark/sql/SQLQuerySuite.scala | 11 +++++++++++ 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index c009cc1e1e85c..43a6ec708b832 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -209,10 +209,10 @@ class Analyzer(catalog: Catalog, aggregateExprs.find { e => !isValidAggregateExpression(e.transform { - // Should trim aliases around `GetField`s. These aliases are introduced while - // resolving struct field accesses, because `GetField` is not a `NamedExpression`. - // (Should we just turn `GetField` into a `NamedExpression`?) - case Alias(g: GetField, _) => g + // Should trim aliases. These aliases can only be introduced while resolving unnamed + // expressions like `GetField` and UDF calls, because GROUP BY clause doesn't allow + // aliasing. + case a: Alias => a.child }) }.foreach { e => throw new TreeNodeException(plan, s"Expression not in GROUP BY: $e") diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala index 310d127506d68..ec6f1c116b24e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala @@ -153,11 +153,11 @@ object PartialAggregation { partialEvaluations(new TreeNodeRef(e)).finalEvaluation case e: Expression => - // Should trim aliases around `GetField`s. These aliases are introduced while - // resolving struct field accesses, because `GetField` is not a `NamedExpression`. - // (Should we just turn `GetField` into a `NamedExpression`?) namedGroupingExpressions - .get(e.transform { case Alias(g: GetField, _) => g }) + // Should trim aliases. These aliases can only be introduced while resolving unnamed + // expressions like `GetField` and UDF calls, because GROUP BY clause doesn't allow + // aliasing. + .get(e.transform { case a: Alias => a.child }) .map(_.toAttribute) .getOrElse(e) }).asInstanceOf[Seq[NamedExpression]] diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index d9de5686dce48..bc6e3ed31e2ba 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -997,6 +997,17 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll { dropTempTable("data") } + test("SPARK-4296 Grouping field with UDF as sub expression") { + registerFunction("triple", (_: Int) * 3) + jsonRDD(sparkContext.makeRDD("""{"a": 1}""" :: Nil)).registerTempTable("data") + checkAnswer(sql("SELECT triple(a) FROM data GROUP BY triple(a)"), 3) + dropTempTable("data") + + jsonRDD(sparkContext.makeRDD("""{"a": 1}""" :: Nil)).registerTempTable("data") + checkAnswer(sql("SELECT triple(a) + 1 FROM data GROUP BY triple(a) + 1"), 4) + dropTempTable("data") + } + test("SPARK-4432 Fix attribute reference resolution error when using ORDER BY") { checkAnswer( sql("SELECT a + b FROM testData2 ORDER BY a"), From 8b3a274d68064ee2971df8f644298fef811e1efd Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Mon, 12 Jan 2015 13:41:26 -0800 Subject: [PATCH 2/4] Since expressions in grouping expressions can have aliases, which can be used by the outer query block, revert this change. --- .../apache/spark/sql/catalyst/planning/patterns.scala | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala index ec6f1c116b24e..228d67ed7839d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala @@ -143,6 +143,8 @@ object PartialAggregation { // referenced in the second aggregation. val namedGroupingExpressions: Map[Expression, NamedExpression] = groupingExpressions.map { case n: NamedExpression => (n, n) + // TODO: When a UDF is used in the group by clause, for example, GROUP BY YEAR(...), + // PartialGroup will not be a proper name. case other => (other, Alias(other, "PartialGroup")()) }.toMap @@ -153,11 +155,11 @@ object PartialAggregation { partialEvaluations(new TreeNodeRef(e)).finalEvaluation case e: Expression => + // Should trim aliases around `GetField`s. These aliases are introduced while + // resolving struct field accesses, because `GetField` is not a `NamedExpression`. + // (Should we just turn `GetField` into a `NamedExpression`?) namedGroupingExpressions - // Should trim aliases. These aliases can only be introduced while resolving unnamed - // expressions like `GetField` and UDF calls, because GROUP BY clause doesn't allow - // aliasing. - .get(e.transform { case a: Alias => a.child }) + .get(e.transform { case Alias(g: GetField, _) => g }) .map(_.toAttribute) .getOrElse(e) }).asInstanceOf[Seq[NamedExpression]] From d42b70719711d9589d5919c2c767b3a26e2f6ee8 Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Mon, 12 Jan 2015 13:47:34 -0800 Subject: [PATCH 3/4] Update comment. --- .../org/apache/spark/sql/catalyst/analysis/Analyzer.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 43a6ec708b832..9d35d571476e5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -209,9 +209,9 @@ class Analyzer(catalog: Catalog, aggregateExprs.find { e => !isValidAggregateExpression(e.transform { - // Should trim aliases. These aliases can only be introduced while resolving unnamed - // expressions like `GetField` and UDF calls, because GROUP BY clause doesn't allow - // aliasing. + // Should trim aliases. These aliases can be introduced while resolving unnamed + // expressions like `GetField` and UDF calls, and when a user explicitly assigns + // an alias to a grouping expression in the SELECT clause. case a: Alias => a.child }) }.foreach { e => From 6cfadd245e871e4f63543ed4241203cc3aad5b9a Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Mon, 12 Jan 2015 14:36:21 -0800 Subject: [PATCH 4/4] Actually, this issue has been fixed by 3684fd21e1ffdc0adaad8ff6b31394b637e866ce. --- .../spark/sql/catalyst/analysis/Analyzer.scala | 8 ++++---- .../spark/sql/catalyst/planning/patterns.scala | 2 -- .../org/apache/spark/sql/SQLQuerySuite.scala | 11 ----------- .../spark/sql/hive/execution/SQLQuerySuite.scala | 15 +++++++++++++++ 4 files changed, 19 insertions(+), 17 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 9d35d571476e5..c009cc1e1e85c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -209,10 +209,10 @@ class Analyzer(catalog: Catalog, aggregateExprs.find { e => !isValidAggregateExpression(e.transform { - // Should trim aliases. These aliases can be introduced while resolving unnamed - // expressions like `GetField` and UDF calls, and when a user explicitly assigns - // an alias to a grouping expression in the SELECT clause. - case a: Alias => a.child + // Should trim aliases around `GetField`s. These aliases are introduced while + // resolving struct field accesses, because `GetField` is not a `NamedExpression`. + // (Should we just turn `GetField` into a `NamedExpression`?) + case Alias(g: GetField, _) => g }) }.foreach { e => throw new TreeNodeException(plan, s"Expression not in GROUP BY: $e") diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala index 228d67ed7839d..310d127506d68 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala @@ -143,8 +143,6 @@ object PartialAggregation { // referenced in the second aggregation. val namedGroupingExpressions: Map[Expression, NamedExpression] = groupingExpressions.map { case n: NamedExpression => (n, n) - // TODO: When a UDF is used in the group by clause, for example, GROUP BY YEAR(...), - // PartialGroup will not be a proper name. case other => (other, Alias(other, "PartialGroup")()) }.toMap diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index bc6e3ed31e2ba..d9de5686dce48 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -997,17 +997,6 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll { dropTempTable("data") } - test("SPARK-4296 Grouping field with UDF as sub expression") { - registerFunction("triple", (_: Int) * 3) - jsonRDD(sparkContext.makeRDD("""{"a": 1}""" :: Nil)).registerTempTable("data") - checkAnswer(sql("SELECT triple(a) FROM data GROUP BY triple(a)"), 3) - dropTempTable("data") - - jsonRDD(sparkContext.makeRDD("""{"a": 1}""" :: Nil)).registerTempTable("data") - checkAnswer(sql("SELECT triple(a) + 1 FROM data GROUP BY triple(a) + 1"), 4) - dropTempTable("data") - } - test("SPARK-4432 Fix attribute reference resolution error when using ORDER BY") { checkAnswer( sql("SELECT a + b FROM testData2 ORDER BY a"), diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index c1c3683f84ab2..c68317b9d8ddc 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -214,4 +214,19 @@ class SQLQuerySuite extends QueryTest { Seq.empty[Row]) } } + + test("SPARK-4296 Grouping field with Hive UDF as sub expression") { + val rdd = sparkContext.makeRDD("""{"a": "str", "b":"1", "c":"1970-01-01 00:00:00"}""" :: Nil) + jsonRDD(rdd).registerTempTable("data") + checkAnswer( + sql("SELECT concat(a, '-', b), year(c) FROM data GROUP BY concat(a, '-', b), year(c)"), + Seq(Seq("str-1", 1970))) + + dropTempTable("data") + + jsonRDD(rdd).registerTempTable("data") + checkAnswer(sql("SELECT year(c) + 1 FROM data GROUP BY year(c) + 1"), Seq(Seq(1971))) + + dropTempTable("data") + } }