From 6aa6fc737dc0f7df7846dcd7aff5d3674d79d9ea Mon Sep 17 00:00:00 2001 From: Cheng Pan Date: Thu, 14 Dec 2023 18:11:06 +0800 Subject: [PATCH 1/6] [SPARK-28386] Cannot resolve ORDER BY columns with GROUP BY and HAVING --- .../spark/sql/catalyst/analysis/Analyzer.scala | 9 +++++++++ .../catalyst/analysis/ColumnResolutionHelper.scala | 6 ++++-- .../sql/catalyst/analysis/AnalysisSuite.scala | 14 ++++++++++++++ .../sql-tests/analyzer-results/having.sql.out | 14 ++++++++++++++ .../udf/postgreSQL/udf-select_having.sql.out | 11 +++++------ .../src/test/resources/sql-tests/inputs/having.sql | 3 +++ .../resources/sql-tests/results/having.sql.out | 9 +++++++++ 7 files changed, 58 insertions(+), 8 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index e3538647e3754..94f6d33462656 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -2690,6 +2690,15 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor } s.copy(order = newSortOrder, child = newChild) }) + + case s @ Sort(_, _, f @ Filter(cond, agg: Aggregate)) + if agg.resolved && cond.resolved && s.order.forall(_.resolved) => + resolveOperatorWithAggregate(s.order.map(_.child), agg, (newExprs, newChild) => { + val newSortOrder = s.order.zip(newExprs).map { + case (sortOrder, expr) => sortOrder.copy(child = expr) + } + s.copy(order = newSortOrder, child = f.copy(child = newChild)) + }) } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ColumnResolutionHelper.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ColumnResolutionHelper.scala index a90c61565039d..b5a1974183a05 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ColumnResolutionHelper.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ColumnResolutionHelper.scala @@ -323,12 +323,14 @@ trait ColumnResolutionHelper extends Logging with DataTypeErrorsBase { } // Resolves `UnresolvedAttribute` to `TempResolvedColumn` via `plan.child.output` if plan is an - // `Aggregate`. If `TempResolvedColumn` doesn't end up as aggregate function input or grouping - // column, we will undo the column resolution later to avoid confusing error message. E,g,, if + // `Aggregate` or `Filter(_, Aggregate)`. + // If `TempResolvedColumn` doesn't end up as aggregate function input or grouping + // column, we will undo the column resolution later to avoid confusing error message. E.g., if // a table `t` has columns `c1` and `c2`, for query `SELECT ... FROM t GROUP BY c1 HAVING c2 = 0`, // even though we can resolve column `c2` here, we should undo it and fail with // "Column c2 not found". protected def resolveColWithAgg(e: Expression, plan: LogicalPlan): Expression = plan match { + case Filter(_, agg: Aggregate) => resolveColWithAgg(e, agg) case agg: Aggregate => e.transformWithPruning(_.containsAnyPattern(UNRESOLVED_ATTRIBUTE)) { case u: UnresolvedAttribute => diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala index 1dce989073260..c56d3cd8d29c6 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala @@ -1415,6 +1415,20 @@ class AnalysisSuite extends AnalysisTest with Matchers { ) } + test("SPARK-28386: Cannot resolve ORDER BY columns with GROUP BY and HAVING") { + assertAnalysisSuccess( + parsePlan( + """ + |WITH t1 as (SELECT 1 id, 'one' name) + |SELECT LENGTH(name), COUNT(id) + |FROM t1 + |GROUP BY LENGTH(name) + |HAVING COUNT(id) > 1 + |ORDER BY LENGTH(name); + |""".stripMargin + )) + } + test("SPARK-39354: should be [TABLE_OR_VIEW_NOT_FOUND]") { assertAnalysisErrorClass(parsePlan( s""" diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/having.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/having.sql.out index 12eb5a34146a2..07a9d99018410 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/having.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/having.sql.out @@ -179,3 +179,17 @@ Filter (c1#x = 1) +- Aggregate [c1#x], [c1#x] +- SubqueryAlias t +- LocalRelation [c1#x, c2#x] + + +-- !query +SELECT k, sum(v) FROM hav GROUP BY k HAVING sum(v) > 2 ORDER BY sum(v) +-- !query analysis +Sort [sum(v)#xL ASC NULLS FIRST], true ++- Filter (sum(v)#xL > cast(2 as bigint)) + +- Aggregate [k#x], [k#x, sum(v#x) AS sum(v)#xL] + +- SubqueryAlias hav + +- View (`hav`, [k#x,v#x]) + +- Project [cast(k#x as string) AS k#x, cast(v#x as int) AS v#x] + +- Project [k#x, v#x] + +- SubqueryAlias hav + +- LocalRelation [k#x, v#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/postgreSQL/udf-select_having.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/postgreSQL/udf-select_having.sql.out index 5f7e92a62f302..ea6b1716869ff 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/postgreSQL/udf-select_having.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/postgreSQL/udf-select_having.sql.out @@ -102,12 +102,11 @@ Project [udf(b)#x, udf(c)#x] SELECT udf(b), udf(c) FROM test_having GROUP BY b, c HAVING udf(b) = 3 ORDER BY udf(b), udf(c) -- !query analysis -Project [udf(b)#x, udf(c)#x] -+- Sort [cast(udf(cast(b#x as string)) as int) ASC NULLS FIRST, cast(udf(cast(c#x as string)) as string) ASC NULLS FIRST], true - +- Filter (udf(b)#x = 3) - +- Aggregate [b#x, c#x], [cast(udf(cast(b#x as string)) as int) AS udf(b)#x, cast(udf(cast(c#x as string)) as string) AS udf(c)#x, b#x, c#x] - +- SubqueryAlias spark_catalog.default.test_having - +- Relation spark_catalog.default.test_having[a#x,b#x,c#x,d#x] parquet +Sort [udf(b)#x ASC NULLS FIRST, udf(c)#x ASC NULLS FIRST], true ++- Filter (udf(b)#x = 3) + +- Aggregate [b#x, c#x], [cast(udf(cast(b#x as string)) as int) AS udf(b)#x, cast(udf(cast(c#x as string)) as string) AS udf(c)#x] + +- SubqueryAlias spark_catalog.default.test_having + +- Relation spark_catalog.default.test_having[a#x,b#x,c#x,d#x] parquet -- !query diff --git a/sql/core/src/test/resources/sql-tests/inputs/having.sql b/sql/core/src/test/resources/sql-tests/inputs/having.sql index 056b99e363d21..4245d5153f330 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/having.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/having.sql @@ -33,3 +33,6 @@ SELECT c1 FROM VALUES (1, 2) as t(c1, c2) GROUP BY GROUPING SETS(t.c1) HAVING t. SELECT c1 FROM VALUES (1, 2) as t(c1, c2) GROUP BY CUBE(t.c1) HAVING t.c1 = 1; SELECT c1 FROM VALUES (1, 2) as t(c1, c2) GROUP BY ROLLUP(t.c1) HAVING t.c1 = 1; SELECT c1 FROM VALUES (1, 2) as t(c1, c2) GROUP BY t.c1 HAVING t.c1 = 1; + +-- SPARK-28386: Cannot resolve ORDER BY columns with GROUP BY and HAVING +SELECT k, sum(v) FROM hav GROUP BY k HAVING sum(v) > 2 ORDER BY sum(v) diff --git a/sql/core/src/test/resources/sql-tests/results/having.sql.out b/sql/core/src/test/resources/sql-tests/results/having.sql.out index 6eaba0b4119cb..32846c158c76d 100644 --- a/sql/core/src/test/resources/sql-tests/results/having.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/having.sql.out @@ -134,3 +134,12 @@ SELECT c1 FROM VALUES (1, 2) as t(c1, c2) GROUP BY t.c1 HAVING t.c1 = 1 struct -- !query output 1 + + +-- !query +SELECT k, sum(v) FROM hav GROUP BY k HAVING sum(v) > 2 ORDER BY sum(v) +-- !query schema +struct +-- !query output +three 3 +one 6 From eaafeed279551c9ae7d85a810a057344737f26a5 Mon Sep 17 00:00:00 2001 From: Cheng Pan Date: Thu, 14 Dec 2023 20:34:43 +0800 Subject: [PATCH 2/6] regenerate tpcds plan --- .../approved-plans-v2_7/q6.sf100/explain.txt | 8 ++++---- .../approved-plans-v2_7/q6.sf100/simplified.txt | 2 +- .../approved-plans-v2_7/q6/explain.txt | 8 ++++---- .../approved-plans-v2_7/q6/simplified.txt | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v2_7/q6.sf100/explain.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v2_7/q6.sf100/explain.txt index afdfc51a17dd4..82a6e00c79c4b 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v2_7/q6.sf100/explain.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v2_7/q6.sf100/explain.txt @@ -248,15 +248,15 @@ Input [2]: [ca_state#18, count#22] Keys [1]: [ca_state#18] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#23] -Results [3]: [ca_state#18 AS state#24, count(1)#23 AS cnt#25, ca_state#18] +Results [2]: [ca_state#18 AS state#24, count(1)#23 AS cnt#25] (44) Filter [codegen id : 14] -Input [3]: [state#24, cnt#25, ca_state#18] +Input [2]: [state#24, cnt#25] Condition : (cnt#25 >= 10) (45) TakeOrderedAndProject -Input [3]: [state#24, cnt#25, ca_state#18] -Arguments: 100, [cnt#25 ASC NULLS FIRST, ca_state#18 ASC NULLS FIRST], [state#24, cnt#25] +Input [2]: [state#24, cnt#25] +Arguments: 100, [cnt#25 ASC NULLS FIRST, state#24 ASC NULLS FIRST], [state#24, cnt#25] ===== Subqueries ===== diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v2_7/q6.sf100/simplified.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v2_7/q6.sf100/simplified.txt index 7339df16a2895..d69eb47d92c88 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v2_7/q6.sf100/simplified.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v2_7/q6.sf100/simplified.txt @@ -1,4 +1,4 @@ -TakeOrderedAndProject [cnt,ca_state,state] +TakeOrderedAndProject [cnt,state] WholeStageCodegen (14) Filter [cnt] HashAggregate [ca_state,count] [count(1),state,cnt,count] diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v2_7/q6/explain.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v2_7/q6/explain.txt index a2638dac56456..507d4991a046a 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v2_7/q6/explain.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v2_7/q6/explain.txt @@ -218,15 +218,15 @@ Input [2]: [ca_state#2, count#22] Keys [1]: [ca_state#2] Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#23] -Results [3]: [ca_state#2 AS state#24, count(1)#23 AS cnt#25, ca_state#2] +Results [2]: [ca_state#2 AS state#24, count(1)#23 AS cnt#25] (38) Filter [codegen id : 8] -Input [3]: [state#24, cnt#25, ca_state#2] +Input [2]: [state#24, cnt#25] Condition : (cnt#25 >= 10) (39) TakeOrderedAndProject -Input [3]: [state#24, cnt#25, ca_state#2] -Arguments: 100, [cnt#25 ASC NULLS FIRST, ca_state#2 ASC NULLS FIRST], [state#24, cnt#25] +Input [2]: [state#24, cnt#25] +Arguments: 100, [cnt#25 ASC NULLS FIRST, state#24 ASC NULLS FIRST], [state#24, cnt#25] ===== Subqueries ===== diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v2_7/q6/simplified.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v2_7/q6/simplified.txt index c9c8358ba0b9f..a15b638fbb2c0 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v2_7/q6/simplified.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v2_7/q6/simplified.txt @@ -1,4 +1,4 @@ -TakeOrderedAndProject [cnt,ca_state,state] +TakeOrderedAndProject [cnt,state] WholeStageCodegen (8) Filter [cnt] HashAggregate [ca_state,count] [count(1),state,cnt,count] From f423022ff504bf97e27695a09ef1eca0c7d041e5 Mon Sep 17 00:00:00 2001 From: Cheng Pan Date: Fri, 15 Dec 2023 11:31:20 +0800 Subject: [PATCH 3/6] address comments --- .../analysis/ColumnResolutionHelper.scala | 6 ++---- .../analysis/ResolveReferencesInSort.scala | 16 ++++++++++------ 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ColumnResolutionHelper.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ColumnResolutionHelper.scala index b5a1974183a05..a90c61565039d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ColumnResolutionHelper.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ColumnResolutionHelper.scala @@ -323,14 +323,12 @@ trait ColumnResolutionHelper extends Logging with DataTypeErrorsBase { } // Resolves `UnresolvedAttribute` to `TempResolvedColumn` via `plan.child.output` if plan is an - // `Aggregate` or `Filter(_, Aggregate)`. - // If `TempResolvedColumn` doesn't end up as aggregate function input or grouping - // column, we will undo the column resolution later to avoid confusing error message. E.g., if + // `Aggregate`. If `TempResolvedColumn` doesn't end up as aggregate function input or grouping + // column, we will undo the column resolution later to avoid confusing error message. E,g,, if // a table `t` has columns `c1` and `c2`, for query `SELECT ... FROM t GROUP BY c1 HAVING c2 = 0`, // even though we can resolve column `c2` here, we should undo it and fail with // "Column c2 not found". protected def resolveColWithAgg(e: Expression, plan: LogicalPlan): Expression = plan match { - case Filter(_, agg: Aggregate) => resolveColWithAgg(e, agg) case agg: Aggregate => e.transformWithPruning(_.containsAnyPattern(UNRESOLVED_ATTRIBUTE)) { case u: UnresolvedAttribute => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInSort.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInSort.scala index 02583ebb8f6ba..6fa723d4a75fd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInSort.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInSort.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.SQLConfHelper import org.apache.spark.sql.catalyst.expressions.SortOrder -import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project, Sort} +import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Filter, LogicalPlan, Project, Sort} import org.apache.spark.sql.connector.catalog.CatalogManager /** @@ -28,10 +28,11 @@ import org.apache.spark.sql.connector.catalog.CatalogManager * includes metadata columns as well. * 2. Resolves the column to a literal function which is allowed to be invoked without braces, e.g. * `SELECT col, current_date FROM t`. - * 3. If the child plan is Aggregate, resolves the column to [[TempResolvedColumn]] with the output - * of Aggregate's child plan. This is to allow Sort to host grouping expressions and aggregate - * functions, which can be pushed down to the Aggregate later. For example, - * `SELECT max(a) FROM t GROUP BY b ORDER BY min(a)`. + * 3. If the child plan is Aggregate or Filter(_, Aggregate), resolves the column to + * [[TempResolvedColumn]] with the output of Aggregate's child plan. + * This is to allow Sort to host grouping expressions and aggregate functions, which can + * be pushed down to the Aggregate later. For example, + * `SELECT max(a) FROM t GROUP BY b HAVING max(a) > 1 ORDER BY min(a)`. * 4. Resolves the column to [[AttributeReference]] with the output of a descendant plan node. * Spark will propagate the missing attributes from the descendant plan node to the Sort node. * This is to allow users to ORDER BY columns that are not in the SELECT clause, which is @@ -51,7 +52,10 @@ class ResolveReferencesInSort(val catalogManager: CatalogManager) def apply(s: Sort): LogicalPlan = { val resolvedBasic = s.order.map(resolveExpressionByPlanOutput(_, s.child)) - val resolvedWithAgg = resolvedBasic.map(resolveColWithAgg(_, s.child)) + val resolvedWithAgg = s.child match { + case Filter(_, agg: Aggregate) => resolvedBasic.map(resolveColWithAgg(_, agg)) + case _ => resolvedBasic.map(resolveColWithAgg(_, s.child)) + } val (missingAttrResolved, newChild) = resolveExprsAndAddMissingAttrs(resolvedWithAgg, s.child) val orderByAllResolved = resolveOrderByAll( s.global, newChild, missingAttrResolved.map(_.asInstanceOf[SortOrder])) From 108d742e84055cce26170c68aeb6c3bdbc82b8c8 Mon Sep 17 00:00:00 2001 From: Cheng Pan Date: Tue, 19 Dec 2023 16:21:07 +0800 Subject: [PATCH 4/6] add more test cases --- .../sql-tests/analyzer-results/having.sql.out | 157 ++++++++++++++++++ .../resources/sql-tests/inputs/having.sql | 25 ++- .../sql-tests/results/having.sql.out | 105 ++++++++++++ 3 files changed, 285 insertions(+), 2 deletions(-) diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/having.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/having.sql.out index 07a9d99018410..bc1b515a67ac2 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/having.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/having.sql.out @@ -181,6 +181,134 @@ Filter (c1#x = 1) +- LocalRelation [c1#x, c2#x] +-- !query +SELECT k FROM hav GROUP BY k ORDER BY k +-- !query analysis +Sort [k#x ASC NULLS FIRST], true ++- Aggregate [k#x], [k#x] + +- SubqueryAlias hav + +- View (`hav`, [k#x,v#x]) + +- Project [cast(k#x as string) AS k#x, cast(v#x as int) AS v#x] + +- Project [k#x, v#x] + +- SubqueryAlias hav + +- LocalRelation [k#x, v#x] + + +-- !query +SELECT k FROM hav GROUP BY k HAVING sum(v) > 2 ORDER BY k +-- !query analysis +Sort [k#x ASC NULLS FIRST], true ++- Project [k#x] + +- Filter (sum(v#x)#xL > cast(2 as bigint)) + +- Aggregate [k#x], [k#x, sum(v#x) AS sum(v#x)#xL] + +- SubqueryAlias hav + +- View (`hav`, [k#x,v#x]) + +- Project [cast(k#x as string) AS k#x, cast(v#x as int) AS v#x] + +- Project [k#x, v#x] + +- SubqueryAlias hav + +- LocalRelation [k#x, v#x] + + +-- !query +SELECT length(k) FROM hav GROUP BY k ORDER BY k +-- !query analysis +Project [length(k)#x] ++- Sort [k#x ASC NULLS FIRST], true + +- Aggregate [k#x], [length(k#x) AS length(k)#x, k#x] + +- SubqueryAlias hav + +- View (`hav`, [k#x,v#x]) + +- Project [cast(k#x as string) AS k#x, cast(v#x as int) AS v#x] + +- Project [k#x, v#x] + +- SubqueryAlias hav + +- LocalRelation [k#x, v#x] + + +-- !query +SELECT length(k) FROM hav GROUP BY k HAVING sum(v) > 2 ORDER BY k +-- !query analysis +Project [length(k)#x] ++- Sort [k#x ASC NULLS FIRST], true + +- Project [length(k)#x, k#x] + +- Filter (sum(v#x)#xL > cast(2 as bigint)) + +- Aggregate [k#x], [length(k#x) AS length(k)#x, sum(v#x) AS sum(v#x)#xL, k#x] + +- SubqueryAlias hav + +- View (`hav`, [k#x,v#x]) + +- Project [cast(k#x as string) AS k#x, cast(v#x as int) AS v#x] + +- Project [k#x, v#x] + +- SubqueryAlias hav + +- LocalRelation [k#x, v#x] + + +-- !query +SELECT length(k) FROM hav GROUP BY k ORDER BY length(k) +-- !query analysis +Sort [length(k)#x ASC NULLS FIRST], true ++- Aggregate [k#x], [length(k#x) AS length(k)#x] + +- SubqueryAlias hav + +- View (`hav`, [k#x,v#x]) + +- Project [cast(k#x as string) AS k#x, cast(v#x as int) AS v#x] + +- Project [k#x, v#x] + +- SubqueryAlias hav + +- LocalRelation [k#x, v#x] + + +-- !query +SELECT length(k) FROM hav GROUP BY k HAVING max(v) > 2 ORDER BY length(k) +-- !query analysis +Project [length(k)#x] ++- Sort [length(k#x) ASC NULLS FIRST], true + +- Project [length(k)#x, k#x] + +- Filter (max(v#x)#x > 2) + +- Aggregate [k#x], [length(k#x) AS length(k)#x, max(v#x) AS max(v#x)#x, k#x] + +- SubqueryAlias hav + +- View (`hav`, [k#x,v#x]) + +- Project [cast(k#x as string) AS k#x, cast(v#x as int) AS v#x] + +- Project [k#x, v#x] + +- SubqueryAlias hav + +- LocalRelation [k#x, v#x] + + +-- !query +SELECT k FROM hav GROUP BY k ORDER BY length(k) +-- !query analysis +Sort [length(k#x) ASC NULLS FIRST], true ++- Aggregate [k#x], [k#x] + +- SubqueryAlias hav + +- View (`hav`, [k#x,v#x]) + +- Project [cast(k#x as string) AS k#x, cast(v#x as int) AS v#x] + +- Project [k#x, v#x] + +- SubqueryAlias hav + +- LocalRelation [k#x, v#x] + + +-- !query +SELECT k FROM hav GROUP BY k HAVING max(v) > 2 ORDER BY length(k) +-- !query analysis +Sort [length(k#x) ASC NULLS FIRST], true ++- Project [k#x] + +- Filter (max(v#x)#x > 2) + +- Aggregate [k#x], [k#x, max(v#x) AS max(v#x)#x] + +- SubqueryAlias hav + +- View (`hav`, [k#x,v#x]) + +- Project [cast(k#x as string) AS k#x, cast(v#x as int) AS v#x] + +- Project [k#x, v#x] + +- SubqueryAlias hav + +- LocalRelation [k#x, v#x] + + +-- !query +SELECT k, sum(v) FROM hav GROUP BY k ORDER BY sum(v) +-- !query analysis +Sort [sum(v)#xL ASC NULLS FIRST], true ++- Aggregate [k#x], [k#x, sum(v#x) AS sum(v)#xL] + +- SubqueryAlias hav + +- View (`hav`, [k#x,v#x]) + +- Project [cast(k#x as string) AS k#x, cast(v#x as int) AS v#x] + +- Project [k#x, v#x] + +- SubqueryAlias hav + +- LocalRelation [k#x, v#x] + + -- !query SELECT k, sum(v) FROM hav GROUP BY k HAVING sum(v) > 2 ORDER BY sum(v) -- !query analysis @@ -193,3 +321,32 @@ Sort [sum(v)#xL ASC NULLS FIRST], true +- Project [k#x, v#x] +- SubqueryAlias hav +- LocalRelation [k#x, v#x] + + +-- !query +SELECT k, sum(v) FROM hav GROUP BY k ORDER BY avg(v) +-- !query analysis +Project [k#x, sum(v)#xL] ++- Sort [avg(v#x)#x ASC NULLS FIRST], true + +- Aggregate [k#x], [k#x, sum(v#x) AS sum(v)#xL, avg(v#x) AS avg(v#x)#x] + +- SubqueryAlias hav + +- View (`hav`, [k#x,v#x]) + +- Project [cast(k#x as string) AS k#x, cast(v#x as int) AS v#x] + +- Project [k#x, v#x] + +- SubqueryAlias hav + +- LocalRelation [k#x, v#x] + + +-- !query +SELECT k, sum(v) FROM hav GROUP BY k HAVING sum(v) > 2 ORDER BY avg(v) +-- !query analysis +Project [k#x, sum(v)#xL] ++- Sort [avg(v#x)#x ASC NULLS FIRST], true + +- Filter (sum(v)#xL > cast(2 as bigint)) + +- Aggregate [k#x], [k#x, sum(v#x) AS sum(v)#xL, avg(v#x) AS avg(v#x)#x] + +- SubqueryAlias hav + +- View (`hav`, [k#x,v#x]) + +- Project [cast(k#x as string) AS k#x, cast(v#x as int) AS v#x] + +- Project [k#x, v#x] + +- SubqueryAlias hav + +- LocalRelation [k#x, v#x] diff --git a/sql/core/src/test/resources/sql-tests/inputs/having.sql b/sql/core/src/test/resources/sql-tests/inputs/having.sql index 4245d5153f330..02b93b80eee16 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/having.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/having.sql @@ -34,5 +34,26 @@ SELECT c1 FROM VALUES (1, 2) as t(c1, c2) GROUP BY CUBE(t.c1) HAVING t.c1 = 1; SELECT c1 FROM VALUES (1, 2) as t(c1, c2) GROUP BY ROLLUP(t.c1) HAVING t.c1 = 1; SELECT c1 FROM VALUES (1, 2) as t(c1, c2) GROUP BY t.c1 HAVING t.c1 = 1; --- SPARK-28386: Cannot resolve ORDER BY columns with GROUP BY and HAVING -SELECT k, sum(v) FROM hav GROUP BY k HAVING sum(v) > 2 ORDER BY sum(v) +-- SPARK-28386: Resolve ORDER BY column with/without HAVING clause, while the column presents on SELECT list +SELECT k FROM hav GROUP BY k ORDER BY k; +SELECT k FROM hav GROUP BY k HAVING sum(v) > 2 ORDER BY k; + +-- SPARK-28386: Resolve ORDER BY column with/without HAVING clause, while the column does not present on SELECT list +SELECT length(k) FROM hav GROUP BY k ORDER BY k; +SELECT length(k) FROM hav GROUP BY k HAVING sum(v) > 2 ORDER BY k; + +-- SPARK-28386: Resolve ORDER BY scalar function with/without HAVING clause, while the scalar function presents on SELECT list +SELECT length(k) FROM hav GROUP BY k ORDER BY length(k); +SELECT length(k) FROM hav GROUP BY k HAVING max(v) > 2 ORDER BY length(k); + +-- SPARK-28386: Resolve ORDER BY scalar function with/without HAVING clause, while the scalar function does not present on SELECT list +SELECT k FROM hav GROUP BY k ORDER BY length(k); +SELECT k FROM hav GROUP BY k HAVING max(v) > 2 ORDER BY length(k); + +-- SPARK-28386: Resolve ORDER BY agg function with/without HAVING clause, while the agg function presents on SELECT list +SELECT k, sum(v) FROM hav GROUP BY k ORDER BY sum(v); +SELECT k, sum(v) FROM hav GROUP BY k HAVING sum(v) > 2 ORDER BY sum(v); + +-- SPARK-28386: Resolve ORDER BY agg function with/without HAVING clause, while the agg function does not present on SELECT list +SELECT k, sum(v) FROM hav GROUP BY k ORDER BY avg(v); +SELECT k, sum(v) FROM hav GROUP BY k HAVING sum(v) > 2 ORDER BY avg(v); diff --git a/sql/core/src/test/resources/sql-tests/results/having.sql.out b/sql/core/src/test/resources/sql-tests/results/having.sql.out index 32846c158c76d..913d4bf70c16a 100644 --- a/sql/core/src/test/resources/sql-tests/results/having.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/having.sql.out @@ -136,6 +136,92 @@ struct 1 +-- !query +SELECT k FROM hav GROUP BY k ORDER BY k +-- !query schema +struct +-- !query output +one +three +two + + +-- !query +SELECT k FROM hav GROUP BY k HAVING sum(v) > 2 ORDER BY k +-- !query schema +struct +-- !query output +one +three + + +-- !query +SELECT length(k) FROM hav GROUP BY k ORDER BY k +-- !query schema +struct +-- !query output +3 +5 +3 + + +-- !query +SELECT length(k) FROM hav GROUP BY k HAVING sum(v) > 2 ORDER BY k +-- !query schema +struct +-- !query output +3 +5 + + +-- !query +SELECT length(k) FROM hav GROUP BY k ORDER BY length(k) +-- !query schema +struct +-- !query output +3 +3 +5 + + +-- !query +SELECT length(k) FROM hav GROUP BY k HAVING max(v) > 2 ORDER BY length(k) +-- !query schema +struct +-- !query output +3 +5 + + +-- !query +SELECT k FROM hav GROUP BY k ORDER BY length(k) +-- !query schema +struct +-- !query output +one +two +three + + +-- !query +SELECT k FROM hav GROUP BY k HAVING max(v) > 2 ORDER BY length(k) +-- !query schema +struct +-- !query output +one +three + + +-- !query +SELECT k, sum(v) FROM hav GROUP BY k ORDER BY sum(v) +-- !query schema +struct +-- !query output +two 2 +three 3 +one 6 + + -- !query SELECT k, sum(v) FROM hav GROUP BY k HAVING sum(v) > 2 ORDER BY sum(v) -- !query schema @@ -143,3 +229,22 @@ struct -- !query output three 3 one 6 + + +-- !query +SELECT k, sum(v) FROM hav GROUP BY k ORDER BY avg(v) +-- !query schema +struct +-- !query output +two 2 +one 6 +three 3 + + +-- !query +SELECT k, sum(v) FROM hav GROUP BY k HAVING sum(v) > 2 ORDER BY avg(v) +-- !query schema +struct +-- !query output +one 6 +three 3 From 111ec3c21bbbe336dfb2ce5714ed93b159e5bedc Mon Sep 17 00:00:00 2001 From: Cheng Pan Date: Wed, 20 Dec 2023 11:32:06 +0800 Subject: [PATCH 5/6] remove unnecessary ut --- .../sql/catalyst/analysis/AnalysisSuite.scala | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala index c56d3cd8d29c6..1dce989073260 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala @@ -1415,20 +1415,6 @@ class AnalysisSuite extends AnalysisTest with Matchers { ) } - test("SPARK-28386: Cannot resolve ORDER BY columns with GROUP BY and HAVING") { - assertAnalysisSuccess( - parsePlan( - """ - |WITH t1 as (SELECT 1 id, 'one' name) - |SELECT LENGTH(name), COUNT(id) - |FROM t1 - |GROUP BY LENGTH(name) - |HAVING COUNT(id) > 1 - |ORDER BY LENGTH(name); - |""".stripMargin - )) - } - test("SPARK-39354: should be [TABLE_OR_VIEW_NOT_FOUND]") { assertAnalysisErrorClass(parsePlan( s""" From 6c85cf25876d4f4e7e15e251f18e5447910e4ec3 Mon Sep 17 00:00:00 2001 From: Cheng Pan Date: Wed, 20 Dec 2023 11:57:19 +0800 Subject: [PATCH 6/6] ut --- .../sql-tests/analyzer-results/having.sql.out | 142 ------------------ .../resources/sql-tests/inputs/having.sql | 22 +-- .../sql-tests/results/having.sql.out | 96 ------------ 3 files changed, 2 insertions(+), 258 deletions(-) diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/having.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/having.sql.out index bc1b515a67ac2..d96ea1e43e18b 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/having.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/having.sql.out @@ -181,134 +181,6 @@ Filter (c1#x = 1) +- LocalRelation [c1#x, c2#x] --- !query -SELECT k FROM hav GROUP BY k ORDER BY k --- !query analysis -Sort [k#x ASC NULLS FIRST], true -+- Aggregate [k#x], [k#x] - +- SubqueryAlias hav - +- View (`hav`, [k#x,v#x]) - +- Project [cast(k#x as string) AS k#x, cast(v#x as int) AS v#x] - +- Project [k#x, v#x] - +- SubqueryAlias hav - +- LocalRelation [k#x, v#x] - - --- !query -SELECT k FROM hav GROUP BY k HAVING sum(v) > 2 ORDER BY k --- !query analysis -Sort [k#x ASC NULLS FIRST], true -+- Project [k#x] - +- Filter (sum(v#x)#xL > cast(2 as bigint)) - +- Aggregate [k#x], [k#x, sum(v#x) AS sum(v#x)#xL] - +- SubqueryAlias hav - +- View (`hav`, [k#x,v#x]) - +- Project [cast(k#x as string) AS k#x, cast(v#x as int) AS v#x] - +- Project [k#x, v#x] - +- SubqueryAlias hav - +- LocalRelation [k#x, v#x] - - --- !query -SELECT length(k) FROM hav GROUP BY k ORDER BY k --- !query analysis -Project [length(k)#x] -+- Sort [k#x ASC NULLS FIRST], true - +- Aggregate [k#x], [length(k#x) AS length(k)#x, k#x] - +- SubqueryAlias hav - +- View (`hav`, [k#x,v#x]) - +- Project [cast(k#x as string) AS k#x, cast(v#x as int) AS v#x] - +- Project [k#x, v#x] - +- SubqueryAlias hav - +- LocalRelation [k#x, v#x] - - --- !query -SELECT length(k) FROM hav GROUP BY k HAVING sum(v) > 2 ORDER BY k --- !query analysis -Project [length(k)#x] -+- Sort [k#x ASC NULLS FIRST], true - +- Project [length(k)#x, k#x] - +- Filter (sum(v#x)#xL > cast(2 as bigint)) - +- Aggregate [k#x], [length(k#x) AS length(k)#x, sum(v#x) AS sum(v#x)#xL, k#x] - +- SubqueryAlias hav - +- View (`hav`, [k#x,v#x]) - +- Project [cast(k#x as string) AS k#x, cast(v#x as int) AS v#x] - +- Project [k#x, v#x] - +- SubqueryAlias hav - +- LocalRelation [k#x, v#x] - - --- !query -SELECT length(k) FROM hav GROUP BY k ORDER BY length(k) --- !query analysis -Sort [length(k)#x ASC NULLS FIRST], true -+- Aggregate [k#x], [length(k#x) AS length(k)#x] - +- SubqueryAlias hav - +- View (`hav`, [k#x,v#x]) - +- Project [cast(k#x as string) AS k#x, cast(v#x as int) AS v#x] - +- Project [k#x, v#x] - +- SubqueryAlias hav - +- LocalRelation [k#x, v#x] - - --- !query -SELECT length(k) FROM hav GROUP BY k HAVING max(v) > 2 ORDER BY length(k) --- !query analysis -Project [length(k)#x] -+- Sort [length(k#x) ASC NULLS FIRST], true - +- Project [length(k)#x, k#x] - +- Filter (max(v#x)#x > 2) - +- Aggregate [k#x], [length(k#x) AS length(k)#x, max(v#x) AS max(v#x)#x, k#x] - +- SubqueryAlias hav - +- View (`hav`, [k#x,v#x]) - +- Project [cast(k#x as string) AS k#x, cast(v#x as int) AS v#x] - +- Project [k#x, v#x] - +- SubqueryAlias hav - +- LocalRelation [k#x, v#x] - - --- !query -SELECT k FROM hav GROUP BY k ORDER BY length(k) --- !query analysis -Sort [length(k#x) ASC NULLS FIRST], true -+- Aggregate [k#x], [k#x] - +- SubqueryAlias hav - +- View (`hav`, [k#x,v#x]) - +- Project [cast(k#x as string) AS k#x, cast(v#x as int) AS v#x] - +- Project [k#x, v#x] - +- SubqueryAlias hav - +- LocalRelation [k#x, v#x] - - --- !query -SELECT k FROM hav GROUP BY k HAVING max(v) > 2 ORDER BY length(k) --- !query analysis -Sort [length(k#x) ASC NULLS FIRST], true -+- Project [k#x] - +- Filter (max(v#x)#x > 2) - +- Aggregate [k#x], [k#x, max(v#x) AS max(v#x)#x] - +- SubqueryAlias hav - +- View (`hav`, [k#x,v#x]) - +- Project [cast(k#x as string) AS k#x, cast(v#x as int) AS v#x] - +- Project [k#x, v#x] - +- SubqueryAlias hav - +- LocalRelation [k#x, v#x] - - --- !query -SELECT k, sum(v) FROM hav GROUP BY k ORDER BY sum(v) --- !query analysis -Sort [sum(v)#xL ASC NULLS FIRST], true -+- Aggregate [k#x], [k#x, sum(v#x) AS sum(v)#xL] - +- SubqueryAlias hav - +- View (`hav`, [k#x,v#x]) - +- Project [cast(k#x as string) AS k#x, cast(v#x as int) AS v#x] - +- Project [k#x, v#x] - +- SubqueryAlias hav - +- LocalRelation [k#x, v#x] - - -- !query SELECT k, sum(v) FROM hav GROUP BY k HAVING sum(v) > 2 ORDER BY sum(v) -- !query analysis @@ -323,20 +195,6 @@ Sort [sum(v)#xL ASC NULLS FIRST], true +- LocalRelation [k#x, v#x] --- !query -SELECT k, sum(v) FROM hav GROUP BY k ORDER BY avg(v) --- !query analysis -Project [k#x, sum(v)#xL] -+- Sort [avg(v#x)#x ASC NULLS FIRST], true - +- Aggregate [k#x], [k#x, sum(v#x) AS sum(v)#xL, avg(v#x) AS avg(v#x)#x] - +- SubqueryAlias hav - +- View (`hav`, [k#x,v#x]) - +- Project [cast(k#x as string) AS k#x, cast(v#x as int) AS v#x] - +- Project [k#x, v#x] - +- SubqueryAlias hav - +- LocalRelation [k#x, v#x] - - -- !query SELECT k, sum(v) FROM hav GROUP BY k HAVING sum(v) > 2 ORDER BY avg(v) -- !query analysis diff --git a/sql/core/src/test/resources/sql-tests/inputs/having.sql b/sql/core/src/test/resources/sql-tests/inputs/having.sql index 02b93b80eee16..4c25a60c8abb4 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/having.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/having.sql @@ -34,26 +34,8 @@ SELECT c1 FROM VALUES (1, 2) as t(c1, c2) GROUP BY CUBE(t.c1) HAVING t.c1 = 1; SELECT c1 FROM VALUES (1, 2) as t(c1, c2) GROUP BY ROLLUP(t.c1) HAVING t.c1 = 1; SELECT c1 FROM VALUES (1, 2) as t(c1, c2) GROUP BY t.c1 HAVING t.c1 = 1; --- SPARK-28386: Resolve ORDER BY column with/without HAVING clause, while the column presents on SELECT list -SELECT k FROM hav GROUP BY k ORDER BY k; -SELECT k FROM hav GROUP BY k HAVING sum(v) > 2 ORDER BY k; - --- SPARK-28386: Resolve ORDER BY column with/without HAVING clause, while the column does not present on SELECT list -SELECT length(k) FROM hav GROUP BY k ORDER BY k; -SELECT length(k) FROM hav GROUP BY k HAVING sum(v) > 2 ORDER BY k; - --- SPARK-28386: Resolve ORDER BY scalar function with/without HAVING clause, while the scalar function presents on SELECT list -SELECT length(k) FROM hav GROUP BY k ORDER BY length(k); -SELECT length(k) FROM hav GROUP BY k HAVING max(v) > 2 ORDER BY length(k); - --- SPARK-28386: Resolve ORDER BY scalar function with/without HAVING clause, while the scalar function does not present on SELECT list -SELECT k FROM hav GROUP BY k ORDER BY length(k); -SELECT k FROM hav GROUP BY k HAVING max(v) > 2 ORDER BY length(k); - --- SPARK-28386: Resolve ORDER BY agg function with/without HAVING clause, while the agg function presents on SELECT list -SELECT k, sum(v) FROM hav GROUP BY k ORDER BY sum(v); +-- SPARK-28386: Resolve ORDER BY agg function with HAVING clause, while the agg function presents on SELECT list SELECT k, sum(v) FROM hav GROUP BY k HAVING sum(v) > 2 ORDER BY sum(v); --- SPARK-28386: Resolve ORDER BY agg function with/without HAVING clause, while the agg function does not present on SELECT list -SELECT k, sum(v) FROM hav GROUP BY k ORDER BY avg(v); +-- SPARK-28386: Resolve ORDER BY agg function with HAVING clause, while the agg function does not present on SELECT list SELECT k, sum(v) FROM hav GROUP BY k HAVING sum(v) > 2 ORDER BY avg(v); diff --git a/sql/core/src/test/resources/sql-tests/results/having.sql.out b/sql/core/src/test/resources/sql-tests/results/having.sql.out index 913d4bf70c16a..c9d5886426364 100644 --- a/sql/core/src/test/resources/sql-tests/results/having.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/having.sql.out @@ -136,92 +136,6 @@ struct 1 --- !query -SELECT k FROM hav GROUP BY k ORDER BY k --- !query schema -struct --- !query output -one -three -two - - --- !query -SELECT k FROM hav GROUP BY k HAVING sum(v) > 2 ORDER BY k --- !query schema -struct --- !query output -one -three - - --- !query -SELECT length(k) FROM hav GROUP BY k ORDER BY k --- !query schema -struct --- !query output -3 -5 -3 - - --- !query -SELECT length(k) FROM hav GROUP BY k HAVING sum(v) > 2 ORDER BY k --- !query schema -struct --- !query output -3 -5 - - --- !query -SELECT length(k) FROM hav GROUP BY k ORDER BY length(k) --- !query schema -struct --- !query output -3 -3 -5 - - --- !query -SELECT length(k) FROM hav GROUP BY k HAVING max(v) > 2 ORDER BY length(k) --- !query schema -struct --- !query output -3 -5 - - --- !query -SELECT k FROM hav GROUP BY k ORDER BY length(k) --- !query schema -struct --- !query output -one -two -three - - --- !query -SELECT k FROM hav GROUP BY k HAVING max(v) > 2 ORDER BY length(k) --- !query schema -struct --- !query output -one -three - - --- !query -SELECT k, sum(v) FROM hav GROUP BY k ORDER BY sum(v) --- !query schema -struct --- !query output -two 2 -three 3 -one 6 - - -- !query SELECT k, sum(v) FROM hav GROUP BY k HAVING sum(v) > 2 ORDER BY sum(v) -- !query schema @@ -231,16 +145,6 @@ three 3 one 6 --- !query -SELECT k, sum(v) FROM hav GROUP BY k ORDER BY avg(v) --- !query schema -struct --- !query output -two 2 -one 6 -three 3 - - -- !query SELECT k, sum(v) FROM hav GROUP BY k HAVING sum(v) > 2 ORDER BY avg(v) -- !query schema