Skip to content

Commit dc1db95

Browse files
committed
[SPARK-36867][SQL] Fix error message with GROUP BY alias
### What changes were proposed in this pull request?

When checking for unresolved attributes, we should check `Aggregate.aggregateExpressions` before `Aggregate.groupingExpressions`, because the latter may rely on the former due to the GROUP BY alias feature.

### Why are the changes needed?

To improve the error message. For example, `SELECT a AS k, COUNT(non_existing) FROM testData GROUP BY k` should report that column `non_existing` does not exist.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

A new test was added to `group-by.sql`.

Closes #34244 from cloud-fan/bug.

Authored-by: Wenchen Fan <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
1 parent a453fd5 commit dc1db95

File tree

3 files changed

+30
-12
lines changed

3 files changed

+30
-12
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,14 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog {
165165
}
166166
}
167167

168-
operator transformExpressionsUp {
168+
val exprs = operator match {
169+
// `groupingExpressions` may rely on `aggregateExpressions`, due to the GROUP BY alias
170+
// feature. We should check errors in `aggregateExpressions` first.
171+
case a: Aggregate => a.aggregateExpressions ++ a.groupingExpressions
172+
case _ => operator.expressions
173+
}
174+
175+
exprs.foreach(_.foreachUp {
169176
case a: Attribute if !a.resolved =>
170177
val missingCol = a.sql
171178
val candidates = operator.inputSet.toSeq.map(_.qualifiedName)
@@ -209,27 +216,26 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog {
209216
failAnalysis(s"${wf.prettyName} function can only be evaluated in an ordered " +
210217
s"row-based window frame with a single offset: $w")
211218

212-
case w @ WindowExpression(e, s) =>
219+
case w: WindowExpression =>
213220
// Only allow window functions with an aggregate expression or an offset window
214221
// function or a Pandas window UDF.
215-
e match {
222+
w.windowFunction match {
216223
case _: AggregateExpression | _: FrameLessOffsetWindowFunction |
217-
_: AggregateWindowFunction =>
218-
w
219-
case f: PythonUDF if PythonUDF.isWindowPandasUDF(f) =>
220-
w
221-
case _ =>
222-
failAnalysis(s"Expression '$e' not supported within a window function.")
224+
_: AggregateWindowFunction => // OK
225+
case f: PythonUDF if PythonUDF.isWindowPandasUDF(f) => // OK
226+
case other =>
227+
failAnalysis(s"Expression '$other' not supported within a window function.")
223228
}
224229

225230
case s: SubqueryExpression =>
226231
checkSubqueryExpression(operator, s)
227-
s
228232

229233
case e: ExpressionWithRandomSeed if !e.seedExpression.foldable =>
230234
failAnalysis(
231235
s"Input argument to ${e.prettyName} must be a constant.")
232-
}
236+
237+
case _ =>
238+
})
233239

234240
operator match {
235241
case etw: EventTimeWatermark =>

sql/core/src/test/resources/sql-tests/inputs/group-by.sql

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,9 @@ SELECT COUNT(DISTINCT b), COUNT(DISTINCT b, c) FROM (SELECT 1 AS a, 2 AS b, 3 AS
4545
SELECT a AS k, COUNT(b) FROM testData GROUP BY k;
4646
SELECT a AS k, COUNT(b) FROM testData GROUP BY k HAVING k > 1;
4747

48+
-- GROUP BY alias with invalid col in SELECT list
49+
SELECT a AS k, COUNT(non_existing) FROM testData GROUP BY k;
50+
4851
-- Aggregate functions cannot be used in GROUP BY
4952
SELECT COUNT(b) AS k FROM testData GROUP BY k;
5053

sql/core/src/test/resources/sql-tests/results/group-by.sql.out

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
-- Automatically generated by SQLQueryTestSuite
2-
-- Number of queries: 64
2+
-- Number of queries: 65
33

44

55
-- !query
@@ -161,6 +161,15 @@ struct<k:int,count(b):bigint>
161161
3 2
162162

163163

164+
-- !query
165+
SELECT a AS k, COUNT(non_existing) FROM testData GROUP BY k
166+
-- !query schema
167+
struct<>
168+
-- !query output
169+
org.apache.spark.sql.AnalysisException
170+
Column 'non_existing' does not exist. Did you mean one of the following? [testdata.a, testdata.b]; line 1 pos 21
171+
172+
164173
-- !query
165174
SELECT COUNT(b) AS k FROM testData GROUP BY k
166175
-- !query schema

0 commit comments

Comments
 (0)