-
Notifications
You must be signed in to change notification settings - Fork 28.9k
[SPARK-14471][SQL] Aliases in SELECT could be used in GROUP BY #17191
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
c77fc72
e2c3e5f
9332c07
55ae8aa
1724826
cd28da0
90df0b4
95e4361
db5979f
a594d2b
bba8521
6c3c5fa
110ec5e
f3a31af
620341a
658cf83
dc6ca68
7b32f46
1340862
65f6e7c
86402b0
0ae48d8
d3071fa
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -136,6 +136,7 @@ class Analyzer( | |
| ResolveGroupingAnalytics :: | ||
| ResolvePivot :: | ||
| ResolveOrdinalInOrderByAndGroupBy :: | ||
| ResolveAggAliasInGroupBy :: | ||
| ResolveMissingReferences :: | ||
| ExtractGenerator :: | ||
| ResolveGenerate :: | ||
|
|
@@ -172,7 +173,7 @@ class Analyzer( | |
| * Analyze cte definitions and substitute child plan with analyzed cte definitions. | ||
| */ | ||
| object CTESubstitution extends Rule[LogicalPlan] { | ||
| def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { | ||
| def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { | ||
| case With(child, relations) => | ||
| substituteCTE(child, relations.foldLeft(Seq.empty[(String, LogicalPlan)]) { | ||
| case (resolved, (name, relation)) => | ||
|
|
@@ -200,7 +201,7 @@ class Analyzer( | |
| * Substitute child plan with WindowSpecDefinitions. | ||
| */ | ||
| object WindowsSubstitution extends Rule[LogicalPlan] { | ||
| def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { | ||
| def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { | ||
| // Lookup WindowSpecDefinitions. This rule works with unresolved children. | ||
| case WithWindowDefinition(windowDefinitions, child) => | ||
| child.transform { | ||
|
|
@@ -242,7 +243,7 @@ class Analyzer( | |
| private def hasUnresolvedAlias(exprs: Seq[NamedExpression]) = | ||
| exprs.exists(_.find(_.isInstanceOf[UnresolvedAlias]).isDefined) | ||
|
|
||
| def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { | ||
| def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { | ||
| case Aggregate(groups, aggs, child) if child.resolved && hasUnresolvedAlias(aggs) => | ||
| Aggregate(groups, assignAliases(aggs), child) | ||
|
|
||
|
|
@@ -614,7 +615,7 @@ class Analyzer( | |
| case _ => plan | ||
| } | ||
|
|
||
| def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { | ||
| def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { | ||
| case i @ InsertIntoTable(u: UnresolvedRelation, parts, child, _, _) if child.resolved => | ||
| EliminateSubqueryAliases(lookupTableFromCatalog(u)) match { | ||
| case v: View => | ||
|
|
@@ -786,7 +787,7 @@ class Analyzer( | |
| } | ||
| } | ||
|
|
||
| def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { | ||
| def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { | ||
| case p: LogicalPlan if !p.childrenResolved => p | ||
|
|
||
| // If the projection list contains Stars, expand it. | ||
|
|
@@ -844,11 +845,10 @@ class Analyzer( | |
|
|
||
| case q: LogicalPlan => | ||
| logTrace(s"Attempting to resolve ${q.simpleString}") | ||
| q transformExpressionsUp { | ||
| q.transformExpressionsUp { | ||
| case u @ UnresolvedAttribute(nameParts) => | ||
| // Leave unchanged if resolution fails. Hopefully will be resolved next round. | ||
| val result = | ||
| withPosition(u) { q.resolveChildren(nameParts, resolver).getOrElse(u) } | ||
| // Leave unchanged if resolution fails. Hopefully will be resolved next round. | ||
| val result = withPosition(u) { q.resolveChildren(nameParts, resolver).getOrElse(u) } | ||
|
||
| logDebug(s"Resolving $u to $result") | ||
| result | ||
| case UnresolvedExtractValue(child, fieldExpr) if child.resolved => | ||
|
|
@@ -961,7 +961,7 @@ class Analyzer( | |
| * have no effect on the results. | ||
| */ | ||
| object ResolveOrdinalInOrderByAndGroupBy extends Rule[LogicalPlan] { | ||
| def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { | ||
| def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { | ||
| case p if !p.childrenResolved => p | ||
| // Replace the index with the related attribute for ORDER BY, | ||
| // which is a 1-base position of the projection list. | ||
|
|
@@ -997,6 +997,27 @@ class Analyzer( | |
| } | ||
| } | ||
|
|
||
| /** | ||
| * Replace unresolved expressions in grouping keys with resolved ones in SELECT clauses. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add a comment to say that this rule has to be run after
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok, I'll update |
||
| * This rule is expected to run after [[ResolveReferences]] has been applied. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we can remove this now. With the new check, the order doesn't matter
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok |
||
| */ | ||
| object ResolveAggAliasInGroupBy extends Rule[LogicalPlan] { | ||
|
|
||
| override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { | ||
| case agg @ Aggregate(groups, aggs, child) | ||
| if conf.groupByAliases && child.resolved && aggs.forall(_.resolved) && | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not very confident about this condition, we may mistakenly resolve grouping expression by aggregate list while it should be resolved by child output. one example is the star. If the aggregate list contains a star, then we will expand the star in cc @gatorsmile
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How about this code for the check? https://github.com/apache/spark/pull/17191/files#diff-57b3d87be744b7d79a9beacf8e5e5eb2R1011
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think an
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not 100% sure, but since the resolution batch currently contains this rule, it seems the rule is applied first to the unresolved grouping keys in the star case @cloud-fan suggested.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I got debug info and it seemed
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok I have an idea: first, check there is
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is the latest fix not enough for your suggestion?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yea it works, but maybe give it a better name like
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok |
||
| groups.exists(_.isInstanceOf[UnresolvedAttribute]) => | ||
| // This is a strict check, but it ensures that the rule is applied only to alias expressions | ||
| def notResolvableByChild(attrName: String): Boolean = | ||
| !child.output.exists(a => resolver(a.name, attrName)) | ||
| agg.copy(groupingExpressions = groups.map { | ||
| case u: UnresolvedAttribute if notResolvableByChild(u.name) => | ||
| aggs.find(ne => resolver(ne.name, u.name)).getOrElse(u) | ||
| case e => e | ||
| }) | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * In many dialects of SQL it is valid to sort by attributes that are not present in the SELECT | ||
| * clause. This rule detects such queries and adds the required attributes to the original | ||
|
|
@@ -1006,7 +1027,7 @@ class Analyzer( | |
| * The HAVING clause could also use a grouping column that is not present in the SELECT. | ||
| */ | ||
| object ResolveMissingReferences extends Rule[LogicalPlan] { | ||
| def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { | ||
| def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { | ||
| // Skip sort with aggregate. This will be handled in ResolveAggregateFunctions | ||
| case sa @ Sort(_, _, child: Aggregate) => sa | ||
|
|
||
|
|
@@ -1130,7 +1151,7 @@ class Analyzer( | |
| * Replaces [[UnresolvedFunction]]s with concrete [[Expression]]s. | ||
| */ | ||
| object ResolveFunctions extends Rule[LogicalPlan] { | ||
| def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { | ||
| def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { | ||
| case q: LogicalPlan => | ||
| q transformExpressions { | ||
| case u if !u.childrenResolved => u // Skip until children are resolved. | ||
|
|
@@ -1469,7 +1490,7 @@ class Analyzer( | |
| /** | ||
| * Resolve and rewrite all subqueries in an operator tree. | ||
| */ | ||
| def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { | ||
| def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { | ||
| // In case of HAVING (a filter after an aggregate) we use both the aggregate and | ||
| // its child for resolution. | ||
| case f @ Filter(_, a: Aggregate) if f.childrenResolved => | ||
|
|
@@ -1484,7 +1505,7 @@ class Analyzer( | |
| * Turns projections that contain aggregate expressions into aggregations. | ||
| */ | ||
| object GlobalAggregates extends Rule[LogicalPlan] { | ||
| def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { | ||
| def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { | ||
| case Project(projectList, child) if containsAggregates(projectList) => | ||
| Aggregate(Nil, projectList, child) | ||
| } | ||
|
|
@@ -1510,7 +1531,7 @@ class Analyzer( | |
| * underlying aggregate operator and then projected away after the original operator. | ||
| */ | ||
| object ResolveAggregateFunctions extends Rule[LogicalPlan] { | ||
| def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { | ||
| def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { | ||
| case filter @ Filter(havingCondition, | ||
| aggregate @ Aggregate(grouping, originalAggExprs, child)) | ||
| if aggregate.resolved => | ||
|
|
@@ -1682,7 +1703,7 @@ class Analyzer( | |
| } | ||
| } | ||
|
|
||
| def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { | ||
| def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { | ||
| case Project(projectList, _) if projectList.exists(hasNestedGenerator) => | ||
| val nestedGenerator = projectList.find(hasNestedGenerator).get | ||
| throw new AnalysisException("Generators are not supported when it's nested in " + | ||
|
|
@@ -1740,7 +1761,7 @@ class Analyzer( | |
| * that wrap the [[Generator]]. | ||
| */ | ||
| object ResolveGenerate extends Rule[LogicalPlan] { | ||
| def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { | ||
| def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { | ||
| case g: Generate if !g.child.resolved || !g.generator.resolved => g | ||
| case g: Generate if !g.resolved => | ||
| g.copy(generatorOutput = makeGeneratorOutput(g.generator, g.generatorOutput.map(_.name))) | ||
|
|
@@ -2057,7 +2078,7 @@ class Analyzer( | |
| * put them into an inner Project and finally project them away at the outer Project. | ||
| */ | ||
| object PullOutNondeterministic extends Rule[LogicalPlan] { | ||
| override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { | ||
| override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { | ||
| case p if !p.resolved => p // Skip unresolved nodes. | ||
| case p: Project => p | ||
| case f: Filter => f | ||
|
|
@@ -2102,7 +2123,7 @@ class Analyzer( | |
| * and we should return null if the input is null. | ||
| */ | ||
| object HandleNullInputsForUDF extends Rule[LogicalPlan] { | ||
| override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { | ||
| override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { | ||
| case p if !p.resolved => p // Skip unresolved nodes. | ||
|
|
||
| case p => p transformExpressionsUp { | ||
|
|
@@ -2167,7 +2188,7 @@ class Analyzer( | |
| * Then apply a Project on a normal Join to eliminate natural or using join. | ||
| */ | ||
| object ResolveNaturalAndUsingJoin extends Rule[LogicalPlan] { | ||
| override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { | ||
| override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { | ||
| case j @ Join(left, right, UsingJoin(joinType, usingCols), condition) | ||
| if left.resolved && right.resolved && j.duplicateResolved => | ||
| commonNaturalJoinProcessing(left, right, joinType, usingCols, None) | ||
|
|
@@ -2232,7 +2253,7 @@ class Analyzer( | |
| * to the given input attributes. | ||
| */ | ||
| object ResolveDeserializer extends Rule[LogicalPlan] { | ||
| def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { | ||
| def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { | ||
| case p if !p.childrenResolved => p | ||
| case p if p.resolved => p | ||
|
|
||
|
|
@@ -2318,7 +2339,7 @@ class Analyzer( | |
| * constructed is an inner class. | ||
| */ | ||
| object ResolveNewInstance extends Rule[LogicalPlan] { | ||
| def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { | ||
| def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { | ||
| case p if !p.childrenResolved => p | ||
| case p if p.resolved => p | ||
|
|
||
|
|
@@ -2352,7 +2373,7 @@ class Analyzer( | |
| "type of the field in the target object") | ||
| } | ||
|
|
||
| def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { | ||
| def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { | ||
| case p if !p.childrenResolved => p | ||
| case p if p.resolved => p | ||
|
|
||
|
|
@@ -2406,7 +2427,7 @@ object CleanupAliases extends Rule[LogicalPlan] { | |
| case other => trimAliases(other) | ||
| } | ||
|
|
||
| override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { | ||
| override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { | ||
| case Project(projectList, child) => | ||
| val cleanedProjectList = | ||
| projectList.map(trimNonTopLevelAliases(_).asInstanceOf[NamedExpression]) | ||
|
|
@@ -2474,7 +2495,7 @@ object TimeWindowing extends Rule[LogicalPlan] { | |
| * @return the logical plan that will generate the time windows using the Expand operator, with | ||
| * the Filter operator for correctness and Project for usability. | ||
| */ | ||
| def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { | ||
| def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { | ||
| case p: LogicalPlan if p.children.size == 1 => | ||
| val child = p.children.head | ||
| val windowExpressions = | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -35,3 +35,21 @@ FROM testData; | |
|
|
||
| -- Aggregate with foldable input and multiple distinct groups. | ||
| SELECT COUNT(DISTINCT b), COUNT(DISTINCT b, c) FROM (SELECT 1 AS a, 2 AS b, 3 AS c) GROUP BY a; | ||
|
|
||
| -- Aliases in SELECT could be used in GROUP BY | ||
| SELECT a AS k, COUNT(b) FROM testData GROUP BY k; | ||
| SELECT a AS k, COUNT(b) FROM testData GROUP BY k HAVING k > 1; | ||
|
|
||
| -- Aggregate functions cannot be used in GROUP BY | ||
| SELECT COUNT(b) AS k FROM testData GROUP BY k; | ||
|
|
||
| -- Test data. | ||
| CREATE OR REPLACE TEMPORARY VIEW testDataHasSameNameWithAlias AS SELECT * FROM VALUES | ||
| (1, 1, 3), (1, 2, 1) AS testDataHasSameNameWithAlias(k, a, v); | ||
| SELECT k AS a, COUNT(v) FROM testDataHasSameNameWithAlias GROUP BY a; | ||
|
|
||
| -- turn off group by aliases | ||
| set spark.sql.groupByAliases=false; | ||
|
|
||
| -- Check analysis exceptions | ||
| SELECT a AS k, COUNT(b) FROM testData GROUP BY k; | ||
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
is it safer to put it in an individual batch after the
`resolution` batch? Ideally we should only run this rule if we make sure there is no other way to resolve the grouping expressions except this rule. cc @gatorsmile There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
One idea to put this rule outside
`resolution` batch is to stop checking grouping expression resolution in `Aggregate.resolved`. But I feel this is a bit unsafe. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@gatorsmile ping
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
we have a
`postHocResolutionRules` Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
aha, ok. I'll move there and check. Thanks!