-
Notifications
You must be signed in to change notification settings - Fork 28.9k
[SPARK-39819][SQL] DS V2 aggregate push down can work with Top N or Paging (Sort with group column) #37238
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-39819][SQL] DS V2 aggregate push down can work with Top N or Paging (Sort with group column) #37238
Changes from all commits
0a49fbe
ae0940d
6563aff
eedf418
55a9657
ad36dc1
39e109a
eeba4e7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -400,6 +400,13 @@ object V2ScanRelationPushDown extends Rule[LogicalPlan] with PredicateHelper { | |
| } | ||
| } | ||
|
|
||
| private def findGroupColumn(alias: Alias): Option[AttributeReference] = alias match { | ||
| case alias @ Alias(attr: AttributeReference, name) if attr.name.startsWith("group_col_") => | ||
| Some(AttributeReference(name, attr.dataType)(alias.exprId)) | ||
| case Alias(alias: Alias, _) => findGroupColumn(alias) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I feel it's a bit hacky to assume the Alias contains the actual grouping columns. How about we generate the name mapping (grouping attribute to actual group column name) during agg pushdown, put the name mapping in
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK |
||
| case _ => None | ||
| } | ||
|
|
||
| private def pushDownLimit(plan: LogicalPlan, limit: Int): (LogicalPlan, Boolean) = plan match { | ||
| case operation @ ScanOperation(_, filter, sHolder: ScanBuilderHolder) if filter.isEmpty => | ||
| val (isPushed, isPartiallyPushed) = PushDownUtils.pushLimit(sHolder.builder, limit) | ||
|
|
@@ -410,12 +417,30 @@ object V2ScanRelationPushDown extends Rule[LogicalPlan] with PredicateHelper { | |
| case s @ Sort(order, _, operation @ ScanOperation(project, filter, sHolder: ScanBuilderHolder)) | ||
| // Without building the Scan, we do not know the resulting column names after aggregate | ||
| // push-down, and thus can't push down Top-N which needs to know the ordering column names. | ||
| // TODO: we can support simple cases like GROUP BY columns directly and ORDER BY the same | ||
| // columns, which we know the resulting column names: the original table columns. | ||
| if sHolder.pushedAggregate.isEmpty && filter.isEmpty && | ||
| // In particular, we push down the simple cases like GROUP BY columns directly and ORDER BY | ||
| // the same columns, which we know the resulting column names: the original table columns. | ||
| // TODO support push down Aggregate with ORDER BY expressions. | ||
| if filter.isEmpty && | ||
| CollapseProject.canCollapseExpressions(order, project, alwaysInline = true) => | ||
| val aliasMap = getAliasMap(project) | ||
| val newOrder = order.map(replaceAlias(_, aliasMap)).asInstanceOf[Seq[SortOrder]] | ||
|
|
||
| def findGroupColForSortOrder(sortOrder: SortOrder): Option[SortOrder] = sortOrder match { | ||
| case SortOrder(attr: AttributeReference, direction, nullOrdering, sameOrderExpressions) => | ||
| findGroupColumn(aliasMap(attr)).filter { groupCol => | ||
| sHolder.relation.output.exists(out => out.semanticEquals(groupCol)) | ||
| }.map(SortOrder(_, direction, nullOrdering, sameOrderExpressions)) | ||
| case _ => None | ||
| } | ||
|
|
||
| val newOrder = if (sHolder.pushedAggregate.isDefined) { | ||
| val orderByGroupCols = order.flatMap(findGroupColForSortOrder) | ||
| if (orderByGroupCols.length != order.length) { | ||
| return (s, false) | ||
| } | ||
| orderByGroupCols | ||
| } else { | ||
| order.map(replaceAlias(_, aliasMap)).asInstanceOf[Seq[SortOrder]] | ||
| } | ||
| val normalizedOrders = DataSourceStrategy.normalizeExprs( | ||
| newOrder, sHolder.relation.output).asInstanceOf[Seq[SortOrder]] | ||
| val orders = DataSourceStrategy.translateSortOrders(normalizedOrders) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Per
SortOrder(attr: AttributeReference..., it's alwaysAttributeReference. Should it addressAlias?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes. If users specify an
Aliasfor group column. It will beAlias(alias: Alias, _).