[SPARK-25401] [SQL] Reorder join predicates to match child outputOrdering #23267
Changes from all commits
```diff
@@ -208,7 +208,14 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] {
     children = withExchangeCoordinator(children, requiredChildDistributions)
 
     // Now that we've performed any necessary shuffles, add sorts to guarantee output orderings:
-    children = children.zip(requiredChildOrderings).map { case (child, requiredOrdering) =>
+    ensureOrdering(
+      reorderJoinPredicatesForOrdering(operator.withNewChildren(children))
+    )
+  }
+
+  private def ensureOrdering(operator: SparkPlan): SparkPlan = {
+    var children: Seq[SparkPlan] = operator.children
+    children = children.zip(operator.requiredChildOrdering).map { case (child, requiredOrdering) =>
       // If child.outputOrdering already satisfies the requiredOrdering, we do not need to sort.
       if (SortOrder.orderingSatisfies(child.outputOrdering, requiredOrdering)) {
         child
```
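The `SortOrder.orderingSatisfies` check above is what decides whether a sort can be skipped, and it is sensitive to the order of the keys. A rough, Spark-free approximation of that check (names invented; the real `SortOrder.satisfies` also accounts for sort direction, null ordering and equivalent expressions):

```scala
// Rough approximation only: the required ordering must be a per-position prefix of
// what the child already produces, so a child sorted by (a, b) does not satisfy a
// required ordering of (b, a) even though the same columns are involved.
def orderingSatisfies(childOrdering: Seq[String], required: Seq[String]): Boolean =
  required.isEmpty ||
    (required.length <= childOrdering.length &&
      required.zip(childOrdering).forall { case (r, c) => r == c })

assert(orderingSatisfies(Seq("a", "b"), Seq("a", "b")))   // already sorted: skip the sort
assert(!orderingSatisfies(Seq("a", "b"), Seq("b", "a")))  // same keys, wrong order: extra sort
```

This is why reordering the join predicates to match the child's output ordering can remove an otherwise redundant sort.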
```diff
@@ -243,24 +250,38 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] {
     (leftKeysBuffer, rightKeysBuffer)
   }
 
-  private def reorderJoinKeys(
+  private def reorderJoinKeys[A](
       leftKeys: Seq[Expression],
       rightKeys: Seq[Expression],
-      leftPartitioning: Partitioning,
-      rightPartitioning: Partitioning): (Seq[Expression], Seq[Expression]) = {
+      leftChildDist: A,
+      rightChildDist: A): (Seq[Expression], Seq[Expression]) = {
     if (leftKeys.forall(_.deterministic) && rightKeys.forall(_.deterministic)) {
-      leftPartitioning match {
+      leftChildDist match {
         case HashPartitioning(leftExpressions, _)
           if leftExpressions.length == leftKeys.length &&
             leftKeys.forall(x => leftExpressions.exists(_.semanticEquals(x))) =>
           reorder(leftKeys, rightKeys, leftExpressions, leftKeys)
 
-        case _ => rightPartitioning match {
+        case leftOrders: Seq[_]
+          if leftOrders.forall(_.isInstanceOf[Expression]) &&
+            leftOrders.length == leftKeys.length &&
+            leftKeys.forall { x =>
+              (leftOrders.map(_.asInstanceOf[Expression])).exists(_.semanticEquals(x))} =>
+          reorder(leftKeys, rightKeys, leftOrders.map(_.asInstanceOf[Expression]), leftKeys)
+
+        case _ => rightChildDist match {
           case HashPartitioning(rightExpressions, _)
             if rightExpressions.length == rightKeys.length &&
               rightKeys.forall(x => rightExpressions.exists(_.semanticEquals(x))) =>
            reorder(leftKeys, rightKeys, rightExpressions, rightKeys)
 
+          case rightOrders: Seq[_]
+            if rightOrders.forall(_.isInstanceOf[Expression]) &&
+              rightOrders.length == leftKeys.length &&
+              leftKeys.forall { x =>
+                (rightOrders.map(_.asInstanceOf[Expression])).exists(_.semanticEquals(x))} =>
+            reorder(leftKeys, rightKeys, rightOrders.map(_.asInstanceOf[Expression]), leftKeys)
+
           case _ => (leftKeys, rightKeys)
         }
       }
```
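The new `case leftOrders: Seq[_]` and `case rightOrders: Seq[_]` branches exist because the generalized `reorderJoinKeys[A]` can now receive either a `Partitioning` or a plain `Seq[Expression]` (a child's ordering expressions), and a `Seq`'s element type is erased at runtime. A standalone sketch of that dispatch pattern, with invented types (not Spark code):

```scala
// Illustration of dispatching on an argument that may be a specific case class or an
// erased Seq[_] whose element type must be re-checked at runtime.
sealed trait Expr { def semanticEquals(other: Expr): Boolean = this == other }
case class Col(name: String) extends Expr
case class HashPart(expressions: Seq[Expr])

def canReorder(dist: Any, keys: Seq[Expr]): Boolean = dist match {
  case HashPart(exprs)
      if exprs.length == keys.length &&
        keys.forall(k => exprs.exists(_.semanticEquals(k))) =>
    true
  // The element type of a Seq is erased, hence the explicit isInstanceOf check.
  case orders: Seq[_]
      if orders.forall(_.isInstanceOf[Expr]) &&
        orders.length == keys.length &&
        keys.forall(k => orders.map(_.asInstanceOf[Expr]).exists(_.semanticEquals(k))) =>
    true
  case _ => false
}

val keys = Seq(Col("b"), Col("a"))
assert(canReorder(HashPart(Seq(Col("a"), Col("b"))), keys))  // partitioning-shaped argument
assert(canReorder(Seq(Col("a"), Col("b")), keys))            // ordering (Seq) argument
```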
```diff
@@ -276,7 +297,7 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] {
    * introduced). This rule will change the ordering of the join keys to match with the
    * partitioning of the join nodes' children.
    */
-  private def reorderJoinPredicates(plan: SparkPlan): SparkPlan = {
+  private def reorderJoinPredicatesForPartitioning(plan: SparkPlan): SparkPlan = {
     plan match {
       case ShuffledHashJoinExec(leftKeys, rightKeys, joinType, buildSide, condition, left, right) =>
         val (reorderedLeftKeys, reorderedRightKeys) =
```

Member (on `reorderJoinPredicates`):
For historical reasons (#19257 (comment)), this method was added as a workaround, so I feel it is complicated to extend it for this case... Basically, IMO we need a more general logic here to cover this case and more. cc: @cloud-fan
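For intuition, here is a toy, Spark-free sketch of what the key reordering in these `reorderJoinPredicates*` methods accomplishes: given the join keys as written and a reference sequence (a child's partitioning or ordering expressions), both key lists are permuted together so the left keys follow the reference. The names and the simplified `==` comparison are assumptions; the real `reorder` helper works on `Expression`s and semantic equality.

```scala
// Toy version: permute paired left/right join keys to follow a reference order,
// falling back to the original order if the reference does not cover every key.
def reorderKeys[K](
    leftKeys: Seq[K],
    rightKeys: Seq[K],
    reference: Seq[K]): (Seq[K], Seq[K]) = {
  val pairs = leftKeys.zip(rightKeys)
  val reordered = reference.flatMap(ref => pairs.find(_._1 == ref))
  if (reordered.length == pairs.length) reordered.unzip else (leftKeys, rightKeys)
}

// Join written as "t1.b = t2.y AND t1.a = t2.x", child partitioned/sorted by (a, b):
val (l, r) = reorderKeys(Seq("b", "a"), Seq("y", "x"), Seq("a", "b"))
assert(l == Seq("a", "b") && r == Seq("x", "y"))
```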
```diff
@@ -293,6 +314,21 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] {
     }
   }
 
+  private def reorderJoinPredicatesForOrdering(plan: SparkPlan): SparkPlan = {
+    plan match {
+      case SortMergeJoinExec(leftKeys, rightKeys, joinType, condition, left, right) =>
+        val (reorderedLeftKeys, reorderedRightKeys) =
+          reorderJoinKeys(
+            leftKeys,
+            rightKeys,
+            left.outputOrdering.map(_.child),
+            right.outputOrdering.map(_.child))
+        SortMergeJoinExec(reorderedLeftKeys, reorderedRightKeys, joinType, condition, left, right)
+
+      case other => other
+    }
+  }
+
   def apply(plan: SparkPlan): SparkPlan = plan.transformUp {
     // TODO: remove this after we create a physical operator for `RepartitionByExpression`.
     case operator @ ShuffleExchangeExec(upper: HashPartitioning, child, _) =>
```

Contributor (on `reorderJoinPredicatesForOrdering`):
I think we can avoid this and include this transformation in the former.

Contributor (author):
I am not sure this would work. The point here was to first reorder the join predicates for partitioning, then check the child outputPartitioning (which happens in ensureDistributionAndOrdering) and decide whether we need an Exchange, and only AFTER that reorder the join predicates again to satisfy the child outputOrdering to avoid an Exchange.
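To make the sequencing described in that reply concrete, here is a Spark-free toy model (all types and names invented; a single join child; keys as strings) of the intended order of operations: reorder for partitioning, add an exchange if still needed, reorder for ordering, add a sort if still needed.

```scala
case class Child(partitioning: Seq[String], ordering: Seq[String])
case class Join(
    keys: Seq[String],
    child: Child,
    exchanged: Boolean = false,
    sorted: Boolean = false)

// Keys can be aligned with a reference when they are the same set of columns.
def sameColumns(a: Seq[String], b: Seq[String]): Boolean =
  a.length == b.length && a.toSet == b.toSet

def reorderForPartitioning(j: Join): Join =
  if (sameColumns(j.keys, j.child.partitioning)) j.copy(keys = j.child.partitioning) else j

def ensureDistribution(j: Join): Join =
  if (j.keys == j.child.partitioning) j
  else j.copy(child = Child(j.keys, Seq.empty), exchanged = true) // a shuffle drops the ordering

def reorderForOrdering(j: Join): Join =
  if (sameColumns(j.keys, j.child.ordering)) j.copy(keys = j.child.ordering) else j

def ensureOrdering(j: Join): Join =
  if (j.keys == j.child.ordering) j else j.copy(sorted = true)

val plan = (reorderForPartitioning _)
  .andThen(ensureDistribution)
  .andThen(reorderForOrdering)
  .andThen(ensureOrdering)

// Keys written as (b, a) against a child partitioned and sorted by (a, b):
val planned = plan(Join(Seq("b", "a"), Child(Seq("a", "b"), Seq("a", "b"))))
assert(!planned.exchanged && !planned.sorted) // neither an Exchange nor a Sort is needed
```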
```diff
@@ -301,6 +337,6 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] {
         case _ => operator
       }
     case operator: SparkPlan =>
-      ensureDistributionAndOrdering(reorderJoinPredicates(operator))
+      ensureDistributionAndOrdering(reorderJoinPredicatesForPartitioning(operator))
   }
 }
```
And please add a UT which fails before this correction and passes after it.
Should this UT test the reorderJoinKeys function? Or do you have something else in mind?
I meant a test like the one you added. But please, first prove that the current solution is fine (I doubt it is; see #23267 (comment)). Once we ensure that the current change is safe, you can go ahead and address these comments. Thanks.