-
Notifications
You must be signed in to change notification settings - Fork 28.9k
[SPARK-32649][SQL] Optimize BHJ/SHJ inner/semi join with empty hashed relation #29484
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
0dc1cb2
bee7e1e
5c6e9ce
9f37032
ac5d34d
c82d835
ddd30e0
1f328a7
a7cdf6e
496eda7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,57 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.sql.execution.adaptive | ||
|
|
||
| import org.apache.spark.sql.catalyst.planning.ExtractSingleColumnNullAwareAntiJoin | ||
| import org.apache.spark.sql.catalyst.plans.{Inner, LeftSemi} | ||
| import org.apache.spark.sql.catalyst.plans.logical.{Join, LocalRelation, LogicalPlan} | ||
| import org.apache.spark.sql.catalyst.rules.Rule | ||
| import org.apache.spark.sql.execution.joins.{EmptyHashedRelation, HashedRelation, HashedRelationWithAllNullKeys} | ||
|
|
||
| /** | ||
| * This optimization rule detects and converts a Join to an empty [[LocalRelation]]: | ||
| * 1. Join is single column NULL-aware anti join (NAAJ), and broadcasted [[HashedRelation]] | ||
| * is [[HashedRelationWithAllNullKeys]]. | ||
| * | ||
| * 2. Join is inner or left semi join, and broadcasted [[HashedRelation]] | ||
| * is [[EmptyHashedRelation]]. | ||
| * This applies to all Joins (sort merge join, shuffled hash join, and broadcast hash join), | ||
| * because sort merge join and shuffled hash join will be changed to broadcast hash join with AQE | ||
| * at the first place. | ||
| */ | ||
| object EliminateJoinToEmptyRelation extends Rule[LogicalPlan] { | ||
|
|
||
| /** | ||
| * Returns true iff `plan` is a [[LogicalQueryStage]] wrapping a [[BroadcastQueryStageExec]] | ||
| * that has already materialized (its `resultOption` is defined) and whose built broadcast | ||
| * [[HashedRelation]] is exactly the given `relation` singleton. | ||
| */ | ||
| private def canEliminate(plan: LogicalPlan, relation: HashedRelation): Boolean = plan match { | ||
| case LogicalQueryStage(_, stage: BroadcastQueryStageExec) if stage.resultOption.get().isDefined | ||
| && stage.broadcast.relationFuture.get().value == relation => true | ||
| case _ => false | ||
| } | ||
|
|
||
| // Rewrites eligible joins into empty [[LocalRelation]]s, preserving the join's output | ||
| // attributes and its streaming flag. | ||
| def apply(plan: LogicalPlan): LogicalPlan = plan.transformDown { | ||
| // Case 1 (see class doc): single-column null-aware anti join whose broadcast side is | ||
| // HashedRelationWithAllNullKeys always produces an empty result. | ||
| case j @ ExtractSingleColumnNullAwareAntiJoin(_, _) | ||
| if canEliminate(j.right, HashedRelationWithAllNullKeys) => | ||
| LocalRelation(j.output, data = Seq.empty, isStreaming = j.isStreaming) | ||
|
|
||
| // Case 2: inner join — an empty broadcast relation on either side makes the join empty. | ||
| case j @ Join(_, _, Inner, _, _) if canEliminate(j.left, EmptyHashedRelation) || | ||
| canEliminate(j.right, EmptyHashedRelation) => | ||
| LocalRelation(j.output, data = Seq.empty, isStreaming = j.isStreaming) | ||
|
|
||
| // Case 2 (cont.): left semi join — only an empty *right* side guarantees an empty | ||
| // result, so the left side is deliberately not checked here. | ||
| case j @ Join(_, _, LeftSemi, _, _) if canEliminate(j.right, EmptyHashedRelation) => | ||
| LocalRelation(j.output, data = Seq.empty, isStreaming = j.isStreaming) | ||
| } | ||
| } |
This file was deleted.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -226,7 +226,8 @@ class AdaptiveQueryExecSuite | |
| val df1 = spark.range(10).withColumn("a", 'id) | ||
| val df2 = spark.range(10).withColumn("b", 'id) | ||
| withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { | ||
| val testDf = df1.where('a > 10).join(df2.where('b > 10), "id").groupBy('a).count() | ||
| val testDf = df1.where('a > 10).join(df2.where('b > 10), Seq("id"), "left_outer") | ||
| .groupBy('a).count() | ||
| checkAnswer(testDf, Seq()) | ||
| val plan = testDf.queryExecution.executedPlan | ||
| assert(find(plan)(_.isInstanceOf[SortMergeJoinExec]).isDefined) | ||
|
|
@@ -238,7 +239,8 @@ class AdaptiveQueryExecSuite | |
| } | ||
|
|
||
| withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "1") { | ||
| val testDf = df1.where('a > 10).join(df2.where('b > 10), "id").groupBy('a).count() | ||
| val testDf = df1.where('a > 10).join(df2.where('b > 10), Seq("id"), "left_outer") | ||
| .groupBy('a).count() | ||
| checkAnswer(testDf, Seq()) | ||
| val plan = testDf.queryExecution.executedPlan | ||
| assert(find(plan)(_.isInstanceOf[BroadcastHashJoinExec]).isDefined) | ||
|
|
@@ -1181,4 +1183,26 @@ class AdaptiveQueryExecSuite | |
| checkNumLocalShuffleReaders(adaptivePlan) | ||
| } | ||
| } | ||
|
|
||
| test("SPARK-32649: Eliminate inner and semi join to empty relation") { | ||
| withSQLConf( | ||
| SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true", | ||
| SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "80") { | ||
| Seq( | ||
| // inner join (small table at right side) | ||
| "SELECT * FROM testData t1 join testData3 t2 ON t1.key = t2.a WHERE t2.b = 1", | ||
| // inner join (small table at left side) | ||
| "SELECT * FROM testData3 t1 join testData t2 ON t1.a = t2.key WHERE t1.b = 1", | ||
| // left semi join | ||
| "SELECT * FROM testData t1 left semi join testData3 t2 ON t1.key = t2.a AND t2.b = 1" | ||
| ).foreach(query => { | ||
| val (plan, adaptivePlan) = runAdaptiveAndVerifyResult(query) | ||
| val smj = findTopLevelSortMergeJoin(plan) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. SortMergeJoin? I think this targets BHJ and SHJ?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @viirya - similar to other test cases in this file - the input data stats is super large, and by default it uses SMJ. Per my comment above, SMJ/BHJ/SHJ will all turn into empty |
||
| assert(smj.size == 1) | ||
| val join = findTopLevelBaseJoin(adaptivePlan) | ||
| assert(join.isEmpty) | ||
| checkNumLocalShuffleReaders(adaptivePlan) | ||
| }) | ||
| } | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@cloud-fan - just fyi. the change in this unit test is needed as
assert(find(plan)(_.isInstanceOf[BroadcastHashJoinExec]).isDefined) is no longer true, because this is an inner join and the build side is empty. So with the change in this PR, the join operator is optimized into an empty relation operator (failure stack trace of unit test without change is here). Changed from inner join to left outer join, to help unit test pass. And I don't think changing from inner join to left outer join here can compromise any functionality of the original unit test. Let me know if it's not the case. thanks.
assert(find(plan)(_.isInstanceOf[BroadcastHashJoinExec]).isDefined)no long true, because this is an inner join and the build side is empty. So with the change in this PR, the join operator is optimized into an empty relation operator (failure stack trace of unit test without change is here).Changed from inner join to left outer join, to help unit test pass. And I don't think changing from inner join to left outer join here can comprise any functionality of original unit test. Let me know if it's not the case. thanks.