-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-12594] [SQL] Outer Join Elimination by Filter Conditions #10567
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
01e4cdf
6835704
9180687
b38a21e
d2b84af
fda8025
ac0dccd
6e0018b
0546772
b37a64f
c2a872c
ab6dbd7
4276356
2dab708
0458770
1debdfa
763706d
4de6ec1
9422a4f
52bdf48
1e95df3
fab24cf
8b2e33b
2ee1876
b9f0090
ade6f7e
9fd63d2
a5f1c49
a61272a
aadb8e3
75eface
618f128
6977fdf
cc0262c
82357e0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,8 +21,8 @@ import scala.collection.immutable.HashSet | |
|
|
||
| import org.apache.spark.sql.catalyst.analysis.{CleanupAliases, EliminateSubQueries} | ||
| import org.apache.spark.sql.catalyst.expressions._ | ||
| import org.apache.spark.sql.catalyst.expressions.Literal.{FalseLiteral, TrueLiteral} | ||
| import org.apache.spark.sql.catalyst.expressions.aggregate._ | ||
| import org.apache.spark.sql.catalyst.expressions.Literal.{FalseLiteral, TrueLiteral} | ||
| import org.apache.spark.sql.catalyst.planning.{ExtractFiltersAndInnerJoins, Unions} | ||
| import org.apache.spark.sql.catalyst.plans._ | ||
| import org.apache.spark.sql.catalyst.plans.logical._ | ||
|
|
@@ -62,6 +62,7 @@ abstract class Optimizer extends RuleExecutor[LogicalPlan] { | |
| SetOperationPushDown, | ||
| SamplePushDown, | ||
| ReorderJoin, | ||
| OuterJoinElimination, | ||
| PushPredicateThroughJoin, | ||
| PushPredicateThroughProject, | ||
| PushPredicateThroughGenerate, | ||
|
|
@@ -931,6 +932,62 @@ object ReorderJoin extends Rule[LogicalPlan] with PredicateHelper { | |
| } | ||
| } | ||
|
|
||
| /** | ||
| * Elimination of outer joins, if the predicates can restrict the result sets so that | ||
| * all null-supplying rows are eliminated | ||
| * | ||
| * - full outer -> inner if both sides have such predicates | ||
| * - left outer -> inner if the right side has such predicates | ||
| * - right outer -> inner if the left side has such predicates | ||
| * - full outer -> left outer if only the left side has such predicates | ||
| * - full outer -> right outer if only the right side has such predicates | ||
| * | ||
| * This rule should be executed before pushing down the Filter | ||
| */ | ||
| object OuterJoinElimination extends Rule[LogicalPlan] with PredicateHelper { | ||
|
|
||
| /** | ||
| * Returns whether the expression returns null or false when all inputs are nulls. | ||
| */ | ||
| private def canFilterOutNull(e: Expression): Boolean = { | ||
| if (!e.deterministic) return false | ||
| val attributes = e.references.toSeq | ||
| val emptyRow = new GenericInternalRow(attributes.length) | ||
| val v = BindReferences.bindReference(e, attributes).eval(emptyRow) | ||
| v == null || v == false | ||
| } | ||
|
|
||
| private def buildNewJoinType(filter: Filter, join: Join): JoinType = { | ||
| val splitConjunctiveConditions: Seq[Expression] = splitConjunctivePredicates(filter.condition) | ||
| val leftConditions = splitConjunctiveConditions | ||
| .filter(_.references.subsetOf(join.left.outputSet)) | ||
| val rightConditions = splitConjunctiveConditions | ||
| .filter(_.references.subsetOf(join.right.outputSet)) | ||
|
|
||
| val leftHasNonNullPredicate = leftConditions.exists(canFilterOutNull) || | ||
| filter.constraints.filter(_.isInstanceOf[IsNotNull]) | ||
| .exists(expr => join.left.outputSet.intersect(expr.references).nonEmpty) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is not correct, IsNotNull(Add(A, B)) does not mean both A and B should be not null. I think we don't need these constraints.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @davies Based on my understanding, For example, Please correct me if my understanding is not right. Thanks!
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For EqualNullSafe, the result is true if both a and b are All others make sense to me, we can have that.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, true. Let me correct it. Thanks! |
||
| val rightHasNonNullPredicate = rightConditions.exists(canFilterOutNull) || | ||
| filter.constraints.filter(_.isInstanceOf[IsNotNull]) | ||
| .exists(expr => join.right.outputSet.intersect(expr.references).nonEmpty) | ||
|
|
||
| join.joinType match { | ||
| case RightOuter if leftHasNonNullPredicate => Inner | ||
| case LeftOuter if rightHasNonNullPredicate => Inner | ||
| case FullOuter if leftHasNonNullPredicate && rightHasNonNullPredicate => Inner | ||
| case FullOuter if leftHasNonNullPredicate => LeftOuter | ||
| case FullOuter if rightHasNonNullPredicate => RightOuter | ||
| case o => o | ||
| } | ||
| } | ||
|
|
||
| def apply(plan: LogicalPlan): LogicalPlan = plan transform { | ||
| case f @ Filter(condition, j @ Join(_, _, RightOuter | LeftOuter | FullOuter, _)) => | ||
| val newJoinType = buildNewJoinType(f, j) | ||
| if (j.joinType == newJoinType) f else Filter(condition, j.copy(joinType = newJoinType)) | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Pushes down [[Filter]] operators where the `condition` can be | ||
| * evaluated using only the attributes of the left or right side of a join. Other | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,195 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.sql.catalyst.optimizer | ||
|
|
||
| import org.apache.spark.sql.catalyst.analysis.EliminateSubQueries | ||
| import org.apache.spark.sql.catalyst.dsl.expressions._ | ||
| import org.apache.spark.sql.catalyst.dsl.plans._ | ||
| import org.apache.spark.sql.catalyst.plans._ | ||
| import org.apache.spark.sql.catalyst.plans.logical._ | ||
| import org.apache.spark.sql.catalyst.rules._ | ||
|
|
||
| class OuterJoinEliminationSuite extends PlanTest { | ||
| object Optimize extends RuleExecutor[LogicalPlan] { | ||
| val batches = | ||
| Batch("Subqueries", Once, | ||
| EliminateSubQueries) :: | ||
| Batch("Outer Join Elimination", Once, | ||
| OuterJoinElimination, | ||
| PushPredicateThroughJoin) :: Nil | ||
| } | ||
|
|
||
| val testRelation = LocalRelation('a.int, 'b.int, 'c.int) | ||
| val testRelation1 = LocalRelation('d.int, 'e.int, 'f.int) | ||
|
|
||
| test("joins: full outer to inner") { | ||
| val x = testRelation.subquery('x) | ||
| val y = testRelation1.subquery('y) | ||
|
|
||
| val originalQuery = | ||
| x.join(y, FullOuter, Option("x.a".attr === "y.d".attr)) | ||
| .where("x.b".attr >= 1 && "y.d".attr >= 2) | ||
|
|
||
| val optimized = Optimize.execute(originalQuery.analyze) | ||
| val left = testRelation.where('b >= 1) | ||
| val right = testRelation1.where('d >= 2) | ||
| val correctAnswer = | ||
| left.join(right, Inner, Option("a".attr === "d".attr)).analyze | ||
|
|
||
| comparePlans(optimized, correctAnswer) | ||
| } | ||
|
|
||
| test("joins: full outer to right") { | ||
| val x = testRelation.subquery('x) | ||
| val y = testRelation1.subquery('y) | ||
|
|
||
| val originalQuery = | ||
| x.join(y, FullOuter, Option("x.a".attr === "y.d".attr)).where("y.d".attr > 2) | ||
|
|
||
| val optimized = Optimize.execute(originalQuery.analyze) | ||
| val left = testRelation | ||
| val right = testRelation1.where('d > 2) | ||
| val correctAnswer = | ||
| left.join(right, RightOuter, Option("a".attr === "d".attr)).analyze | ||
|
|
||
| comparePlans(optimized, correctAnswer) | ||
| } | ||
|
|
||
| test("joins: full outer to left") { | ||
| val x = testRelation.subquery('x) | ||
| val y = testRelation1.subquery('y) | ||
|
|
||
| val originalQuery = | ||
| x.join(y, FullOuter, Option("x.a".attr === "y.d".attr)).where("x.a".attr <=> 2) | ||
|
|
||
| val optimized = Optimize.execute(originalQuery.analyze) | ||
| val left = testRelation.where('a <=> 2) | ||
| val right = testRelation1 | ||
| val correctAnswer = | ||
| left.join(right, LeftOuter, Option("a".attr === "d".attr)).analyze | ||
|
|
||
| comparePlans(optimized, correctAnswer) | ||
| } | ||
|
|
||
| test("joins: right to inner") { | ||
| val x = testRelation.subquery('x) | ||
| val y = testRelation1.subquery('y) | ||
|
|
||
| val originalQuery = | ||
| x.join(y, RightOuter, Option("x.a".attr === "y.d".attr)).where("x.b".attr > 2) | ||
|
|
||
| val optimized = Optimize.execute(originalQuery.analyze) | ||
| val left = testRelation.where('b > 2) | ||
| val right = testRelation1 | ||
| val correctAnswer = | ||
| left.join(right, Inner, Option("a".attr === "d".attr)).analyze | ||
|
|
||
| comparePlans(optimized, correctAnswer) | ||
| } | ||
|
|
||
| test("joins: left to inner") { | ||
| val x = testRelation.subquery('x) | ||
| val y = testRelation1.subquery('y) | ||
|
|
||
| val originalQuery = | ||
| x.join(y, LeftOuter, Option("x.a".attr === "y.d".attr)) | ||
| .where("y.e".attr.isNotNull) | ||
|
|
||
| val optimized = Optimize.execute(originalQuery.analyze) | ||
| val left = testRelation | ||
| val right = testRelation1.where('e.isNotNull) | ||
| val correctAnswer = | ||
| left.join(right, Inner, Option("a".attr === "d".attr)).analyze | ||
|
|
||
| comparePlans(optimized, correctAnswer) | ||
| } | ||
|
|
||
| // evaluating if mixed OR and NOT expressions can eliminate all null-supplying rows | ||
| test("joins: left to inner with complicated filter predicates #1") { | ||
| val x = testRelation.subquery('x) | ||
| val y = testRelation1.subquery('y) | ||
|
|
||
| val originalQuery = | ||
| x.join(y, LeftOuter, Option("x.a".attr === "y.d".attr)) | ||
| .where(!'e.isNull || ('d.isNotNull && 'f.isNull)) | ||
|
|
||
| val optimized = Optimize.execute(originalQuery.analyze) | ||
| val left = testRelation | ||
| val right = testRelation1.where(!'e.isNull || ('d.isNotNull && 'f.isNull)) | ||
| val correctAnswer = | ||
| left.join(right, Inner, Option("a".attr === "d".attr)).analyze | ||
|
|
||
| comparePlans(optimized, correctAnswer) | ||
| } | ||
|
|
||
| // eval(emptyRow) of 'e.in(1, 2) will return null instead of false | ||
| test("joins: left to inner with complicated filter predicates #2") { | ||
| val x = testRelation.subquery('x) | ||
| val y = testRelation1.subquery('y) | ||
|
|
||
| val originalQuery = | ||
| x.join(y, LeftOuter, Option("x.a".attr === "y.d".attr)) | ||
| .where('e.in(1, 2)) | ||
|
|
||
| val optimized = Optimize.execute(originalQuery.analyze) | ||
| val left = testRelation | ||
| val right = testRelation1.where('e.in(1, 2)) | ||
| val correctAnswer = | ||
| left.join(right, Inner, Option("a".attr === "d".attr)).analyze | ||
|
|
||
| comparePlans(optimized, correctAnswer) | ||
| } | ||
|
|
||
| // evaluating if mixed OR and AND expressions can eliminate all null-supplying rows | ||
| test("joins: left to inner with complicated filter predicates #3") { | ||
| val x = testRelation.subquery('x) | ||
| val y = testRelation1.subquery('y) | ||
|
|
||
| val originalQuery = | ||
| x.join(y, LeftOuter, Option("x.a".attr === "y.d".attr)) | ||
| .where((!'e.isNull || ('d.isNotNull && 'f.isNull)) && 'e.isNull) | ||
|
|
||
| val optimized = Optimize.execute(originalQuery.analyze) | ||
| val left = testRelation | ||
| val right = testRelation1.where((!'e.isNull || ('d.isNotNull && 'f.isNull)) && 'e.isNull) | ||
| val correctAnswer = | ||
| left.join(right, Inner, Option("a".attr === "d".attr)).analyze | ||
|
|
||
| comparePlans(optimized, correctAnswer) | ||
| } | ||
|
|
||
| // evaluating if the expressions that have both left and right attributes | ||
| // can eliminate all null-supplying rows | ||
| // FULL OUTER => INNER | ||
| test("joins: left to inner with complicated filter predicates #4") { | ||
| val x = testRelation.subquery('x) | ||
| val y = testRelation1.subquery('y) | ||
|
|
||
| val originalQuery = | ||
| x.join(y, FullOuter, Option("x.a".attr === "y.d".attr)) | ||
| .where("x.b".attr + 3 === "y.e".attr) | ||
|
|
||
| val optimized = Optimize.execute(originalQuery.analyze) | ||
| val left = testRelation | ||
| val right = testRelation1 | ||
| val correctAnswer = | ||
| left.join(right, Inner, Option("b".attr + 3 === "e".attr && "a".attr === "d".attr)).analyze | ||
|
|
||
| comparePlans(optimized, correctAnswer) | ||
| } | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we should also check the expression is deterministic or not.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You are right. Let me add it. Thanks!