-
Notifications
You must be signed in to change notification settings - Fork 28.9k
[SPARK-6624][WIP] Draft of another alternative version of CNF normalization #10444
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,6 +21,7 @@ import org.apache.spark.sql.catalyst.InternalRow | |
| import org.apache.spark.sql.catalyst.analysis.TypeCheckResult | ||
| import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode} | ||
| import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan | ||
| import org.apache.spark.sql.catalyst.rules.{RuleExecutor, Rule} | ||
| import org.apache.spark.sql.catalyst.util.TypeUtils | ||
| import org.apache.spark.sql.types._ | ||
| import org.apache.spark.util.Utils | ||
|
|
@@ -47,6 +48,34 @@ trait Predicate extends Expression { | |
| override def dataType: DataType = BooleanType | ||
| } | ||
|
|
||
| object Predicate extends PredicateHelper { | ||
| def toCNF(predicate: Expression, maybeThreshold: Option[Double] = None): Expression = { | ||
| val cnf = new CNFExecutor(predicate).execute(predicate) | ||
| val threshold = maybeThreshold.map(predicate.size * _).getOrElse(Double.MaxValue) | ||
| if (cnf.size > threshold) predicate else cnf | ||
| } | ||
|
|
||
| private class CNFNormalization(input: Expression) | ||
| extends Rule[Expression] { | ||
|
|
||
| override def apply(tree: Expression): Expression = { | ||
| import org.apache.spark.sql.catalyst.dsl.expressions._ | ||
|
|
||
| tree transformDown { | ||
| case Not(Not(e)) => e | ||
| case Not(a And b) => !a || !b | ||
| case Not(a Or b) => !a && !b | ||
| case a Or (b And c) => (a || b) && (a || c) | ||
| case (a And b) Or c => (a || c) && (b || c) | ||
| } | ||
| } | ||
| } | ||
|
|
||
| private class CNFExecutor(input: Expression) extends RuleExecutor[Expression] { | ||
| override protected val batches: Seq[Batch] = | ||
| Batch("CNFNormalization", FixedPoint.Unlimited, new CNFNormalization(input)) :: Nil | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
||
| } | ||
| } | ||
|
|
||
| trait PredicateHelper { | ||
| protected def splitConjunctivePredicates(condition: Expression): Seq[Expression] = { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -58,6 +58,7 @@ object DefaultOptimizer extends Optimizer { | |
| ConstantFolding, | ||
| LikeSimplification, | ||
| BooleanSimplification, | ||
| CNFNormalization, | ||
| RemoveDispensableExpressions, | ||
| SimplifyFilters, | ||
| SimplifyCasts, | ||
|
|
@@ -583,6 +584,12 @@ object BooleanSimplification extends Rule[LogicalPlan] with PredicateHelper { | |
| } | ||
| } | ||
|
|
||
| object CNFNormalization extends Rule[LogicalPlan] { | ||
| override def apply(plan: LogicalPlan): LogicalPlan = plan transform { | ||
| case f @ Filter(condition, _) => f.copy(condition = Predicate.toCNF(condition, Some(10))) | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Apparently, the expansion threshold can be made a configuration option. |
||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Combines two adjacent [[Filter]] operators into one, merging the | ||
| * conditions into one conjunctive predicate. | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
When the threshold is exceeded, the original predicate rather than the intermediate converted predicate is returned. This is because the intermediate result may not be in CNF, thus:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I disagree with 1. I don't see why it matters if it is all CNF or none. I think the heuristic we want is something like "maximize the number of simple predicates that are in CNF". Simple here means a predicate that contains just one attribute, or a binary predicate between two attributes. These are candidates for benefiting from further optimization.
We could try cost basing this or just stopping the expansion after some amount.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maximizing the number of simple predicates sounds reasonable. We may do the conversion in a depth-first manner, i.e. always convert the left branch of an `And` and then its right branch, until either no more predicates can be converted or we reach the size limit. In this way the intermediate result is still useful.
BTW, I searched for CNF conversion in Hive and found HIVE-9166, which also tries to put an upper limit on ORC SARG CNF conversion. @nongli Any clues about how Impala does this?