Skip to content

Commit cc3e0a1

Browse files
committed
[SPARK-2395][SQL] Optimize common LIKE patterns.
Author: Michael Armbrust <[email protected]> Closes apache#1325 from marmbrus/slowLike and squashes the following commits: 023c3eb [Michael Armbrust] add comment. 8b421c2 [Michael Armbrust] Handle the case where the final % is actually escaped. d34d37e [Michael Armbrust] add periods. 3bbf35f [Michael Armbrust] Roll back changes to SparkBuild 53894b1 [Michael Armbrust] Fix grammar. 4094462 [Michael Armbrust] Fix grammar. 6d3d0a0 [Michael Armbrust] Optimize common LIKE patterns.
1 parent 56e009d commit cc3e0a1

File tree

2 files changed

+74
-0
lines changed

2 files changed

+74
-0
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,3 +156,54 @@ case class Lower(child: Expression) extends UnaryExpression with CaseConversionE
156156

157157
override def toString() = s"Lower($child)"
158158
}
159+
160+
/** A base class for functions that compare two strings, returning a boolean. */
161+
abstract class StringComparison extends Expression {
162+
self: Product =>
163+
164+
type EvaluatedType = Any
165+
166+
def left: Expression
167+
def right: Expression
168+
169+
override def references = children.flatMap(_.references).toSet
170+
override def children = left :: right :: Nil
171+
172+
override def nullable: Boolean = true
173+
override def dataType: DataType = BooleanType
174+
175+
def compare(l: String, r: String): Boolean
176+
177+
override def eval(input: Row): Any = {
178+
val leftEval = left.eval(input).asInstanceOf[String]
179+
if(leftEval == null) {
180+
null
181+
} else {
182+
val rightEval = right.eval(input).asInstanceOf[String]
183+
if (rightEval == null) null else compare(leftEval, rightEval)
184+
}
185+
}
186+
187+
override def toString() = s"$nodeName($left, $right)"
188+
}
189+
190+
/**
191+
* A function that returns true if the string `left` contains the string `right`.
192+
*/
193+
case class Contains(left: Expression, right: Expression) extends StringComparison {
194+
override def compare(l: String, r: String) = l.contains(r)
195+
}
196+
197+
/**
198+
* A function that returns true if the string `left` starts with the string `right`.
199+
*/
200+
case class StartsWith(left: Expression, right: Expression) extends StringComparison {
201+
def compare(l: String, r: String) = l.startsWith(r)
202+
}
203+
204+
/**
205+
* A function that returns true if the string `left` ends with the string `right`.
206+
*/
207+
case class EndsWith(left: Expression, right: Expression) extends StringComparison {
208+
def compare(l: String, r: String) = l.endsWith(r)
209+
}

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ object Optimizer extends RuleExecutor[LogicalPlan] {
3434
Batch("ConstantFolding", FixedPoint(100),
3535
NullPropagation,
3636
ConstantFolding,
37+
LikeSimplification,
3738
BooleanSimplification,
3839
SimplifyFilters,
3940
SimplifyCasts,
@@ -111,6 +112,28 @@ object ColumnPruning extends Rule[LogicalPlan] {
111112
}
112113
}
113114

115+
/**
116+
* Simplifies LIKE expressions that do not need full regular expressions to evaluate the condition.
117+
* For example, when the expression is just checking to see if a string starts with a given
118+
* pattern.
119+
*/
120+
object LikeSimplification extends Rule[LogicalPlan] {
121+
// if guards below protect from escapes on trailing %.
122+
// Cases like "something\%" are not optimized, but this does not affect correctness.
123+
val startsWith = "([^_%]+)%".r
124+
val endsWith = "%([^_%]+)".r
125+
val contains = "%([^_%]+)%".r
126+
127+
def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
128+
case Like(l, Literal(startsWith(pattern), StringType)) if !pattern.endsWith("\\") =>
129+
StartsWith(l, Literal(pattern))
130+
case Like(l, Literal(endsWith(pattern), StringType)) =>
131+
EndsWith(l, Literal(pattern))
132+
case Like(l, Literal(contains(pattern), StringType)) if !pattern.endsWith("\\") =>
133+
Contains(l, Literal(pattern))
134+
}
135+
}
136+
114137
/**
115138
* Replaces [[Expression Expressions]] that can be statically evaluated with
116139
* equivalent [[Literal]] values. This rule is more specific with

0 commit comments

Comments
 (0)