Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,9 @@ class SqlParser extends StandardTokenParsers {
protected val NULL = Keyword("NULL")
protected val ON = Keyword("ON")
protected val OR = Keyword("OR")
protected val LIKE = Keyword("LIKE")
protected val RLIKE = Keyword("RLIKE")
protected val REGEXP = Keyword("REGEXP")
protected val ORDER = Keyword("ORDER")
protected val OUTER = Keyword("OUTER")
protected val RIGHT = Keyword("RIGHT")
Expand Down Expand Up @@ -267,6 +270,9 @@ class SqlParser extends StandardTokenParsers {
termExpression ~ ">=" ~ termExpression ^^ { case e1 ~ _ ~ e2 => GreaterThanOrEqual(e1, e2) } |
termExpression ~ "!=" ~ termExpression ^^ { case e1 ~ _ ~ e2 => Not(Equals(e1, e2)) } |
termExpression ~ "<>" ~ termExpression ^^ { case e1 ~ _ ~ e2 => Not(Equals(e1, e2)) } |
termExpression ~ RLIKE ~ termExpression ^^ { case e1 ~ _ ~ e2 => RLike(e1, e2) } |
termExpression ~ REGEXP ~ termExpression ^^ { case e1 ~ _ ~ e2 => RLike(e1, e2) } |
termExpression ~ LIKE ~ termExpression ^^ { case e1 ~ _ ~ e2 => Like(e1, e2) } |
termExpression ~ IN ~ "(" ~ rep1sep(termExpression, ",") <~ ")" ^^ {
case e1 ~ _ ~ _ ~ e2 => In(e1, e2)
} |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,9 @@ package object dsl {
def === (other: Expression) = Equals(expr, other)
def != (other: Expression) = Not(Equals(expr, other))

/** Builds a `Like` expression with this operand's `expr` as the input and `other` as the pattern. */
def like(other: Expression) = Like(expr, other)
/** Builds an `RLike` expression with this operand's `expr` as the input and `other` as the pattern. */
def rlike(other: Expression) = RLike(expr, other)

def asc = SortOrder(expr, Ascending)
def desc = SortOrder(expr, Descending)

Expand All @@ -91,7 +94,10 @@ package object dsl {
implicit def symbolToUnresolvedAttribute(s: Symbol) = analysis.UnresolvedAttribute(s.name)

implicit class DslSymbol(sym: Symbol) extends ImplicitAttribute { def s = sym.name }
implicit class DslString(val s: String) extends ImplicitAttribute
/**
 * DSL wrapper for plain strings. When used as an operand (`expr`) the string is a
 * `Literal`; `attr` instead interprets the string as an unresolved attribute name.
 */
implicit class DslString(val s: String) extends ImplicitOperators {
// Operators applied to a string treat it as a literal value.
def expr: Expression = Literal(s)
// Explicit way to use the string as an attribute name rather than a literal.
def attr = analysis.UnresolvedAttribute(s)
}

abstract class ImplicitAttribute extends ImplicitOperators {
def s: String
Expand All @@ -111,6 +117,8 @@ package object dsl {

// Protobuf terminology
def required = a.withNullability(false)

/** Wraps `a` in a `BoundReference` tied to the given ordinal. */
def at(ordinal: Int) = BoundReference(ordinal, a)
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,103 @@ package org.apache.spark.sql
package catalyst
package expressions

import java.util.regex.Pattern

import org.apache.spark.sql.catalyst.types.DataType
import org.apache.spark.sql.catalyst.types.StringType
import org.apache.spark.sql.catalyst.types.BooleanType
import org.apache.spark.sql.catalyst.trees.TreeNode
import org.apache.spark.sql.catalyst.errors.`package`.TreeNodeException


/**
 * Shared evaluation logic for binary expressions that test a string (the left child)
 * against a regular-expression pattern (the right child).
 *
 * Implementations supply `escape`, which turns the raw pattern string into the text that
 * is compiled as a Java regex, and `matches`, which decides how the compiled pattern is
 * applied to the input. Evaluation yields a Boolean, or null when either operand
 * evaluates to null.
 */
trait StringRegexExpression {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add scala doc to this trait and the classes below.

self: BinaryExpression =>

type EvaluatedType = Any
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

EvaluatedType can be set to Boolean.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, forgot that evaluation result can be null...


/** Converts the raw pattern string into the text handed to `Pattern.compile`. */
def escape(v: String): String
/** Decides whether `str` satisfies the already-compiled `regex`. */
def matches(regex: Pattern, str: String): Boolean

// Evaluation can produce null (see `apply`), so the expression is always nullable.
def nullable: Boolean = true
// The non-null result of evaluation is the Boolean returned by `matches`.
def dataType: DataType = BooleanType

// Pre-compile once when the pattern operand is a string literal. A null value means
// "nothing cached", in which case the pattern has to be compiled per evaluation.
private lazy val cache: Pattern = right match {
  case Literal(literal: String, StringType) => compile(literal)
  case _ => null
}

/**
 * Compiles the escaped form of `str`, or returns null when `str` is null.
 * An invalid regex is deliberately not caught here; the compile error propagates.
 */
protected def compile(str: String): Pattern = str match {
  case null => null
  case raw => Pattern.compile(escape(raw))
}

case class Like(left: Expression, right: Expression) extends BinaryExpression {
def dataType = BooleanType
def nullable = left.nullable // Right cannot be null.
// Use the literal's pre-compiled pattern when there is one; otherwise compile `str` now.
protected def pattern(str: String) = if (cache != null) cache else compile(str)

/**
 * Evaluates the left child, then the right child, and tests the former against the
 * pattern built from the latter. Returns null as soon as the input, the pattern string
 * or the compiled pattern is null; the right child is not evaluated when the left one
 * is null.
 */
override def apply(input: Row): Any = left.apply(input) match {
  case null => null
  case subject =>
    right.apply(input) match {
      case null => null
      case rawPattern =>
        pattern(rawPattern.asInstanceOf[String]) match {
          case null => null
          case compiled => matches(compiled, subject.asInstanceOf[String])
        }
    }
}
}

/**
 * SQL LIKE pattern matching.
 *
 * The whole input string must match the pattern. In the pattern, `_` stands for exactly
 * one character and `%` for any run of characters (including none); `\_` and `\%` match
 * a literal underscore or percent sign. Every other character matches itself.
 */
case class Like(left: Expression, right: Expression)
  extends BinaryExpression with StringRegexExpression {

  def symbol = "LIKE"

  /**
   * Translates a LIKE pattern into Java regex syntax.
   *
   * @param v the LIKE pattern
   * @return the equivalent regex, prefixed with `(?s)`
   */
  override def escape(v: String): String = {
    val regex = new StringBuilder()
    // DOTALL: without it `.` skips line terminators, so `_` and `%` would fail to match
    // inputs that contain a newline.
    regex.append("(?s)")
    var i = 0
    while (i < v.length) {
      val c = v.charAt(i)
      if (c == '\\' && i + 1 < v.length && (v.charAt(i + 1) == '_' || v.charAt(i + 1) == '%')) {
        // Escaped wildcard: emit it as a plain character and skip the backslash.
        regex.append(v.charAt(i + 1))
        i += 1
      } else if (c == '_') {
        regex.append(".")
      } else if (c == '%') {
        regex.append(".*")
      } else {
        // Quote everything else so regex metacharacters in the pattern stay literal.
        regex.append(Pattern.quote(Character.toString(c)))
      }

      i += 1
    }

    regex.toString()
  }

  // LIKE is anchored at both ends: the pattern has to cover the entire input.
  override def matches(regex: Pattern, str: String): Boolean = regex.matcher(str).matches()
}

/**
 * Regular-expression matching (RLIKE).
 *
 * The pattern string is used as a Java regex verbatim, and the result is true when the
 * regex is found anywhere in the input rather than only when it spans the whole string.
 */
case class RLike(left: Expression, right: Expression)
  extends BinaryExpression with StringRegexExpression {

  def symbol = "RLIKE"

  // The pattern is already regex syntax, so nothing needs translating.
  override def escape(v: String): String = v

  // Search from the start of the input for any matching subsequence.
  override def matches(regex: Pattern, str: String): Boolean = {
    val matcher = regex.matcher(str)
    matcher.find(0)
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -111,4 +111,87 @@ class ExpressionEvaluationSuite extends FunSuite {
}
}
}

/** Evaluates `expression` against `inputRow` (the empty row by default) and returns the raw result. */
def evaluate(expression: Expression, inputRow: Row = EmptyRow): Any =
  expression.apply(inputRow)

/**
 * Evaluates `expression` against `inputRow` and fails the test unless the result equals
 * `expected`. An exception thrown during evaluation is reported as a test failure with
 * the exception attached as the cause.
 */
def checkEvaluation(expression: Expression, expected: Any, inputRow: Row = EmptyRow): Unit = {
  val result =
    try {
      evaluate(expression, inputRow)
    } catch {
      case e: Exception => fail(s"Exception evaluating $expression", e)
    }

  if (result != expected) {
    // Only mention the row when one was actually supplied.
    val rowSuffix = if (inputRow != EmptyRow) s", input: $inputRow" else ""
    fail(s"Incorrect Evaluation: $expression, actual: $result, expected: $expected$rowSuffix")
  }
}

test("LIKE literal Regular Expression") {
  // A null operand on either side evaluates to null.
  checkEvaluation(Literal(null, StringType).like("a"), null)
  checkEvaluation(Literal(null, StringType).like(Literal(null, StringType)), null)

  // (input, LIKE pattern, expected result)
  val cases = Seq(
    ("abdef", "abdef", true),
    ("a_%b", "a\\__b", true),
    ("addb", "a_%b", true),
    ("addb", "a\\__b", false),
    ("addb", "a%\\%b", false),
    ("a_%b", "a%\\%b", true),
    ("addb", "a%", true),
    ("addb", "**", false),
    ("abc", "a%", true),
    ("abc", "b%", false),
    ("abc", "bc%", false))

  cases.foreach { case (input, likePattern, outcome) =>
    checkEvaluation(input like likePattern, outcome)
  }
}

test("LIKE Non-literal Regular Expression") {
  // The pattern is read from position 0 of the input row instead of being a literal.
  val regEx = 'a.string.at(0)

  // A null pattern in the row evaluates to null.
  checkEvaluation("abcd" like regEx, null, new GenericRow(Array[Any](null)))

  // (input, LIKE pattern stored in the row, expected result)
  val cases = Seq(
    ("abdef", "abdef", true),
    ("a_%b", "a\\__b", true),
    ("addb", "a_%b", true),
    ("addb", "a\\__b", false),
    ("addb", "a%\\%b", false),
    ("a_%b", "a%\\%b", true),
    ("addb", "a%", true),
    ("addb", "**", false),
    ("abc", "a%", true),
    ("abc", "b%", false),
    ("abc", "bc%", false))

  cases.foreach { case (input, rowPattern, outcome) =>
    checkEvaluation(input like regEx, outcome, new GenericRow(Array[Any](rowPattern)))
  }
}

test("RLIKE literal Regular Expression") {
  // (input, regex, expected result). The "^ab" / "^bc" pair used to be asserted twice
  // through copy-paste; each case is now listed once.
  val cases = Seq(
    ("abdef", "abdef", true),
    ("abbbbc", "a.*c", true),
    ("fofo", "^fo", true),
    ("fo\no", "^fo\no$", true),
    ("Bn", "^Ba*n", true),
    ("afofo", "fo", true),
    ("afofo", "^fo", false),
    ("Baan", "^Ba?n", false),
    ("axe", "pi|apa", false),
    ("pip", "^(pi)*$", false),
    ("abc", "^ab", true),
    ("abc", "^bc", false))

  cases.foreach { case (input, regex, outcome) =>
    checkEvaluation(input rlike regex, outcome)
  }

  // An invalid regex must surface as a PatternSyntaxException, not as a null/false result.
  intercept[java.util.regex.PatternSyntaxException] {
    evaluate("abbbbc" rlike "**")
  }
}

test("RLIKE Non-literal Regular Expression") {
  // The regex is read from position 0 of the input row instead of being a literal.
  val regEx = 'a.string.at(0)

  // (input, regex stored in the row); every case is expected to match.
  val matching = Seq(
    ("abdef", "abdef"),
    ("abbbbc", "a.*c"),
    ("fofo", "^fo"),
    ("fo\no", "^fo\no$"),
    ("Bn", "^Ba*n"))

  matching.foreach { case (input, rowPattern) =>
    checkEvaluation(input rlike regEx, true, new GenericRow(Array[Any](rowPattern)))
  }

  // An invalid regex coming from the row must raise PatternSyntaxException at evaluation.
  intercept[java.util.regex.PatternSyntaxException] {
    evaluate("abbbbc" rlike regEx, new GenericRow(Array[Any]("**")))
  }
}
}

Original file line number Diff line number Diff line change
Expand Up @@ -848,12 +848,9 @@ object HiveQl {
case Token(">=", left :: right:: Nil) => GreaterThanOrEqual(nodeToExpr(left), nodeToExpr(right))
case Token("<", left :: right:: Nil) => LessThan(nodeToExpr(left), nodeToExpr(right))
case Token("<=", left :: right:: Nil) => LessThanOrEqual(nodeToExpr(left), nodeToExpr(right))
case Token("LIKE", left :: right:: Nil) =>
UnresolvedFunction("LIKE", Seq(nodeToExpr(left), nodeToExpr(right)))
case Token("RLIKE", left :: right:: Nil) =>
UnresolvedFunction("RLIKE", Seq(nodeToExpr(left), nodeToExpr(right)))
case Token("REGEXP", left :: right:: Nil) =>
UnresolvedFunction("REGEXP", Seq(nodeToExpr(left), nodeToExpr(right)))
case Token("LIKE", left :: right:: Nil) => Like(nodeToExpr(left), nodeToExpr(right))
case Token("RLIKE", left :: right:: Nil) => RLike(nodeToExpr(left), nodeToExpr(right))
case Token("REGEXP", left :: right:: Nil) => RLike(nodeToExpr(left), nodeToExpr(right))
case Token("TOK_FUNCTION", Token("TOK_ISNOTNULL", Nil) :: child :: Nil) =>
IsNotNull(nodeToExpr(child))
case Token("TOK_FUNCTION", Token("TOK_ISNULL", Nil) :: child :: Nil) =>
Expand Down