From 4a6f903897d28a3038918997e692410259a90ae3 Mon Sep 17 00:00:00 2001 From: gengjiaan Date: Fri, 19 Jun 2020 10:36:52 +0800 Subject: [PATCH 01/35] Reuse completeNextStageWithFetchFailure --- .../apache/spark/scheduler/DAGSchedulerSuite.scala | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala index 9d412f2dba3ce..762b14e170fcc 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala @@ -1796,9 +1796,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi // lets say there is a fetch failure in this task set, which makes us go back and // run stage 0, attempt 1 - complete(taskSets(1), Seq( - (FetchFailed(makeBlockManagerId("hostA"), - shuffleDep1.shuffleId, 0L, 0, 0, "ignored"), null))) + completeNextStageWithFetchFailure(1, 0, shuffleDep1) scheduler.resubmitFailedStages() // stage 0, attempt 1 should have the properties of job2 @@ -1872,9 +1870,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi // have the second stage complete normally completeShuffleMapStageSuccessfully(1, 0, 1, Seq("hostA", "hostC")) // fail the third stage because hostA went down - complete(taskSets(2), Seq( - (FetchFailed(makeBlockManagerId("hostA"), - shuffleDepTwo.shuffleId, 0L, 0, 0, "ignored"), null))) + completeNextStageWithFetchFailure(2, 0, shuffleDepTwo) // TODO assert this: // blockManagerMaster.removeExecutor("hostA-exec") // have DAGScheduler try again @@ -1900,9 +1896,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi // complete stage 1 completeShuffleMapStageSuccessfully(1, 0, 1) // pretend stage 2 failed because hostA went down - complete(taskSets(2), Seq( - (FetchFailed(makeBlockManagerId("hostA"), - shuffleDepTwo.shuffleId, 0L, 0, 0, "ignored"), null))) + completeNextStageWithFetchFailure(2, 0, shuffleDepTwo) // TODO assert this: // blockManagerMaster.removeExecutor("hostA-exec") // DAGScheduler should notice the cached copy of the second shuffle and try to get it rerun. From 41633827583d6f0d91e0e48b781c25c95ec06765 Mon Sep 17 00:00:00 2001 From: gengjiaan Date: Sat, 10 Oct 2020 18:19:57 +0800 Subject: [PATCH 02/35] Support build-in LIKE_ALL function --- .../catalyst/analysis/FunctionRegistry.scala | 2 + .../spark/sql/catalyst/dsl/package.scala | 2 + .../expressions/regexpExpressions.scala | 194 ++++++++++++++++++ .../sql/catalyst/parser/AstBuilder.scala | 8 +- .../expressions/RegexpExpressionsSuite.scala | 21 ++ 5 files changed, 226 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index 3fae34cbf00c2..d35c9b050957e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -344,6 +344,8 @@ object FunctionRegistry { expression[Length]("length"), expression[Levenshtein]("levenshtein"), expression[Like]("like"), + expression[LikeAll]("like_all"), + expression[NotLikeAll]("not_like_all"), expression[Lower]("lower"), expression[OctetLength]("octet_length"), expression[StringLocate]("locate"), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index b61c4b8d065f2..66c263d42b907 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -102,6 +102,8 @@ package object dsl { def like(other: Expression, escapeChar: Char = '\\'): Expression = Like(expr, other, escapeChar) def rlike(other: Expression): Expression = RLike(expr, other) + def likeAll(others: Expression*): Expression = LikeAll(expr, others) + def notLikeAll(others: Expression*): Expression = NotLikeAll(expr, others) def contains(other: Expression): Expression = Contains(expr, other) def startsWith(other: Expression): Expression = StartsWith(expr, other) def endsWith(other: Expression): Expression = EndsWith(expr, other) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 8eb7f463e049c..57880197dd015 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -24,6 +24,7 @@ import scala.collection.mutable.ArrayBuffer import org.apache.commons.text.StringEscapeUtils +import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.expressions.codegen.Block._ import org.apache.spark.sql.catalyst.util.{GenericArrayData, StringUtils} @@ -176,6 +177,199 @@ case class Like(left: Expression, right: Expression, escapeChar: Char) } } +abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with NullIntolerant { + def value: Expression + def list: Seq[Expression] + def isNot: Boolean + + override def inputTypes: Seq[AbstractDataType] = { + val arrayOrStr = TypeCollection(ArrayType(StringType), StringType) + StringType +: Seq.fill(children.size - 1)(arrayOrStr) + } + + override def dataType: DataType = BooleanType + + override def children: Seq[Expression] = value +: list + + override def foldable: Boolean = value.foldable && list.forall(_.foldable) + + override def nullable: Boolean = true + + def escape(v: String): String = StringUtils.escapeLikeRegex(v, '\\') + + def matches(regex: Pattern, str: String): Boolean = regex.matcher(str).matches() + + override def eval(input: InternalRow): Any = { + val evaluatedValue = value.eval(input) + if (evaluatedValue == null) { + null + } else { + list.foreach { e => + val str = e.eval(input) + if (str == null) { + return null + } + val regex = Pattern.compile(escape(str.asInstanceOf[UTF8String].toString)) + if(regex == null) { + return null + } else if (isNot && matches(regex, evaluatedValue.asInstanceOf[UTF8String].toString)) { + return false + } else if (!isNot && !matches(regex, evaluatedValue.asInstanceOf[UTF8String].toString)) { + return false + } + } + return true + } + } + + override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + val patternClass = classOf[Pattern].getName + val escapeFunc = StringUtils.getClass.getName.stripSuffix("$") + ".escapeLikeRegex" + val javaDataType = CodeGenerator.javaType(value.dataType) + val valueGen = value.genCode(ctx) + val listGen = list.map(_.genCode(ctx)) + val pattern = ctx.freshName("pattern") + val rightStr = ctx.freshName("rightStr") + val escapedEscapeChar = StringEscapeUtils.escapeJava("\\") + val hasNull = ctx.freshName("hasNull") + val matched = ctx.freshName("matched") + val valueArg = ctx.freshName("valueArg") + // All the blocks are meant to be inside a do { ... } while (false); loop. + // The evaluation of variables can be stopped when we find a matching value. + val listCode = listGen.map(x => + s""" + |${x.code} + |if (${x.isNull}) { + | $hasNull = true; // ${ev.isNull} = true; + |} else if (!$hasNull && $matched) { + | String $rightStr = ${x.value}.toString(); + | $patternClass $pattern = + | $patternClass.compile($escapeFunc($rightStr, '$escapedEscapeChar')); + | if ($isNot && $pattern.matcher($valueArg.toString()).matches()) { + | $matched = false; + | } else if (!$isNot && !$pattern.matcher($valueArg.toString()).matches()) { + | $matched = false; + | } + |} + """.stripMargin) + + val resultType = CodeGenerator.javaType(dataType) + val codes = ctx.splitExpressionsWithCurrentInputs( + expressions = listCode, + funcName = "likeAll", + extraArguments = (javaDataType, valueArg) :: (CodeGenerator.JAVA_BOOLEAN, hasNull) :: + (resultType, matched) :: Nil, + returnType = resultType, + makeSplitFunction = body => + s""" + |if (!$hasNull && $matched) { + | $body; + |} + """.stripMargin, + foldFunctions = _.map { funcCall => + s""" + |if (!$hasNull && $matched) { + | $funcCall; + |} + """.stripMargin + }.mkString("\n")) + ev.copy(code = + code""" + |${valueGen.code} + |boolean $hasNull = false; + |boolean $matched = true; + |if (${valueGen.isNull}) { + | $hasNull = true; + |} else { + | $javaDataType $valueArg = ${valueGen.value}; + | $codes + |} + |final boolean ${ev.isNull} = ($hasNull == true); + |final boolean ${ev.value} = ($matched == true); + """.stripMargin) + } +} + +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "str _FUNC_ patterns [, isNot] - Returns true if `str` matches all the pattern string " + + ", null if any arguments are null, false otherwise.", + arguments = """ + Arguments: + * str - a string expression + * patterns - a list of string expression. Each pattern is a string which is matched literally, with + exception to the following special symbols: + + _ matches any one character in the input (similar to . in posix regular expressions) + + % matches zero or more characters in the input (similar to .* in posix regular + expressions) + + Since Spark 2.0, string literals are unescaped in our SQL parser. For example, in order + to match "\abc", the pattern should be "\\abc". + + When SQL config 'spark.sql.parser.escapedStringLiterals' is enabled, it fallbacks + to Spark 1.6 behavior regarding string literal parsing. For example, if the config is + enabled, the pattern to match "\abc" should be "\abc". + """, + examples = """ + Examples: + > SELECT 'foo' _FUNC_('%foo%', '%oo'); + true + > SELECT 'foo' _FUNC_('%foo%', '%bar%'); + false + > SELECT 'foo' _FUNC_('%foo%', null); + null + """, + note = """ + x LIKE ALL ('A%','%B','%C%') is equivalent to x LIKE 'A%' AND x LIKE '%B' AND x LIKE '%C%'. + """, + since = "3.1.0") +// scalastyle:on line.size.limit +case class LikeAll(value: Expression, list: Seq[Expression]) extends LikeAllBase { + override def isNot: Boolean = false +} + +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "str _FUNC_ patterns [, isNot] - Returns true if `str` not matches all the pattern string" + + ", null if any arguments are null, false otherwise.", + arguments = """ + Arguments: + * str - a string expression + * patterns - a list of string expression. Each pattern is a string which is matched literally, with + exception to the following special symbols: + + _ matches any one character in the input (similar to . in posix regular expressions) + + % matches zero or more characters in the input (similar to .* in posix regular + expressions) + + Since Spark 2.0, string literals are unescaped in our SQL parser. For example, in order + to match "\abc", the pattern should be "\\abc". + + When SQL config 'spark.sql.parser.escapedStringLiterals' is enabled, it fallbacks + to Spark 1.6 behavior regarding string literal parsing. For example, if the config is + enabled, the pattern to match "\abc" should be "\abc". + """, + examples = """ + Examples: + > SELECT 'foo' _FUNC_('tee', '%yoo%'); + true + > SELECT 'foo' _FUNC_('%oo%', '%yoo%'); + false + > SELECT 'foo' _FUNC_('%yoo%', null); + null + """, + note = """ + x NOT LIKE ALL ('A%','%B','%C%') is equivalent to x NOT LIKE 'A%' AND x NOT LIKE '%B' AND x NOT LIKE '%C%'. + """, + since = "3.1.0") +// scalastyle:on line.size.limit +case class NotLikeAll(value: Expression, list: Seq[Expression]) extends LikeAllBase { + override def isNot: Boolean = true +} + // scalastyle:off line.contains.tab @ExpressionDescription( usage = "str _FUNC_ regexp - Returns true if `str` matches `regexp`, or false otherwise.", diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index f133235a2636e..0e0851eceea81 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -1408,7 +1408,13 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging case Some(SqlBaseParser.ANY) | Some(SqlBaseParser.SOME) => getLikeQuantifierExprs(ctx.expression).reduceLeft(Or) case Some(SqlBaseParser.ALL) => - getLikeQuantifierExprs(ctx.expression).reduceLeft(And) + if (ctx.expression.isEmpty) { + throw new ParseException("Expected something between '(' and ')'.", ctx) + } + ctx.NOT match { + case null => LikeAll(e, ctx.expression.asScala.map(expression)) + case _ => NotLikeAll(e, ctx.expression.asScala.map(expression)) + } case _ => val escapeChar = Option(ctx.escapeChar).map(string).map { str => if (str.length != 1) { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala index 205dc10efc8a8..3db36c53814d9 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala @@ -48,6 +48,27 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(mkExpr(regex), expected, create_row(input)) // check row input } + test("LIKE ALL") { + checkEvaluation(Literal.create("foo", StringType).likeAll( + Literal.create("%foo%", StringType), + Literal.create("%oo", StringType)), true) + checkEvaluation(Literal.create("foo", StringType).likeAll( + Literal.create("%foo%", StringType), + Literal.create("%bar%", StringType)), false) + checkEvaluation(Literal.create("foo", StringType).likeAll( + Literal.create("%foo%", StringType), + Literal.create(null, StringType)), null) + checkEvaluation(Literal.create("foo", StringType).notLikeAll( + Literal.create("tee", StringType), + Literal.create("%yoo%", StringType)), true) + checkEvaluation(Literal.create("foo", StringType).notLikeAll( + Literal.create("%oo%", StringType), + Literal.create("%yoo%", StringType)), false) + checkEvaluation(Literal.create("foo", StringType).notLikeAll( + Literal.create("%yoo%", StringType), + Literal.create(null, StringType)), null) + } + test("LIKE Pattern") { // null handling From 1909298bbb8a5effd4493014086233172dd6a47b Mon Sep 17 00:00:00 2001 From: gengjiaan Date: Mon, 12 Oct 2020 11:05:53 +0800 Subject: [PATCH 03/35] Fix schema issue. --- .../spark/sql/catalyst/dsl/package.scala | 4 +-- .../expressions/regexpExpressions.scala | 30 +++++++++---------- .../sql/catalyst/parser/AstBuilder.scala | 4 +-- .../sql-functions/sql-expression-schema.md | 4 ++- 4 files changed, 22 insertions(+), 20 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index 66c263d42b907..e233381c43b57 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -102,8 +102,8 @@ package object dsl { def like(other: Expression, escapeChar: Char = '\\'): Expression = Like(expr, other, escapeChar) def rlike(other: Expression): Expression = RLike(expr, other) - def likeAll(others: Expression*): Expression = LikeAll(expr, others) - def notLikeAll(others: Expression*): Expression = NotLikeAll(expr, others) + def likeAll(others: Expression*): Expression = LikeAll(expr +: others) + def notLikeAll(others: Expression*): Expression = NotLikeAll(expr +: others) def contains(other: Expression): Expression = Contains(expr, other) def startsWith(other: Expression): Expression = StartsWith(expr, other) def endsWith(other: Expression): Expression = EndsWith(expr, other) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 57880197dd015..ef9aa2ae901b7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -178,8 +178,8 @@ case class Like(left: Expression, right: Expression, escapeChar: Char) } abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with NullIntolerant { - def value: Expression - def list: Seq[Expression] + def value: Expression = children.head + def list: Seq[Expression] = children.tail def isNot: Boolean override def inputTypes: Seq[AbstractDataType] = { @@ -292,8 +292,8 @@ abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with N // scalastyle:off line.size.limit @ExpressionDescription( - usage = "str _FUNC_ patterns [, isNot] - Returns true if `str` matches all the pattern string " + - ", null if any arguments are null, false otherwise.", + usage = "_FUNC_(str, pattern1, pattern2, ...) - Returns true if `str` matches all the pattern string, " + + "null if any arguments are null, false otherwise.", arguments = """ Arguments: * str - a string expression @@ -314,25 +314,25 @@ abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with N """, examples = """ Examples: - > SELECT 'foo' _FUNC_('%foo%', '%oo'); + > SELECT _FUNC_('foo', '%foo%', '%oo'); true - > SELECT 'foo' _FUNC_('%foo%', '%bar%'); + > SELECT _FUNC_('foo', '%foo%', '%bar%'); false - > SELECT 'foo' _FUNC_('%foo%', null); - null + > SELECT _FUNC_('foo', '%foo%', null); + NULL """, note = """ x LIKE ALL ('A%','%B','%C%') is equivalent to x LIKE 'A%' AND x LIKE '%B' AND x LIKE '%C%'. """, since = "3.1.0") // scalastyle:on line.size.limit -case class LikeAll(value: Expression, list: Seq[Expression]) extends LikeAllBase { +case class LikeAll(override val children: Seq[Expression]) extends LikeAllBase { override def isNot: Boolean = false } // scalastyle:off line.size.limit @ExpressionDescription( - usage = "str _FUNC_ patterns [, isNot] - Returns true if `str` not matches all the pattern string" + + usage = "_FUNC_(str, pattern1, pattern2, ...) - Returns true if `str` not matches all the pattern string" + ", null if any arguments are null, false otherwise.", arguments = """ Arguments: @@ -354,19 +354,19 @@ case class LikeAll(value: Expression, list: Seq[Expression]) extends LikeAllBase """, examples = """ Examples: - > SELECT 'foo' _FUNC_('tee', '%yoo%'); + > SELECT _FUNC_('foo', 'tee', '%yoo%'); true - > SELECT 'foo' _FUNC_('%oo%', '%yoo%'); + > SELECT _FUNC_('foo', '%oo%', '%yoo%'); false - > SELECT 'foo' _FUNC_('%yoo%', null); - null + > SELECT _FUNC_('foo', '%yoo%', null); + NULL """, note = """ x NOT LIKE ALL ('A%','%B','%C%') is equivalent to x NOT LIKE 'A%' AND x NOT LIKE '%B' AND x NOT LIKE '%C%'. """, since = "3.1.0") // scalastyle:on line.size.limit -case class NotLikeAll(value: Expression, list: Seq[Expression]) extends LikeAllBase { +case class NotLikeAll(override val children: Seq[Expression]) extends LikeAllBase { override def isNot: Boolean = true } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 0e0851eceea81..b6d2aa7fa7a41 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -1412,8 +1412,8 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging throw new ParseException("Expected something between '(' and ')'.", ctx) } ctx.NOT match { - case null => LikeAll(e, ctx.expression.asScala.map(expression)) - case _ => NotLikeAll(e, ctx.expression.asScala.map(expression)) + case null => LikeAll(e +: ctx.expression.asScala.map(expression)) + case _ => NotLikeAll(e +: ctx.expression.asScala.map(expression)) } case _ => val escapeChar = Option(ctx.escapeChar).map(string).map { str => diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md index 473204c182a69..39807336338fa 100644 --- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md +++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md @@ -1,6 +1,6 @@ ## Summary - - Number of queries: 340 + - Number of queries: 342 - Number of expressions that missing example: 13 - Expressions missing examples: bigint,binary,boolean,date,decimal,double,float,int,smallint,string,timestamp,tinyint,window ## Schema of Built-in Functions @@ -158,6 +158,7 @@ | org.apache.spark.sql.catalyst.expressions.LessThanOrEqual | <= | SELECT 2 <= 2 | struct<(2 <= 2):boolean> | | org.apache.spark.sql.catalyst.expressions.Levenshtein | levenshtein | SELECT levenshtein('kitten', 'sitting') | struct | | org.apache.spark.sql.catalyst.expressions.Like | like | SELECT like('Spark', '_park') | struct | +| org.apache.spark.sql.catalyst.expressions.LikeAll | like_all | SELECT like_all('foo', '%foo%', '%oo') | struct | | org.apache.spark.sql.catalyst.expressions.Log | ln | SELECT ln(1) | struct | | org.apache.spark.sql.catalyst.expressions.Log10 | log10 | SELECT log10(10) | struct | | org.apache.spark.sql.catalyst.expressions.Log1p | log1p | SELECT log1p(0) | struct | @@ -190,6 +191,7 @@ | org.apache.spark.sql.catalyst.expressions.NextDay | next_day | SELECT next_day('2015-01-14', 'TU') | struct | | org.apache.spark.sql.catalyst.expressions.Not | ! | SELECT ! true | struct<(NOT true):boolean> | | org.apache.spark.sql.catalyst.expressions.Not | not | SELECT not true | struct<(NOT true):boolean> | +| org.apache.spark.sql.catalyst.expressions.NotLikeAll | not_like_all | SELECT not_like_all('foo', 'tee', '%yoo%') | struct | | org.apache.spark.sql.catalyst.expressions.Now | now | SELECT now() | struct | | org.apache.spark.sql.catalyst.expressions.NthValue | nth_value | SELECT a, b, nth_value(b, 2) OVER (PARTITION BY a ORDER BY b) FROM VALUES ('A1', 2), ('A1', 1), ('A2', 3), ('A1', 1) tab(a, b) | struct | | org.apache.spark.sql.catalyst.expressions.NullIf | nullif | SELECT nullif(2, 2) | struct | From a7cd416f40308cfc841fb0c7210728e69ba4ac1e Mon Sep 17 00:00:00 2001 From: gengjiaan Date: Mon, 12 Oct 2020 11:40:43 +0800 Subject: [PATCH 04/35] Optimize code --- .../apache/spark/sql/catalyst/parser/AstBuilder.scala | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 5f4f205c6e82b..b6db0ed9142e7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -1410,10 +1410,13 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging case Some(SqlBaseParser.ALL) => if (ctx.expression.isEmpty) { throw new ParseException("Expected something between '(' and ')'.", ctx) + } else if (ctx.expression.size <= 14378) { + getLikeQuantifierExprs(ctx.expression).reduceLeft(And) + } else { + ctx.NOT match { + case null => LikeAll(e +: ctx.expression.asScala.map(expression)) + case _ => NotLikeAll(e +: ctx.expression.asScala.map(expression)) } - ctx.NOT match { - case null => LikeAll(e +: ctx.expression.asScala.map(expression)) - case _ => NotLikeAll(e +: ctx.expression.asScala.map(expression)) } case _ => val escapeChar = Option(ctx.escapeChar).map(string).map { str => From 0aa4e183ca1b8a50ed274f7326eca3c512faa1a1 Mon Sep 17 00:00:00 2001 From: gengjiaan Date: Mon, 12 Oct 2020 11:49:13 +0800 Subject: [PATCH 05/35] Optimize code --- .../org/apache/spark/sql/catalyst/parser/AstBuilder.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index b6db0ed9142e7..ca2f7254ae2ec 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -1410,9 +1410,11 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging case Some(SqlBaseParser.ALL) => if (ctx.expression.isEmpty) { throw new ParseException("Expected something between '(' and ')'.", ctx) - } else if (ctx.expression.size <= 14378) { + } else if (ctx.expression.size < 14379) { getLikeQuantifierExprs(ctx.expression).reduceLeft(And) } else { + // If there are many pattern expressions(more than 14378 elements), will throw + // StackOverflowError. So we use LikeAll or NotLikeAll instead. ctx.NOT match { case null => LikeAll(e +: ctx.expression.asScala.map(expression)) case _ => NotLikeAll(e +: ctx.expression.asScala.map(expression)) From 3e41cffb800e8e3f5a485021706f38a4fc73e07c Mon Sep 17 00:00:00 2001 From: gengjiaan Date: Mon, 12 Oct 2020 12:03:13 +0800 Subject: [PATCH 06/35] Add test cases. --- .../expressions/regexpExpressions.scala | 6 +-- .../sql-tests/inputs/regexp-functions.sql | 10 ++++ .../results/regexp-functions.sql.out | 50 ++++++++++++++++++- 3 files changed, 61 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index ef9aa2ae901b7..556bb4a366bb8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -189,8 +189,6 @@ abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with N override def dataType: DataType = BooleanType - override def children: Seq[Expression] = value +: list - override def foldable: Boolean = value.foldable && list.forall(_.foldable) override def nullable: Boolean = true @@ -326,7 +324,7 @@ abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with N """, since = "3.1.0") // scalastyle:on line.size.limit -case class LikeAll(override val children: Seq[Expression]) extends LikeAllBase { +case class LikeAll(children: Seq[Expression]) extends LikeAllBase { override def isNot: Boolean = false } @@ -366,7 +364,7 @@ case class LikeAll(override val children: Seq[Expression]) extends LikeAllBase { """, since = "3.1.0") // scalastyle:on line.size.limit -case class NotLikeAll(override val children: Seq[Expression]) extends LikeAllBase { +case class NotLikeAll(children: Seq[Expression]) extends LikeAllBase { override def isNot: Boolean = true } diff --git a/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql index 7128dee0a00d7..856c8491b8c56 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql @@ -31,3 +31,13 @@ SELECT regexp_extract_all('1a 2b 14m', '(\\d+)([a-z]+)', 3); SELECT regexp_extract_all('1a 2b 14m', '(\\d+)([a-z]+)', -1); SELECT regexp_extract_all('1a 2b 14m', '(\\d+)?([a-z]+)', 1); SELECT regexp_extract_all('a 2b 14m', '(\\d+)?([a-z]+)', 1); + +-- like_all +SELECT like_all('foo', '%foo%', '%oo'); +SELECT like_all('foo', '%foo%', '%bar%'); +SELECT like_all('foo', '%foo%', null); + +-- not_like_all +SELECT not_like_all('foo', 'tee', '%yoo%'); +SELECT not_like_all('foo', '%oo%', '%yoo%'); +SELECT not_like_all('foo', '%yoo%', null); \ No newline at end of file diff --git a/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out index 2eef926f63e37..2c78a0ab9269c 100644 --- a/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 30 +-- Number of queries: 36 -- !query @@ -252,3 +252,51 @@ SELECT regexp_extract_all('a 2b 14m', '(\\d+)?([a-z]+)', 1) struct> -- !query output ["","2","14"] + + +-- !query +SELECT like_all('foo', '%foo%', '%oo') +-- !query schema +struct +-- !query output +true + + +-- !query +SELECT like_all('foo', '%foo%', '%bar%') +-- !query schema +struct +-- !query output +false + + +-- !query +SELECT like_all('foo', '%foo%', null) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT not_like_all('foo', 'tee', '%yoo%') +-- !query schema +struct +-- !query output +true + + +-- !query +SELECT not_like_all('foo', '%oo%', '%yoo%') +-- !query schema +struct +-- !query output +false + + +-- !query +SELECT not_like_all('foo', '%yoo%', null) +-- !query schema +struct +-- !query output +NULL \ No newline at end of file From 70b0843819e9eb18df513ead992747c24024b1c7 Mon Sep 17 00:00:00 2001 From: gengjiaan Date: Tue, 13 Oct 2020 18:55:42 +0800 Subject: [PATCH 07/35] Adjust the value --- .../org/apache/spark/sql/catalyst/parser/AstBuilder.scala | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index ca2f7254ae2ec..f9d16a960dd26 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -1410,11 +1410,12 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging case Some(SqlBaseParser.ALL) => if (ctx.expression.isEmpty) { throw new ParseException("Expected something between '(' and ')'.", ctx) - } else if (ctx.expression.size < 14379) { + } else if (ctx.expression.size < 500) { + // An empirical value that will not cause StackOverflowError is used here getLikeQuantifierExprs(ctx.expression).reduceLeft(And) } else { - // If there are many pattern expressions(more than 14378 elements), will throw - // StackOverflowError. So we use LikeAll or NotLikeAll instead. + // If there are many pattern expressions, will throw StackOverflowError. + // So we use LikeAll or NotLikeAll instead. ctx.NOT match { case null => LikeAll(e +: ctx.expression.asScala.map(expression)) case _ => NotLikeAll(e +: ctx.expression.asScala.map(expression)) From d841b54007d36963ede98a3745d4dd69c8f65c3e Mon Sep 17 00:00:00 2001 From: beliefer Date: Wed, 14 Oct 2020 19:54:32 +0800 Subject: [PATCH 08/35] Delete like_all and not_like_all --- .../catalyst/analysis/FunctionRegistry.scala | 2 - .../expressions/regexpExpressions.scala | 2 - .../sql-functions/sql-expression-schema.md | 6 +-- .../sql-tests/inputs/regexp-functions.sql | 10 ---- .../results/regexp-functions.sql.out | 50 +------------------ 5 files changed, 3 insertions(+), 67 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index 9457a336eb86b..508239077a70e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -344,8 +344,6 @@ object FunctionRegistry { expression[Length]("length"), expression[Levenshtein]("levenshtein"), expression[Like]("like"), - expression[LikeAll]("like_all"), - expression[NotLikeAll]("not_like_all"), expression[Lower]("lower"), expression[OctetLength]("octet_length"), expression[StringLocate]("locate"), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 556bb4a366bb8..50ca5f805a1b8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -232,8 +232,6 @@ abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with N val hasNull = ctx.freshName("hasNull") val matched = ctx.freshName("matched") val valueArg = ctx.freshName("valueArg") - // All the blocks are meant to be inside a do { ... } while (false); loop. - // The evaluation of variables can be stopped when we find a matching value. val listCode = listGen.map(x => s""" |${x.code} diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md index 810b52f36f773..cd8e03b947cdc 100644 --- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md +++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md @@ -1,6 +1,6 @@ ## Summary - - Number of queries: 343 + - Number of queries: 341 - Number of expressions that missing example: 13 - Expressions missing examples: bigint,binary,boolean,date,decimal,double,float,int,smallint,string,timestamp,tinyint,window ## Schema of Built-in Functions @@ -158,7 +158,6 @@ | org.apache.spark.sql.catalyst.expressions.LessThanOrEqual | <= | SELECT 2 <= 2 | struct<(2 <= 2):boolean> | | org.apache.spark.sql.catalyst.expressions.Levenshtein | levenshtein | SELECT levenshtein('kitten', 'sitting') | struct | | org.apache.spark.sql.catalyst.expressions.Like | like | SELECT like('Spark', '_park') | struct | -| org.apache.spark.sql.catalyst.expressions.LikeAll | like_all | SELECT like_all('foo', '%foo%', '%oo') | struct | | org.apache.spark.sql.catalyst.expressions.Log | ln | SELECT ln(1) | struct | | org.apache.spark.sql.catalyst.expressions.Log10 | log10 | SELECT log10(10) | struct | | org.apache.spark.sql.catalyst.expressions.Log1p | log1p | SELECT log1p(0) | struct | @@ -191,7 +190,6 @@ | org.apache.spark.sql.catalyst.expressions.NextDay | next_day | SELECT next_day('2015-01-14', 'TU') | struct | | org.apache.spark.sql.catalyst.expressions.Not | ! | SELECT ! true | struct<(NOT true):boolean> | | org.apache.spark.sql.catalyst.expressions.Not | not | SELECT not true | struct<(NOT true):boolean> | -| org.apache.spark.sql.catalyst.expressions.NotLikeAll | not_like_all | SELECT not_like_all('foo', 'tee', '%yoo%') | struct | | org.apache.spark.sql.catalyst.expressions.Now | now | SELECT now() | struct | | org.apache.spark.sql.catalyst.expressions.NthValue | nth_value | SELECT a, b, nth_value(b, 2) OVER (PARTITION BY a ORDER BY b) FROM VALUES ('A1', 2), ('A1', 1), ('A2', 3), ('A1', 1) tab(a, b) | struct | | org.apache.spark.sql.catalyst.expressions.NullIf | nullif | SELECT nullif(2, 2) | struct | @@ -348,4 +346,4 @@ | org.apache.spark.sql.catalyst.expressions.xml.XPathList | xpath | SELECT xpath('b1b2b3c1c2','a/b/text()') | structb1b2b3c1c2, a/b/text()):array> | | org.apache.spark.sql.catalyst.expressions.xml.XPathLong | xpath_long | SELECT xpath_long('12', 'sum(a/b)') | struct12, sum(a/b)):bigint> | | org.apache.spark.sql.catalyst.expressions.xml.XPathShort | xpath_short | SELECT xpath_short('12', 'sum(a/b)') | struct12, sum(a/b)):smallint> | -| org.apache.spark.sql.catalyst.expressions.xml.XPathString | xpath_string | SELECT xpath_string('bcc','a/c') | structbcc, a/c):string> | \ No newline at end of file +| org.apache.spark.sql.catalyst.expressions.xml.XPathString | xpath_string | SELECT xpath_string('bcc','a/c') | structbcc, a/c):string> | diff --git a/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql index 856c8491b8c56..7128dee0a00d7 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql @@ -31,13 +31,3 @@ SELECT regexp_extract_all('1a 2b 14m', '(\\d+)([a-z]+)', 3); SELECT regexp_extract_all('1a 2b 14m', '(\\d+)([a-z]+)', -1); SELECT regexp_extract_all('1a 2b 14m', '(\\d+)?([a-z]+)', 1); SELECT regexp_extract_all('a 2b 14m', '(\\d+)?([a-z]+)', 1); - --- like_all -SELECT like_all('foo', '%foo%', '%oo'); -SELECT like_all('foo', '%foo%', '%bar%'); -SELECT like_all('foo', '%foo%', null); - --- not_like_all -SELECT not_like_all('foo', 'tee', '%yoo%'); -SELECT not_like_all('foo', '%oo%', '%yoo%'); -SELECT not_like_all('foo', '%yoo%', null); \ No newline at end of file diff --git a/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out index 2c78a0ab9269c..2eef926f63e37 100644 --- a/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 36 +-- Number of queries: 30 -- !query @@ -252,51 +252,3 @@ SELECT regexp_extract_all('a 2b 14m', '(\\d+)?([a-z]+)', 1) struct> -- !query output ["","2","14"] - - --- !query -SELECT like_all('foo', '%foo%', '%oo') --- !query schema -struct --- !query output -true - - --- !query -SELECT like_all('foo', '%foo%', '%bar%') --- !query schema -struct --- !query output -false - - --- !query -SELECT like_all('foo', '%foo%', null) --- !query schema -struct --- !query output -NULL - - --- !query -SELECT not_like_all('foo', 'tee', '%yoo%') --- !query schema -struct --- !query output -true - - --- !query -SELECT not_like_all('foo', '%oo%', '%yoo%') --- !query schema -struct --- !query output -false - - --- !query -SELECT not_like_all('foo', '%yoo%', null) --- !query schema -struct --- !query output -NULL \ No newline at end of file From 369959f6c627004c99206fc6c9e252c9676b82a7 Mon Sep 17 00:00:00 2001 From: beliefer Date: Thu, 15 Oct 2020 08:39:49 +0800 Subject: [PATCH 09/35] Optimize code --- .../expressions/regexpExpressions.scala | 80 +------------------ 1 file changed, 4 insertions(+), 76 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 50ca5f805a1b8..bed6bdff1a933 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -189,7 +189,7 @@ abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with N override def dataType: DataType = BooleanType - override def foldable: Boolean = value.foldable && list.forall(_.foldable) + override def foldable: Boolean = children.forall(_.foldable) override def nullable: Boolean = true @@ -207,7 +207,7 @@ abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with N if (str == null) { return null } - val regex = Pattern.compile(escape(str.asInstanceOf[UTF8String].toString)) + val regex = Pattern.compile(StringUtils.escapeLikeRegex(str.asInstanceOf[UTF8String].toString, '\\')) if(regex == null) { return null } else if (isNot && matches(regex, evaluatedValue.asInstanceOf[UTF8String].toString)) { @@ -280,88 +280,16 @@ abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with N | $javaDataType $valueArg = ${valueGen.value}; | $codes |} - |final boolean ${ev.isNull} = ($hasNull == true); - |final boolean ${ev.value} = ($matched == true); + |final boolean ${ev.isNull} = $hasNull; + |final boolean ${ev.value} = $matched; """.stripMargin) } } -// scalastyle:off line.size.limit -@ExpressionDescription( - usage = "_FUNC_(str, pattern1, pattern2, ...) - Returns true if `str` matches all the pattern string, " + - "null if any arguments are null, false otherwise.", - arguments = """ - Arguments: - * str - a string expression - * patterns - a list of string expression. Each pattern is a string which is matched literally, with - exception to the following special symbols: - - _ matches any one character in the input (similar to . in posix regular expressions) - - % matches zero or more characters in the input (similar to .* in posix regular - expressions) - - Since Spark 2.0, string literals are unescaped in our SQL parser. For example, in order - to match "\abc", the pattern should be "\\abc". - - When SQL config 'spark.sql.parser.escapedStringLiterals' is enabled, it fallbacks - to Spark 1.6 behavior regarding string literal parsing. For example, if the config is - enabled, the pattern to match "\abc" should be "\abc". - """, - examples = """ - Examples: - > SELECT _FUNC_('foo', '%foo%', '%oo'); - true - > SELECT _FUNC_('foo', '%foo%', '%bar%'); - false - > SELECT _FUNC_('foo', '%foo%', null); - NULL - """, - note = """ - x LIKE ALL ('A%','%B','%C%') is equivalent to x LIKE 'A%' AND x LIKE '%B' AND x LIKE '%C%'. - """, - since = "3.1.0") -// scalastyle:on line.size.limit case class LikeAll(children: Seq[Expression]) extends LikeAllBase { override def isNot: Boolean = false } -// scalastyle:off line.size.limit -@ExpressionDescription( - usage = "_FUNC_(str, pattern1, pattern2, ...) - Returns true if `str` not matches all the pattern string" + - ", null if any arguments are null, false otherwise.", - arguments = """ - Arguments: - * str - a string expression - * patterns - a list of string expression. Each pattern is a string which is matched literally, with - exception to the following special symbols: - - _ matches any one character in the input (similar to . in posix regular expressions) - - % matches zero or more characters in the input (similar to .* in posix regular - expressions) - - Since Spark 2.0, string literals are unescaped in our SQL parser. For example, in order - to match "\abc", the pattern should be "\\abc". - - When SQL config 'spark.sql.parser.escapedStringLiterals' is enabled, it fallbacks - to Spark 1.6 behavior regarding string literal parsing. For example, if the config is - enabled, the pattern to match "\abc" should be "\abc". - """, - examples = """ - Examples: - > SELECT _FUNC_('foo', 'tee', '%yoo%'); - true - > SELECT _FUNC_('foo', '%oo%', '%yoo%'); - false - > SELECT _FUNC_('foo', '%yoo%', null); - NULL - """, - note = """ - x NOT LIKE ALL ('A%','%B','%C%') is equivalent to x NOT LIKE 'A%' AND x NOT LIKE '%B' AND x NOT LIKE '%C%'. - """, - since = "3.1.0") -// scalastyle:on line.size.limit case class NotLikeAll(children: Seq[Expression]) extends LikeAllBase { override def isNot: Boolean = true } From 1f1f42c470142f615f3bd2f17d6639e0d6459493 Mon Sep 17 00:00:00 2001 From: beliefer Date: Thu, 15 Oct 2020 08:48:06 +0800 Subject: [PATCH 10/35] Optimize code --- .../spark/sql/catalyst/expressions/regexpExpressions.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index bed6bdff1a933..d1929ccffe13d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -183,8 +183,7 @@ abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with N def isNot: Boolean override def inputTypes: Seq[AbstractDataType] = { - val arrayOrStr = TypeCollection(ArrayType(StringType), StringType) - StringType +: Seq.fill(children.size - 1)(arrayOrStr) + StringType +: Seq.fill(children.size - 1)(StringType) } override def dataType: DataType = BooleanType From de658290b417645d4dd8b91bc1f2febb747e1f3b Mon Sep 17 00:00:00 2001 From: beliefer Date: Thu, 15 Oct 2020 08:51:25 +0800 Subject: [PATCH 11/35] Optimize code --- .../spark/sql/catalyst/expressions/regexpExpressions.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index d1929ccffe13d..a16a930395712 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -206,7 +206,8 @@ abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with N if (str == null) { return null } - val regex = Pattern.compile(StringUtils.escapeLikeRegex(str.asInstanceOf[UTF8String].toString, '\\')) + val regex = + Pattern.compile(StringUtils.escapeLikeRegex(str.asInstanceOf[UTF8String].toString, '\\')) if(regex == null) { return null } else if (isNot && matches(regex, evaluatedValue.asInstanceOf[UTF8String].toString)) { From 1754f0d3e234afbd69d408a27a2ca9dea11b4ba1 Mon Sep 17 00:00:00 2001 From: beliefer Date: Thu, 15 Oct 2020 09:14:03 +0800 Subject: [PATCH 12/35] Optimize code --- .../spark/sql/catalyst/expressions/regexpExpressions.scala | 2 -- 1 file changed, 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index a16a930395712..b8ee4b4ae782d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -192,8 +192,6 @@ abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with N override def nullable: Boolean = true - def escape(v: String): String = StringUtils.escapeLikeRegex(v, '\\') - def matches(regex: Pattern, str: String): Boolean = regex.matcher(str).matches() override def eval(input: InternalRow): Any = { From 60f01f4edfbd112ea085e118c6a50f024c8c4dff Mon Sep 17 00:00:00 2001 From: beliefer Date: Thu, 15 Oct 2020 13:44:59 +0800 Subject: [PATCH 13/35] Keep eval and codegen consistent --- .../expressions/regexpExpressions.scala | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index b8ee4b4ae782d..2acc9a7e0d543 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -199,22 +199,26 @@ abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with N if (evaluatedValue == null) { null } else { + var hasNull = false + var match = true list.foreach { e => val str = e.eval(input) if (str == null) { - return null + hasNull = true } val regex = Pattern.compile(StringUtils.escapeLikeRegex(str.asInstanceOf[UTF8String].toString, '\\')) - if(regex == null) { - return null - } else if (isNot && matches(regex, evaluatedValue.asInstanceOf[UTF8String].toString)) { - return false + if (isNot && matches(regex, evaluatedValue.asInstanceOf[UTF8String].toString)) { + match = false } else if (!isNot && !matches(regex, evaluatedValue.asInstanceOf[UTF8String].toString)) { - return false + match = false } } - return true + if (hasNull) { + return null + } else { + return match + } } } From c32f89b8ba34b7b689ea5d2712f55824c99ba6f0 Mon Sep 17 00:00:00 2001 From: beliefer Date: Thu, 15 Oct 2020 13:48:24 +0800 Subject: [PATCH 14/35] Keep eval and codegen consistent --- .../spark/sql/catalyst/expressions/regexpExpressions.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 2acc9a7e0d543..46922c40e77cf 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -215,9 +215,9 @@ abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with N } } if (hasNull) { - return null + null } else { - return match + match } } } From ec53b836c41cbe53fd4e7afc42f020f6107513d7 Mon Sep 17 00:00:00 2001 From: beliefer Date: Thu, 15 Oct 2020 13:56:47 +0800 Subject: [PATCH 15/35] Keep eval and codegen consistent --- .../expressions/regexpExpressions.scala | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 46922c40e77cf..6e140f718b050 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -205,20 +205,20 @@ abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with N val str = e.eval(input) if (str == null) { hasNull = true + } else { + val regex = + Pattern.compile(StringUtils.escapeLikeRegex(str.asInstanceOf[UTF8String].toString, '\\')) + if ((isNot && matches(regex, evaluatedValue.asInstanceOf[UTF8String].toString)) || + !(isNot || matches(regex, evaluatedValue.asInstanceOf[UTF8String].toString)) { + match = false + } } - val regex = - Pattern.compile(StringUtils.escapeLikeRegex(str.asInstanceOf[UTF8String].toString, '\\')) - if (isNot && matches(regex, evaluatedValue.asInstanceOf[UTF8String].toString)) { - match = false - } else if (!isNot && !matches(regex, evaluatedValue.asInstanceOf[UTF8String].toString)) { - match = false + if (hasNull) { + null + } else { + match } } - if (hasNull) { - null - } else { - match - } } } From c52d004060bb35205de71a00c5cd26de901d3641 Mon Sep 17 00:00:00 2001 From: beliefer Date: Thu, 15 Oct 2020 14:06:44 +0800 Subject: [PATCH 16/35] Keep eval and codegen consistent --- .../sql/catalyst/expressions/regexpExpressions.scala | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 6e140f718b050..82a1e6089ecc3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -213,11 +213,11 @@ abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with N match = false } } - if (hasNull) { - null - } else { - match - } + } + if (hasNull) { + null + } else { + match } } } From b770f929594dd551a544fb6b0e5f9d4f2ddff7d4 Mon Sep 17 00:00:00 2001 From: beliefer Date: Thu, 15 Oct 2020 14:42:52 +0800 Subject: [PATCH 17/35] Keep eval and codegen consistent --- .../catalyst/expressions/regexpExpressions.scala | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 82a1e6089ecc3..d9288203c0bb0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -180,7 +180,7 @@ case class Like(left: Expression, right: Expression, escapeChar: Char) abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with NullIntolerant { def value: Expression = children.head def list: Seq[Expression] = children.tail - def isNot: Boolean + protected def isNot: Boolean override def inputTypes: Seq[AbstractDataType] = { StringType +: Seq.fill(children.size - 1)(StringType) @@ -192,7 +192,7 @@ abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with N override def nullable: Boolean = true - def matches(regex: Pattern, str: String): Boolean = regex.matcher(str).matches() + private def matches(regex: Pattern, str: String): Boolean = regex.matcher(str).matches() override def eval(input: InternalRow): Any = { val evaluatedValue = value.eval(input) @@ -200,24 +200,24 @@ abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with N null } else { var hasNull = false - var match = true - list.foreach { e => + var matched = true + list.foreach { e if !hasNull && matched => val str = e.eval(input) if (str == null) { hasNull = true } else { val regex = Pattern.compile(StringUtils.escapeLikeRegex(str.asInstanceOf[UTF8String].toString, '\\')) - if ((isNot && matches(regex, evaluatedValue.asInstanceOf[UTF8String].toString)) || - !(isNot || matches(regex, evaluatedValue.asInstanceOf[UTF8String].toString)) { - match = false + val matches = matches(regex, evaluatedValue.asInstanceOf[UTF8String].toString) + if ((isNot && matches) || !(isNot || matches)) { + matched = false } } } if (hasNull) { null } else { - match + matched } } } From be5eb8a1f092e15c941d39d517284aed67de72c9 Mon Sep 17 00:00:00 2001 From: gengjiaan Date: Fri, 16 Oct 2020 10:58:37 +0800 Subject: [PATCH 18/35] Optimize code --- .../expressions/regexpExpressions.scala | 44 ++++++++++--------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index d9288203c0bb0..96abfe26819aa 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -200,24 +200,26 @@ abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with N null } else { var hasNull = false - var matched = true - list.foreach { e if !hasNull && matched => - val str = e.eval(input) - if (str == null) { - hasNull = true - } else { - val regex = - Pattern.compile(StringUtils.escapeLikeRegex(str.asInstanceOf[UTF8String].toString, '\\')) - val matches = matches(regex, evaluatedValue.asInstanceOf[UTF8String].toString) - if ((isNot && matches) || !(isNot || matches)) { - matched = false + var allMatched = true + list.foreach { e => + if (!hasNull && allMatched) { + val str = e.eval(input) + if (str == null) { + hasNull = true + } else { + val regex = Pattern.compile( + StringUtils.escapeLikeRegex(str.asInstanceOf[UTF8String].toString, '\\')) + val matched = matches(regex, evaluatedValue.asInstanceOf[UTF8String].toString) + if ((isNot && matched) || !(isNot || matched)) { + allMatched = false + } } } } if (hasNull) { null } else { - matched + allMatched } } } @@ -232,21 +234,21 @@ abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with N val rightStr = ctx.freshName("rightStr") val escapedEscapeChar = StringEscapeUtils.escapeJava("\\") val hasNull = ctx.freshName("hasNull") - val matched = ctx.freshName("matched") + val allMatched = ctx.freshName("allMatched") val valueArg = ctx.freshName("valueArg") val listCode = listGen.map(x => s""" |${x.code} |if (${x.isNull}) { | $hasNull = true; // ${ev.isNull} = true; - |} else if (!$hasNull && $matched) { + |} else if (!$hasNull && $allMatched) { | String $rightStr = ${x.value}.toString(); | $patternClass $pattern = | $patternClass.compile($escapeFunc($rightStr, '$escapedEscapeChar')); | if ($isNot && $pattern.matcher($valueArg.toString()).matches()) { - | $matched = false; + | $allMatched = false; | } else if (!$isNot && !$pattern.matcher($valueArg.toString()).matches()) { - | $matched = false; + | $allMatched = false; | } |} """.stripMargin) @@ -256,17 +258,17 @@ abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with N expressions = listCode, funcName = "likeAll", extraArguments = (javaDataType, valueArg) :: (CodeGenerator.JAVA_BOOLEAN, hasNull) :: - (resultType, matched) :: Nil, + (resultType, allMatched) :: Nil, returnType = resultType, makeSplitFunction = body => s""" - |if (!$hasNull && $matched) { + |if (!$hasNull && $allMatched) { | $body; |} """.stripMargin, foldFunctions = _.map { funcCall => s""" - |if (!$hasNull && $matched) { + |if (!$hasNull && $allMatched) { | $funcCall; |} """.stripMargin @@ -275,7 +277,7 @@ abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with N code""" |${valueGen.code} |boolean $hasNull = false; - |boolean $matched = true; + |boolean $allMatched = true; |if (${valueGen.isNull}) { | $hasNull = true; |} else { @@ -283,7 +285,7 @@ abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with N | $codes |} |final boolean ${ev.isNull} = $hasNull; - |final boolean ${ev.value} = $matched; + |final boolean ${ev.value} = $allMatched; """.stripMargin) } } From fcab4e3dece69dbff9279918ab5cd18130a0d339 Mon Sep 17 00:00:00 2001 From: gengjiaan Date: Fri, 16 Oct 2020 15:51:23 +0800 Subject: [PATCH 19/35] Cache foldable pattern and avoid re-evaluate --- .../expressions/regexpExpressions.scala | 33 +++++++++++++++---- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 96abfe26819aa..b8a64a5490f91 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -192,6 +192,8 @@ abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with N override def nullable: Boolean = true + private def escape(v: String): String = StringUtils.escapeLikeRegex(v, '\\') + private def matches(regex: Pattern, str: String): Boolean = regex.matcher(str).matches() override def eval(input: InternalRow): Any = { @@ -207,8 +209,7 @@ abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with N if (str == null) { hasNull = true } else { - val regex = Pattern.compile( - StringUtils.escapeLikeRegex(str.asInstanceOf[UTF8String].toString, '\\')) + val regex = Pattern.compile(escape(str.asInstanceOf[UTF8String].toString)) val matched = matches(regex, evaluatedValue.asInstanceOf[UTF8String].toString) if ((isNot && matched) || !(isNot || matched)) { allMatched = false @@ -236,22 +237,38 @@ abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with N val hasNull = ctx.freshName("hasNull") val allMatched = ctx.freshName("allMatched") val valueArg = ctx.freshName("valueArg") - val listCode = listGen.map(x => + val patternCache = ctx.freshName("patternCache") + // If some pattern expression is foldable, we don't want to re-evaluate the pattern again. + val cacheCode = list.zipWithIndex.collect { case (x, i) if x.foldable => + val xVal = x.eval() + if (xVal != null) { + val regexStr = + StringEscapeUtils.escapeJava(escape(xVal.asInstanceOf[UTF8String].toString())) + s"""$patternCache[$i] = $patternClass.compile("$regexStr");""" + } else { + s"""$patternCache[$i] = null;""" + } + }.mkString("\n") + + val listCode = listGen.zipWithIndex.map { case (x, i) => s""" |${x.code} |if (${x.isNull}) { | $hasNull = true; // ${ev.isNull} = true; |} else if (!$hasNull && $allMatched) { - | String $rightStr = ${x.value}.toString(); - | $patternClass $pattern = - | $patternClass.compile($escapeFunc($rightStr, '$escapedEscapeChar')); + | $patternClass $pattern = $patternCache[$i]; + | if ($pattern == null) { + | String $rightStr = ${x.value}.toString(); + | $pattern = $patternClass.compile($escapeFunc($rightStr, '$escapedEscapeChar')); + | } | if ($isNot && $pattern.matcher($valueArg.toString()).matches()) { | $allMatched = false; | } else if (!$isNot && !$pattern.matcher($valueArg.toString()).matches()) { | $allMatched = false; | } |} - """.stripMargin) + """.stripMargin + } val resultType = CodeGenerator.javaType(dataType) val codes = ctx.splitExpressionsWithCurrentInputs( @@ -276,6 +293,8 @@ abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with N ev.copy(code = code""" |${valueGen.code} + |$patternClass[] $patternCache = new $patternClass[${list.length}]; + |$cacheCode |boolean $hasNull = false; |boolean $allMatched = true; |if (${valueGen.isNull}) { From f657ff0372f1cac48ea008a08c1cc7011f934d98 Mon Sep 17 00:00:00 2001 From: gengjiaan Date: Fri, 16 Oct 2020 15:56:15 +0800 Subject: [PATCH 20/35] Cache foldable pattern and avoid re-evaluate --- .../spark/sql/catalyst/expressions/regexpExpressions.scala | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index b8a64a5490f91..32bc323953dbc 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -238,13 +238,12 @@ abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with N val allMatched = ctx.freshName("allMatched") val valueArg = ctx.freshName("valueArg") val patternCache = ctx.freshName("patternCache") - // If some pattern expression is foldable, we don't want to re-evaluate the pattern again. + // If some regex expression is foldable, we don't want to re-evaluate the pattern again. val cacheCode = list.zipWithIndex.collect { case (x, i) if x.foldable => val xVal = x.eval() if (xVal != null) { - val regexStr = - StringEscapeUtils.escapeJava(escape(xVal.asInstanceOf[UTF8String].toString())) - s"""$patternCache[$i] = $patternClass.compile("$regexStr");""" + val regex = StringEscapeUtils.escapeJava(escape(xVal.asInstanceOf[UTF8String].toString())) + s"""$patternCache[$i] = $patternClass.compile("$regex");""" } else { s"""$patternCache[$i] = null;""" } From 8df52316a1bb4bbeab427dd165b23addfaa3b859 Mon Sep 17 00:00:00 2001 From: gengjiaan Date: Mon, 19 Oct 2020 16:30:41 +0800 Subject: [PATCH 21/35] Improve performance for codegen. --- .../expressions/regexpExpressions.scala | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 32bc323953dbc..fff9da25cd333 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -239,15 +239,15 @@ abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with N val valueArg = ctx.freshName("valueArg") val patternCache = ctx.freshName("patternCache") // If some regex expression is foldable, we don't want to re-evaluate the pattern again. - val cacheCode = list.zipWithIndex.collect { case (x, i) if x.foldable => - val xVal = x.eval() - if (xVal != null) { - val regex = StringEscapeUtils.escapeJava(escape(xVal.asInstanceOf[UTF8String].toString())) + val evalList = list.filter(_.foldable).map(_.eval()) + val cacheCode = if (evalList.exists(_ == null)) { + s"$hasNull = true;" + } else { + evalList.zipWithIndex.collect { case (x, i) => + val regex = StringEscapeUtils.escapeJava(escape(x.asInstanceOf[UTF8String].toString())) s"""$patternCache[$i] = $patternClass.compile("$regex");""" - } else { - s"""$patternCache[$i] = null;""" - } - }.mkString("\n") + }.mkString("\n") + } val listCode = listGen.zipWithIndex.map { case (x, i) => s""" @@ -293,9 +293,9 @@ abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with N code""" |${valueGen.code} |$patternClass[] $patternCache = new $patternClass[${list.length}]; - |$cacheCode |boolean $hasNull = false; |boolean $allMatched = true; + |$cacheCode |if (${valueGen.isNull}) { | $hasNull = true; |} else { From ad4d2d9cde81beff27c9eaadae77a132d59599cc Mon Sep 17 00:00:00 2001 From: gengjiaan Date: Mon, 19 Oct 2020 17:45:21 +0800 Subject: [PATCH 22/35] Fix bug --- .../sql/catalyst/expressions/regexpExpressions.scala | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index fff9da25cd333..f06cca142ab48 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -239,11 +239,13 @@ abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with N val valueArg = ctx.freshName("valueArg") val patternCache = ctx.freshName("patternCache") // If some regex expression is foldable, we don't want to re-evaluate the pattern again. - val evalList = list.filter(_.foldable).map(_.eval()) - val cacheCode = if (evalList.exists(_ == null)) { + val evalList = list.zipWithIndex.filter(_._1.foldable).map { case (x, i) => + (x.eval(), i) + } + val cacheCode = if (evalList.exists(_._1 == null)) { s"$hasNull = true;" } else { - evalList.zipWithIndex.collect { case (x, i) => + evalList.collect { case (x, i) => val regex = StringEscapeUtils.escapeJava(escape(x.asInstanceOf[UTF8String].toString())) s"""$patternCache[$i] = $patternClass.compile("$regex");""" }.mkString("\n") From 55465b8fcd5dbde93c23eae99d94fb877e9cb5f3 Mon Sep 17 00:00:00 2001 From: gengjiaan Date: Thu, 22 Oct 2020 12:18:00 +0800 Subject: [PATCH 23/35] iterator all patterns. --- .../expressions/regexpExpressions.scala | 32 +++++++++---------- .../expressions/RegexpExpressionsSuite.scala | 18 +++++++++++ 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index f06cca142ab48..c10b7aa901363 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -177,7 +177,7 @@ case class Like(left: Expression, right: Expression, escapeChar: Char) } } -abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with NullIntolerant { +abstract class LikeAllBase extends Expression with ImplicitCastInputTypes { def value: Expression = children.head def list: Seq[Expression] = children.tail protected def isNot: Boolean @@ -204,7 +204,7 @@ abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with N var hasNull = false var allMatched = true list.foreach { e => - if (!hasNull && allMatched) { + if (allMatched) { val str = e.eval(input) if (str == null) { hasNull = true @@ -217,7 +217,7 @@ abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with N } } } - if (hasNull) { + if (allMatched && hasNull) { null } else { allMatched @@ -239,24 +239,22 @@ abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with N val valueArg = ctx.freshName("valueArg") val patternCache = ctx.freshName("patternCache") // If some regex expression is foldable, we don't want to re-evaluate the pattern again. - val evalList = list.zipWithIndex.filter(_._1.foldable).map { case (x, i) => - (x.eval(), i) - } - val cacheCode = if (evalList.exists(_._1 == null)) { - s"$hasNull = true;" - } else { - evalList.collect { case (x, i) => - val regex = StringEscapeUtils.escapeJava(escape(x.asInstanceOf[UTF8String].toString())) + val cacheCode = list.zipWithIndex.collect { case (x, i) if x.foldable => + val xEval = x.eval() + if (xEval == null) { + s"$patternCache[$i] = null;" + } else { + val regex = StringEscapeUtils.escapeJava(escape(xEval.asInstanceOf[UTF8String].toString())) s"""$patternCache[$i] = $patternClass.compile("$regex");""" - }.mkString("\n") - } + } + }.mkString("\n") val listCode = listGen.zipWithIndex.map { case (x, i) => s""" |${x.code} |if (${x.isNull}) { | $hasNull = true; // ${ev.isNull} = true; - |} else if (!$hasNull && $allMatched) { + |} else if ($allMatched) { | $patternClass $pattern = $patternCache[$i]; | if ($pattern == null) { | String $rightStr = ${x.value}.toString(); @@ -280,13 +278,13 @@ abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with N returnType = resultType, makeSplitFunction = body => s""" - |if (!$hasNull && $allMatched) { + |if ($allMatched) { | $body; |} """.stripMargin, foldFunctions = _.map { funcCall => s""" - |if (!$hasNull && $allMatched) { + |if ($allMatched) { | $funcCall; |} """.stripMargin @@ -304,7 +302,7 @@ abstract class LikeAllBase extends Expression with ImplicitCastInputTypes with N | $javaDataType $valueArg = ${valueGen.value}; | $codes |} - |final boolean ${ev.isNull} = $hasNull; + |final boolean ${ev.isNull} = $allMatched && $hasNull; |final boolean ${ev.value} = $allMatched; """.stripMargin) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala index 3db36c53814d9..7ca1a113fad30 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala @@ -58,15 +58,33 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(Literal.create("foo", StringType).likeAll( Literal.create("%foo%", StringType), Literal.create(null, StringType)), null) + checkEvaluation(Literal.create("foo", StringType).likeAll( + Literal.create(null, StringType), + Literal.create("%foo%", StringType)), null) + checkEvaluation(Literal.create("foo", StringType).likeAll( + Literal.create("%feo%", StringType), + Literal.create(null, StringType)), false) + checkEvaluation(Literal.create("foo", StringType).likeAll( + Literal.create(null, StringType), + Literal.create("%feo%", StringType)), false) checkEvaluation(Literal.create("foo", StringType).notLikeAll( Literal.create("tee", StringType), Literal.create("%yoo%", StringType)), true) checkEvaluation(Literal.create("foo", StringType).notLikeAll( Literal.create("%oo%", StringType), Literal.create("%yoo%", StringType)), false) + checkEvaluation(Literal.create("foo", StringType).notLikeAll( + Literal.create("%foo%", StringType), + Literal.create(null, StringType)), false) + checkEvaluation(Literal.create("foo", StringType).notLikeAll( + Literal.create(null, StringType), + Literal.create("%foo%", StringType)), false) checkEvaluation(Literal.create("foo", StringType).notLikeAll( Literal.create("%yoo%", StringType), Literal.create(null, StringType)), null) + checkEvaluation(Literal.create("foo", StringType).notLikeAll( + Literal.create(null, StringType), + Literal.create("%yoo%", StringType)), null) } test("LIKE Pattern") { From 391ba5db92cd421c091b15991760dac013787f51 Mon Sep 17 00:00:00 2001 From: gengjiaan Date: Fri, 23 Oct 2020 15:39:59 +0800 Subject: [PATCH 24/35] Optimize code --- .../spark/sql/catalyst/expressions/regexpExpressions.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 641bb79cf21eb..7c9bd6245d6db 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -244,12 +244,12 @@ abstract class LikeAllBase extends Expression with ImplicitCastInputTypes { val cacheCode = list.zipWithIndex.collect { case (x, i) if x.foldable => val xEval = x.eval() if (xEval == null) { - s"$patternCache[$i] = null;" + "" } else { val regex = StringEscapeUtils.escapeJava(escape(xEval.asInstanceOf[UTF8String].toString())) s"""$patternCache[$i] = $patternClass.compile("$regex");""" } - }.mkString("\n") + }.filterNot(_.isEmpty).mkString("\n") val listCode = listGen.zipWithIndex.map { case (x, i) => s""" @@ -297,11 +297,11 @@ abstract class LikeAllBase extends Expression with ImplicitCastInputTypes { |$patternClass[] $patternCache = new $patternClass[${list.length}]; |boolean $hasNull = false; |boolean $allMatched = true; - |$cacheCode |if (${valueGen.isNull}) { | $hasNull = true; |} else { | $javaDataType $valueArg = ${valueGen.value}; + | $cacheCode | $codes |} |final boolean ${ev.isNull} = $allMatched && $hasNull; From 7b7120faaa0dcfd5e152cab135d1790a550f5fa9 Mon Sep 17 00:00:00 2001 From: gengjiaan Date: Fri, 23 Oct 2020 15:43:31 +0800 Subject: [PATCH 25/35] Optimize code --- .../catalyst/expressions/regexpExpressions.scala | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 7c9bd6245d6db..5d04cbe911e9a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -242,14 +242,11 @@ abstract class LikeAllBase extends Expression with ImplicitCastInputTypes { val patternCache = ctx.freshName("patternCache") // If some regex expression is foldable, we don't want to re-evaluate the pattern again. val cacheCode = list.zipWithIndex.collect { case (x, i) if x.foldable => - val xEval = x.eval() - if (xEval == null) { - "" - } else { - val regex = StringEscapeUtils.escapeJava(escape(xEval.asInstanceOf[UTF8String].toString())) - s"""$patternCache[$i] = $patternClass.compile("$regex");""" - } - }.filterNot(_.isEmpty).mkString("\n") + (x.eval(), i) + }.filterNot(_._1 == null).map { kv => + val regex = StringEscapeUtils.escapeJava(escape(kv._1.asInstanceOf[UTF8String].toString())) + s"""$patternCache[${kv._2}] = $patternClass.compile("$regex");""" + }.mkString("\n") val listCode = listGen.zipWithIndex.map { case (x, i) => s""" From 1fc5214964a3a522f3cc0a1daf91ced342bb1b51 Mon Sep 17 00:00:00 2001 From: gengjiaan Date: Tue, 10 Nov 2020 18:38:35 +0800 Subject: [PATCH 26/35] Simplify code --- .../spark/sql/catalyst/dsl/package.scala | 4 +- .../expressions/regexpExpressions.scala | 139 +++++------------- .../sql/catalyst/parser/AstBuilder.scala | 19 +-- .../expressions/RegexpExpressionsSuite.scala | 8 +- 4 files changed, 55 insertions(+), 115 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index e233381c43b57..3e6834d89d29d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -102,8 +102,8 @@ package object dsl { def like(other: Expression, escapeChar: Char = '\\'): Expression = Like(expr, other, escapeChar) def rlike(other: Expression): Expression = RLike(expr, other) - def likeAll(others: Expression*): Expression = LikeAll(expr +: others) - def notLikeAll(others: Expression*): Expression = NotLikeAll(expr +: others) + def likeAll(others: Literal*): Expression = LikeAll(expr, others.map(_.eval(EmptyRow))) + def notLikeAll(others: Literal*): Expression = NotLikeAll(expr, others.map(_.eval(EmptyRow))) def contains(other: Expression): Expression = Contains(expr, other) def startsWith(other: Expression): Expression = StartsWith(expr, other) def endsWith(other: Expression): Expression = EndsWith(expr, other) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 5d04cbe911e9a..ef85e0f39d002 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.expressions import java.util.Locale import java.util.regex.{Matcher, MatchResult, Pattern} +import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer import org.apache.commons.text.StringEscapeUtils @@ -179,140 +180,78 @@ case class Like(left: Expression, right: Expression, escapeChar: Char) } } -abstract class LikeAllBase extends Expression with ImplicitCastInputTypes { - def value: Expression = children.head - def list: Seq[Expression] = children.tail - protected def isNot: Boolean +abstract class LikeAllBase extends UnaryExpression with ImplicitCastInputTypes with NullIntolerant { - override def inputTypes: Seq[AbstractDataType] = { - StringType +: Seq.fill(children.size - 1)(StringType) - } + protected def patterns: Seq[Any] - override def dataType: DataType = BooleanType + protected def isNotDefined: Boolean + + override def inputTypes: Seq[DataType] = StringType :: Nil - override def foldable: Boolean = children.forall(_.foldable) + override def dataType: DataType = BooleanType override def nullable: Boolean = true - private def escape(v: String): String = StringUtils.escapeLikeRegex(v, '\\') + private lazy val hasNull: Boolean = patterns.contains(null) - private def matches(regex: Pattern, str: String): Boolean = regex.matcher(str).matches() + private lazy val cache = patterns.filterNot(_ == null) + .map(s => Pattern.compile(StringUtils.escapeLikeRegex(s.toString, '\\'))) override def eval(input: InternalRow): Any = { - val evaluatedValue = value.eval(input) - if (evaluatedValue == null) { + if (hasNull) { null } else { - var hasNull = false - var allMatched = true - list.foreach { e => - if (allMatched) { - val str = e.eval(input) - if (str == null) { - hasNull = true - } else { - val regex = Pattern.compile(escape(str.asInstanceOf[UTF8String].toString)) - val matched = matches(regex, evaluatedValue.asInstanceOf[UTF8String].toString) - if ((isNot && matched) || !(isNot || matched)) { - allMatched = false - } - } - } - } - if (allMatched && hasNull) { - null + val str = child.eval(input).toString + if (isNotDefined) { + !cache.exists(p => p.matcher(str).matches()) } else { - allMatched + cache.forall(p => p.matcher(str).matches()) } } } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + val eval = child.genCode(ctx) val patternClass = classOf[Pattern].getName - val escapeFunc = StringUtils.getClass.getName.stripSuffix("$") + ".escapeLikeRegex" - val javaDataType = CodeGenerator.javaType(value.dataType) - val valueGen = value.genCode(ctx) - val listGen = list.map(_.genCode(ctx)) + val javaDataType = CodeGenerator.javaType(child.dataType) val pattern = ctx.freshName("pattern") - val rightStr = ctx.freshName("rightStr") - val escapedEscapeChar = StringEscapeUtils.escapeJava("\\") - val hasNull = ctx.freshName("hasNull") + val returnNull = ctx.freshName("returnNull") val allMatched = ctx.freshName("allMatched") val valueArg = ctx.freshName("valueArg") - val patternCache = ctx.freshName("patternCache") - // If some regex expression is foldable, we don't want to re-evaluate the pattern again. - val cacheCode = list.zipWithIndex.collect { case (x, i) if x.foldable => - (x.eval(), i) - }.filterNot(_._1 == null).map { kv => - val regex = StringEscapeUtils.escapeJava(escape(kv._1.asInstanceOf[UTF8String].toString())) - s"""$patternCache[${kv._2}] = $patternClass.compile("$regex");""" - }.mkString("\n") - - val listCode = listGen.zipWithIndex.map { case (x, i) => - s""" - |${x.code} - |if (${x.isNull}) { - | $hasNull = true; // ${ev.isNull} = true; - |} else if ($allMatched) { - | $patternClass $pattern = $patternCache[$i]; - | if ($pattern == null) { - | String $rightStr = ${x.value}.toString(); - | $pattern = $patternClass.compile($escapeFunc($rightStr, '$escapedEscapeChar')); - | } - | if ($isNot && $pattern.matcher($valueArg.toString()).matches()) { - | $allMatched = false; - | } else if (!$isNot && !$pattern.matcher($valueArg.toString()).matches()) { - | $allMatched = false; - | } - |} - """.stripMargin - } + val patternHasNull = ctx.addReferenceObj("hasNull", hasNull) + val patternCache = ctx.addReferenceObj("patternCache", cache.asJava) - val resultType = CodeGenerator.javaType(dataType) - val codes = ctx.splitExpressionsWithCurrentInputs( - expressions = listCode, - funcName = "likeAll", - extraArguments = (javaDataType, valueArg) :: (CodeGenerator.JAVA_BOOLEAN, hasNull) :: - (resultType, allMatched) :: Nil, - returnType = resultType, - makeSplitFunction = body => - s""" - |if ($allMatched) { - | $body; - |} - """.stripMargin, - foldFunctions = _.map { funcCall => - s""" - |if ($allMatched) { - | $funcCall; - |} - """.stripMargin - }.mkString("\n")) ev.copy(code = code""" - |${valueGen.code} - |$patternClass[] $patternCache = new $patternClass[${list.length}]; - |boolean $hasNull = false; + |${eval.code} + |boolean $returnNull = false; |boolean $allMatched = true; - |if (${valueGen.isNull}) { - | $hasNull = true; + |if (${eval.isNull} || $patternHasNull) { + | $returnNull = true; |} else { - | $javaDataType $valueArg = ${valueGen.value}; - | $cacheCode - | $codes + | $javaDataType $valueArg = ${eval.value}; + | for ($patternClass $pattern: $patternCache) { + | if ($isNotDefined && $pattern.matcher($valueArg.toString()).matches()) { + | $allMatched = false; + | break; + | } else if (!$isNotDefined && !$pattern.matcher($valueArg.toString()).matches()) { + | $allMatched = false; + | break; + | } + | } |} - |final boolean ${ev.isNull} = $allMatched && $hasNull; + |final boolean ${ev.isNull} = $returnNull; |final boolean ${ev.value} = $allMatched; """.stripMargin) } } -case class LikeAll(children: Seq[Expression]) extends LikeAllBase { - override def isNot: Boolean = false +case class LikeAll(child: Expression, patterns: Seq[Any]) extends LikeAllBase { + override def isNotDefined: Boolean = false } -case class NotLikeAll(children: Seq[Expression]) extends LikeAllBase { - override def isNot: Boolean = true +case class NotLikeAll(child: Expression, patterns: Seq[Any]) extends LikeAllBase { + override def isNotDefined: Boolean = true } // scalastyle:off line.contains.tab diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index f9d16a960dd26..7a66cd9838890 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -1408,19 +1408,20 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging case Some(SqlBaseParser.ANY) | Some(SqlBaseParser.SOME) => getLikeQuantifierExprs(ctx.expression).reduceLeft(Or) case Some(SqlBaseParser.ALL) => - if (ctx.expression.isEmpty) { - throw new ParseException("Expected something between '(' and ')'.", ctx) - } else if (ctx.expression.size < 500) { - // An empirical value that will not cause StackOverflowError is used here - getLikeQuantifierExprs(ctx.expression).reduceLeft(And) - } else { + validate(!ctx.expression.isEmpty, "Expected something between '(' and ')'.", ctx) + val expressions = ctx.expression.asScala.map(expression) + if (expressions.size > 200 && expressions.forall(_.foldable)) { // If there are many pattern expressions, will throw StackOverflowError. + // 200 is an empirical value that will not cause StackOverflowError is used here. // So we use LikeAll or NotLikeAll instead. + val patterns = expressions.map(_.eval(EmptyRow)) ctx.NOT match { - case null => LikeAll(e +: ctx.expression.asScala.map(expression)) - case _ => NotLikeAll(e +: ctx.expression.asScala.map(expression)) + case null => LikeAll(e, patterns) + case _ => NotLikeAll(e, patterns) + } + } else { + getLikeQuantifierExprs(ctx.expression).reduceLeft(And) } - } case _ => val escapeChar = Option(ctx.escapeChar).map(string).map { str => if (str.length != 1) { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala index bf948c9e1ef7b..c4b8b924ef14e 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala @@ -63,10 +63,10 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { Literal.create("%foo%", StringType)), null) checkEvaluation(Literal.create("foo", StringType).likeAll( Literal.create("%feo%", StringType), - Literal.create(null, StringType)), false) + Literal.create(null, StringType)), null) checkEvaluation(Literal.create("foo", StringType).likeAll( Literal.create(null, StringType), - Literal.create("%feo%", StringType)), false) + Literal.create("%feo%", StringType)), null) checkEvaluation(Literal.create("foo", StringType).notLikeAll( Literal.create("tee", StringType), Literal.create("%yoo%", StringType)), true) @@ -75,10 +75,10 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { Literal.create("%yoo%", StringType)), false) checkEvaluation(Literal.create("foo", StringType).notLikeAll( Literal.create("%foo%", StringType), - Literal.create(null, StringType)), false) + Literal.create(null, StringType)), null) checkEvaluation(Literal.create("foo", StringType).notLikeAll( Literal.create(null, StringType), - Literal.create("%foo%", StringType)), false) + Literal.create("%foo%", StringType)), null) checkEvaluation(Literal.create("foo", StringType).notLikeAll( Literal.create("%yoo%", StringType), Literal.create(null, StringType)), null) From 53406d349a46dad7edf61e5eb2e27b11e92e508a Mon Sep 17 00:00:00 2001 From: gengjiaan Date: Tue, 10 Nov 2020 19:11:06 +0800 Subject: [PATCH 27/35] Optimize code --- .../sql/catalyst/expressions/regexpExpressions.scala | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index ef85e0f39d002..923370f3c5878 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -221,6 +221,12 @@ abstract class LikeAllBase extends UnaryExpression with ImplicitCastInputTypes w val patternHasNull = ctx.addReferenceObj("hasNull", hasNull) val patternCache = ctx.addReferenceObj("patternCache", cache.asJava) + val matchCode = if (isNotDefined) { + s"$pattern.matcher($valueArg.toString()).matches()" + } else { + s"!$pattern.matcher($valueArg.toString()).matches()" + } + ev.copy(code = code""" |${eval.code} @@ -231,10 +237,7 @@ abstract class LikeAllBase extends UnaryExpression with ImplicitCastInputTypes w |} else { | $javaDataType $valueArg = ${eval.value}; | for ($patternClass $pattern: $patternCache) { - | if ($isNotDefined && $pattern.matcher($valueArg.toString()).matches()) { - | $allMatched = false; - | break; - | } else if (!$isNotDefined && !$pattern.matcher($valueArg.toString()).matches()) { + | if ($matchCode) { | $allMatched = false; | break; | } From 15bac5bfecb209ba7b6963d83423b659fbc5086d Mon Sep 17 00:00:00 2001 From: gengjiaan Date: Wed, 11 Nov 2020 10:44:10 +0800 Subject: [PATCH 28/35] Add comments. --- .../spark/sql/catalyst/expressions/regexpExpressions.scala | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 923370f3c5878..58a77dac0529a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -180,6 +180,9 @@ case class Like(left: Expression, right: Expression, escapeChar: Char) } } +/** + * Optimized version of LIKE ALL, when all pattern values are literal. + */ abstract class LikeAllBase extends UnaryExpression with ImplicitCastInputTypes with NullIntolerant { protected def patterns: Seq[Any] From d039c33de33ea4bab4cea3170925c0c4f92ca771 Mon Sep 17 00:00:00 2001 From: gengjiaan Date: Wed, 11 Nov 2020 15:21:03 +0800 Subject: [PATCH 29/35] Adjust code. --- .../spark/sql/catalyst/dsl/package.scala | 4 +- .../expressions/regexpExpressions.scala | 26 +++++----- .../sql/catalyst/parser/AstBuilder.scala | 4 +- .../apache/spark/sql/internal/SQLConf.scala | 14 ++++++ .../expressions/RegexpExpressionsSuite.scala | 49 +++++-------------- .../resources/sql-tests/inputs/like-all.sql | 4 ++ 6 files changed, 50 insertions(+), 51 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index 3e6834d89d29d..46674dcf95bf6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -102,8 +102,8 @@ package object dsl { def like(other: Expression, escapeChar: Char = '\\'): Expression = Like(expr, other, escapeChar) def rlike(other: Expression): Expression = RLike(expr, other) - def likeAll(others: Literal*): Expression = LikeAll(expr, others.map(_.eval(EmptyRow))) - def notLikeAll(others: Literal*): Expression = NotLikeAll(expr, others.map(_.eval(EmptyRow))) + def likeAll(others: String*): Expression = LikeAll(expr, others) + def notLikeAll(others: String*): Expression = NotLikeAll(expr, others) def contains(other: Expression): Expression = Contains(expr, other) def startsWith(other: Expression): Expression = StartsWith(expr, other) def endsWith(other: Expression): Expression = EndsWith(expr, other) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 58a77dac0529a..62b7ffd11e83d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -201,14 +201,19 @@ abstract class LikeAllBase extends UnaryExpression with ImplicitCastInputTypes w .map(s => Pattern.compile(StringUtils.escapeLikeRegex(s.toString, '\\'))) override def eval(input: InternalRow): Any = { - if (hasNull) { + val exprValue = child.eval(input) + if (exprValue == null) { null } else { - val str = child.eval(input).toString - if (isNotDefined) { - !cache.exists(p => p.matcher(str).matches()) + val allMatched = if (isNotDefined) { + !cache.exists(p => p.matcher(exprValue.toString).matches()) } else { - cache.forall(p => p.matcher(str).matches()) + cache.forall(p => p.matcher(exprValue.toString).matches()) + } + if (allMatched && hasNull) { + null + } else { + allMatched } } } @@ -218,8 +223,8 @@ abstract class LikeAllBase extends UnaryExpression with ImplicitCastInputTypes w val patternClass = classOf[Pattern].getName val javaDataType = CodeGenerator.javaType(child.dataType) val pattern = ctx.freshName("pattern") - val returnNull = ctx.freshName("returnNull") val allMatched = ctx.freshName("allMatched") + val valueIsNull = ctx.freshName("valueIsNull") val valueArg = ctx.freshName("valueArg") val patternHasNull = ctx.addReferenceObj("hasNull", hasNull) val patternCache = ctx.addReferenceObj("patternCache", cache.asJava) @@ -233,20 +238,19 @@ abstract class LikeAllBase extends UnaryExpression with ImplicitCastInputTypes w ev.copy(code = code""" |${eval.code} - |boolean $returnNull = false; |boolean $allMatched = true; - |if (${eval.isNull} || $patternHasNull) { - | $returnNull = true; + |boolean $valueIsNull = false; + |if (${eval.isNull}) { + | $valueIsNull = true; |} else { | $javaDataType $valueArg = ${eval.value}; | for ($patternClass $pattern: $patternCache) { | if ($matchCode) { | $allMatched = false; - | break; | } | } |} - |final boolean ${ev.isNull} = $returnNull; + |final boolean ${ev.isNull} = $valueIsNull || ($allMatched && $patternHasNull); |final boolean ${ev.value} = $allMatched; """.stripMargin) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 7a66cd9838890..c23a0a304d937 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -1410,9 +1410,9 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging case Some(SqlBaseParser.ALL) => validate(!ctx.expression.isEmpty, "Expected something between '(' and ')'.", ctx) val expressions = ctx.expression.asScala.map(expression) - if (expressions.size > 200 && expressions.forall(_.foldable)) { + if (expressions.size > SQLConf.get.optimizerLikeAllConversionThreshold && + expressions.forall(_.foldable)) { // If there are many pattern expressions, will throw StackOverflowError. - // 200 is an empirical value that will not cause StackOverflowError is used here. // So we use LikeAll or NotLikeAll instead. val patterns = expressions.map(_.eval(EmptyRow)) ctx.NOT match { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index dad59ba0e7327..411026bef40a7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -216,6 +216,18 @@ object SQLConf { "for using switch statements in InSet must be non-negative and less than or equal to 600") .createWithDefault(400) + val OPTIMIZER_LIKE_ALL_CONVERSION_THRESHOLD = + buildConf("spark.sql.optimizer.likeAllConversionThreshold") + .internal() + .doc("Configure the maximum size of the pattern sequence in like all. Spark will convert " + + "the logical combination of like to avoid StackOverflowError. 200 is an empirical value " + + "that will not cause StackOverflowError.") + .version("3.1.0") + .intConf + .checkValue(threshold => threshold >= 0, "The maximum size of pattern sequence " + + "in like all must be non-negative") + .createWithDefault(200) + val PLAN_CHANGE_LOG_LEVEL = buildConf("spark.sql.planChangeLog.level") .internal() .doc("Configures the log level for logging the change from the original plan to the new " + @@ -2972,6 +2984,8 @@ class SQLConf extends Serializable with Logging { def optimizerInSetSwitchThreshold: Int = getConf(OPTIMIZER_INSET_SWITCH_THRESHOLD) + def optimizerLikeAllConversionThreshold: Int = getConf(OPTIMIZER_LIKE_ALL_CONVERSION_THRESHOLD) + def planChangeLogLevel: String = getConf(PLAN_CHANGE_LOG_LEVEL) def planChangeRules: Option[String] = getConf(PLAN_CHANGE_LOG_RULES) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala index c4b8b924ef14e..d6eee4bdd379a 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala @@ -49,42 +49,19 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { } test("LIKE ALL") { - checkEvaluation(Literal.create("foo", StringType).likeAll( - Literal.create("%foo%", StringType), - Literal.create("%oo", StringType)), true) - checkEvaluation(Literal.create("foo", StringType).likeAll( - Literal.create("%foo%", StringType), - Literal.create("%bar%", StringType)), false) - checkEvaluation(Literal.create("foo", StringType).likeAll( - Literal.create("%foo%", StringType), - Literal.create(null, StringType)), null) - checkEvaluation(Literal.create("foo", StringType).likeAll( - Literal.create(null, StringType), - Literal.create("%foo%", StringType)), null) - checkEvaluation(Literal.create("foo", StringType).likeAll( - Literal.create("%feo%", StringType), - Literal.create(null, StringType)), null) - checkEvaluation(Literal.create("foo", StringType).likeAll( - Literal.create(null, StringType), - Literal.create("%feo%", StringType)), null) - checkEvaluation(Literal.create("foo", StringType).notLikeAll( - Literal.create("tee", StringType), - Literal.create("%yoo%", StringType)), true) - checkEvaluation(Literal.create("foo", StringType).notLikeAll( - Literal.create("%oo%", StringType), - Literal.create("%yoo%", StringType)), false) - checkEvaluation(Literal.create("foo", StringType).notLikeAll( - Literal.create("%foo%", StringType), - Literal.create(null, StringType)), null) - checkEvaluation(Literal.create("foo", StringType).notLikeAll( - Literal.create(null, StringType), - Literal.create("%foo%", StringType)), null) - checkEvaluation(Literal.create("foo", StringType).notLikeAll( - Literal.create("%yoo%", StringType), - Literal.create(null, StringType)), null) - checkEvaluation(Literal.create("foo", StringType).notLikeAll( - Literal.create(null, StringType), - Literal.create("%yoo%", StringType)), null) + checkEvaluation(Literal.create(null, StringType).likeAll("%foo%", "%oo"), null) + checkEvaluation(Literal.create("foo", StringType).likeAll("%foo%", "%oo"), true) + checkEvaluation(Literal.create("foo", StringType).likeAll("%foo%", "%bar%"), false) + checkEvaluation(Literal.create("foo", StringType).likeAll("%foo%", null), null) + checkEvaluation(Literal.create("foo", StringType).likeAll(null, "%foo%"), null) + checkEvaluation(Literal.create("foo", StringType).likeAll("%feo%", null), false) + checkEvaluation(Literal.create("foo", StringType).likeAll(null, "%feo%"), false) + checkEvaluation(Literal.create("foo", StringType).notLikeAll("tee", "%yoo%"), true) + checkEvaluation(Literal.create("foo", StringType).notLikeAll("%oo%", "%yoo%"), false) + checkEvaluation(Literal.create("foo", StringType).notLikeAll("%foo%", null), false) + checkEvaluation(Literal.create("foo", StringType).notLikeAll(null, "%foo%"), false) + checkEvaluation(Literal.create("foo", StringType).notLikeAll("%yoo%", null), null) + checkEvaluation(Literal.create("foo", StringType).notLikeAll(null, "%yoo%"), null) } test("LIKE Pattern") { diff --git a/sql/core/src/test/resources/sql-tests/inputs/like-all.sql b/sql/core/src/test/resources/sql-tests/inputs/like-all.sql index a084dbef61a0c..f83277376e680 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/like-all.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/like-all.sql @@ -1,3 +1,7 @@ +-- test cases for like all +--CONFIG_DIM1 spark.sql.optimizer.likeAllConversionThreshold=0 +--CONFIG_DIM1 spark.sql.optimizer.likeAllConversionThreshold=200 + CREATE OR REPLACE TEMPORARY VIEW like_all_table AS SELECT * FROM (VALUES ('google', '%oo%'), ('facebook', '%oo%'), From 0c7785bd1a56f46443637e35abb1bfb53c999413 Mon Sep 17 00:00:00 2001 From: gengjiaan Date: Fri, 13 Nov 2020 16:55:42 +0800 Subject: [PATCH 30/35] Optimize code --- .../spark/sql/catalyst/dsl/package.scala | 8 ++++--- .../expressions/regexpExpressions.scala | 6 ++--- .../sql/catalyst/parser/AstBuilder.scala | 4 ++-- .../expressions/RegexpExpressionsSuite.scala | 24 ++++++++++++------- 4 files changed, 26 insertions(+), 16 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index 46674dcf95bf6..13265432944af 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -21,7 +21,6 @@ import java.sql.{Date, Timestamp} import java.time.{Instant, LocalDate} import scala.language.implicitConversions - import org.apache.spark.api.java.function.FilterFunction import org.apache.spark.sql.Encoder import org.apache.spark.sql.catalyst.analysis._ @@ -31,6 +30,7 @@ import org.apache.spark.sql.catalyst.expressions.objects.Invoke import org.apache.spark.sql.catalyst.plans.{Inner, JoinType} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String /** * A collection of implicit conversions that create a DSL for constructing catalyst data structures. @@ -102,8 +102,10 @@ package object dsl { def like(other: Expression, escapeChar: Char = '\\'): Expression = Like(expr, other, escapeChar) def rlike(other: Expression): Expression = RLike(expr, other) - def likeAll(others: String*): Expression = LikeAll(expr, others) - def notLikeAll(others: String*): Expression = NotLikeAll(expr, others) + def likeAll(others: Expression*): Expression = + LikeAll(expr, others.map(_.eval(EmptyRow).asInstanceOf[UTF8String])) + def notLikeAll(others: Expression*): Expression = + NotLikeAll(expr, others.map(_.eval(EmptyRow).asInstanceOf[UTF8String])) def contains(other: Expression): Expression = Contains(expr, other) def startsWith(other: Expression): Expression = StartsWith(expr, other) def endsWith(other: Expression): Expression = EndsWith(expr, other) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 62b7ffd11e83d..e0de86c51dd7a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -185,7 +185,7 @@ case class Like(left: Expression, right: Expression, escapeChar: Char) */ abstract class LikeAllBase extends UnaryExpression with ImplicitCastInputTypes with NullIntolerant { - protected def patterns: Seq[Any] + protected def patterns: Seq[UTF8String] protected def isNotDefined: Boolean @@ -256,11 +256,11 @@ abstract class LikeAllBase extends UnaryExpression with ImplicitCastInputTypes w } } -case class LikeAll(child: Expression, patterns: Seq[Any]) extends LikeAllBase { +case class LikeAll(child: Expression, patterns: Seq[UTF8String]) extends LikeAllBase { override def isNotDefined: Boolean = false } -case class NotLikeAll(child: Expression, patterns: Seq[Any]) extends LikeAllBase { +case class NotLikeAll(child: Expression, patterns: Seq[UTF8String]) extends LikeAllBase { override def isNotDefined: Boolean = true } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index c23a0a304d937..c93e318c76500 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -1411,10 +1411,10 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging validate(!ctx.expression.isEmpty, "Expected something between '(' and ')'.", ctx) val expressions = ctx.expression.asScala.map(expression) if (expressions.size > SQLConf.get.optimizerLikeAllConversionThreshold && - expressions.forall(_.foldable)) { + expressions.forall(_.foldable) && expressions.forall(_.dataType == StringType)) { // If there are many pattern expressions, will throw StackOverflowError. // So we use LikeAll or NotLikeAll instead. - val patterns = expressions.map(_.eval(EmptyRow)) + val patterns = expressions.map(_.eval(EmptyRow).asInstanceOf[UTF8String]) ctx.NOT match { case null => LikeAll(e, patterns) case _ => NotLikeAll(e, patterns) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala index d6eee4bdd379a..cc5ab5dc7b4e0 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala @@ -52,16 +52,24 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(Literal.create(null, StringType).likeAll("%foo%", "%oo"), null) checkEvaluation(Literal.create("foo", StringType).likeAll("%foo%", "%oo"), true) checkEvaluation(Literal.create("foo", StringType).likeAll("%foo%", "%bar%"), false) - checkEvaluation(Literal.create("foo", StringType).likeAll("%foo%", null), null) - checkEvaluation(Literal.create("foo", StringType).likeAll(null, "%foo%"), null) - checkEvaluation(Literal.create("foo", StringType).likeAll("%feo%", null), false) - checkEvaluation(Literal.create("foo", StringType).likeAll(null, "%feo%"), false) + checkEvaluation(Literal.create("foo", StringType) + .likeAll("%foo%", Literal.create(null, StringType)), null) + checkEvaluation(Literal.create("foo", StringType) + .likeAll(Literal.create(null, StringType), "%foo%"), null) + checkEvaluation(Literal.create("foo", StringType) + .likeAll("%feo%", Literal.create(null, StringType)), false) + checkEvaluation(Literal.create("foo", StringType) + .likeAll(Literal.create(null, StringType), "%feo%"), false) checkEvaluation(Literal.create("foo", StringType).notLikeAll("tee", "%yoo%"), true) checkEvaluation(Literal.create("foo", StringType).notLikeAll("%oo%", "%yoo%"), false) - checkEvaluation(Literal.create("foo", StringType).notLikeAll("%foo%", null), false) - checkEvaluation(Literal.create("foo", StringType).notLikeAll(null, "%foo%"), false) - checkEvaluation(Literal.create("foo", StringType).notLikeAll("%yoo%", null), null) - checkEvaluation(Literal.create("foo", StringType).notLikeAll(null, "%yoo%"), null) + checkEvaluation(Literal.create("foo", StringType) + .notLikeAll("%foo%", Literal.create(null, StringType)), false) + checkEvaluation(Literal.create("foo", StringType) + .notLikeAll(Literal.create(null, StringType), "%foo%"), false) + checkEvaluation(Literal.create("foo", StringType) + .notLikeAll("%yoo%", Literal.create(null, StringType)), null) + checkEvaluation(Literal.create("foo", StringType) + .notLikeAll(Literal.create(null, StringType), "%yoo%"), null) } test("LIKE Pattern") { From 7af8ffe49fc02765a80a85faccaa7209fe8b9c57 Mon Sep 17 00:00:00 2001 From: gengjiaan Date: Fri, 13 Nov 2020 16:56:57 +0800 Subject: [PATCH 31/35] Optimize code --- .../main/scala/org/apache/spark/sql/catalyst/dsl/package.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index 13265432944af..4cd649b07a5c0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -21,6 +21,7 @@ import java.sql.{Date, Timestamp} import java.time.{Instant, LocalDate} import scala.language.implicitConversions + import org.apache.spark.api.java.function.FilterFunction import org.apache.spark.sql.Encoder import org.apache.spark.sql.catalyst.analysis._ From 97c1c7389e537f0d38f1b6a17bbe9ba70c9bc6ea Mon Sep 17 00:00:00 2001 From: gengjiaan Date: Tue, 17 Nov 2020 10:35:13 +0800 Subject: [PATCH 32/35] Revert sql-expression-schema.md --- .../src/test/resources/sql-functions/sql-expression-schema.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md index 3974735c87915..da83df4994d8d 100644 --- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md +++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md @@ -346,4 +346,4 @@ | org.apache.spark.sql.catalyst.expressions.xml.XPathList | xpath | SELECT xpath('b1b2b3c1c2','a/b/text()') | structb1b2b3c1c2, a/b/text()):array> | | org.apache.spark.sql.catalyst.expressions.xml.XPathLong | xpath_long | SELECT xpath_long('12', 'sum(a/b)') | struct12, sum(a/b)):bigint> | | org.apache.spark.sql.catalyst.expressions.xml.XPathShort | xpath_short | SELECT xpath_short('12', 'sum(a/b)') | struct12, sum(a/b)):smallint> | -| org.apache.spark.sql.catalyst.expressions.xml.XPathString | xpath_string | SELECT xpath_string('bcc','a/c') | structbcc, a/c):string> | +| org.apache.spark.sql.catalyst.expressions.xml.XPathString | xpath_string | SELECT xpath_string('bcc','a/c') | structbcc, a/c):string> | \ No newline at end of file From 161493347db00a1cf0fc7a4257f43e625b3e3aa5 Mon Sep 17 00:00:00 2001 From: gengjiaan Date: Tue, 17 Nov 2020 11:10:28 +0800 Subject: [PATCH 33/35] Optimize code. --- .../expressions/regexpExpressions.scala | 13 +++++----- .../apache/spark/sql/SQLQueryTestSuite.scala | 3 ++- .../sql/test/DataFrameReaderWriterSuite.scala | 24 +++++++++++++++++++ 3 files changed, 32 insertions(+), 8 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index e0de86c51dd7a..1d23b8d8070b8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -187,7 +187,7 @@ abstract class LikeAllBase extends UnaryExpression with ImplicitCastInputTypes w protected def patterns: Seq[UTF8String] - protected def isNotDefined: Boolean + protected def isNotLikeAll: Boolean override def inputTypes: Seq[DataType] = StringType :: Nil @@ -205,7 +205,7 @@ abstract class LikeAllBase extends UnaryExpression with ImplicitCastInputTypes w if (exprValue == null) { null } else { - val allMatched = if (isNotDefined) { + val allMatched = if (isNotLikeAll) { !cache.exists(p => p.matcher(exprValue.toString).matches()) } else { cache.forall(p => p.matcher(exprValue.toString).matches()) @@ -226,10 +226,9 @@ abstract class LikeAllBase extends UnaryExpression with ImplicitCastInputTypes w val allMatched = ctx.freshName("allMatched") val valueIsNull = ctx.freshName("valueIsNull") val valueArg = ctx.freshName("valueArg") - val patternHasNull = ctx.addReferenceObj("hasNull", hasNull) val patternCache = ctx.addReferenceObj("patternCache", cache.asJava) - val matchCode = if (isNotDefined) { + val matchCode = if (isNotLikeAll) { s"$pattern.matcher($valueArg.toString()).matches()" } else { s"!$pattern.matcher($valueArg.toString()).matches()" @@ -250,18 +249,18 @@ abstract class LikeAllBase extends UnaryExpression with ImplicitCastInputTypes w | } | } |} - |final boolean ${ev.isNull} = $valueIsNull || ($allMatched && $patternHasNull); + |final boolean ${ev.isNull} = $valueIsNull || ($allMatched && $hasNull); |final boolean ${ev.value} = $allMatched; """.stripMargin) } } case class LikeAll(child: Expression, patterns: Seq[UTF8String]) extends LikeAllBase { - override def isNotDefined: Boolean = false + override def isNotLikeAll: Boolean = false } case class NotLikeAll(child: Expression, patterns: Seq[UTF8String]) extends LikeAllBase { - override def isNotDefined: Boolean = true + override def isNotLikeAll: Boolean = true } // scalastyle:off line.contains.tab diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala index 36e55c0994f18..5c7feec003eb4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala @@ -563,7 +563,8 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession with SQLHelper // Filter out test files with invalid extensions such as temp files created // by vi (.swp), Mac (.DS_Store) etc. val filteredFiles = files.filter(_.getName.endsWith(validFileExtensions)) - filteredFiles ++ dirs.flatMap(listFilesRecursively) + (filteredFiles ++ dirs.flatMap(listFilesRecursively)) + .filter(_.getName.equals("window_part1.sql")) } /** Load built-in test tables into the SparkSession. */ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala index 4e61dba4955af..84553fe5ae4c8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala @@ -1033,6 +1033,30 @@ class DataFrameReaderWriterSuite extends QueryTest with SharedSparkSession with } } + test("abc2") { + spark.sql("create table SPARK_33045(id string) using parquet") + val values = Range(1, 90000) + spark.sql(s"select concat_ws(${values.mkString(", ")})").show + } + + test("abc1") { + spark.sql("create table SPARK_33045(id string) using parquet") + val values = Range(1, 9000) + spark.sql(s"select * from SPARK_33045 where id in (${values.mkString(", ")}, id)").show + } + + test("abc") { + spark.sql("create table SPARK_33045(id string) using parquet") + val values = Range(1, 9000) + spark.sql(s"select * from SPARK_33045 where id like all (${values.mkString(", ")})").show + } + + test("concat") { + spark.sql("create table SPARK_33045(id int) using parquet") + val values = Range(1, 900) + spark.sql(s"select concat(${values.mkString(", ")}, id) from SPARK_33045").show + } + test("Insert overwrite table command should output correct schema: basic") { withTable("tbl", "tbl2") { withView("view1") { From f0e3de1718e99c887833f230c77c17c3851f9fc7 Mon Sep 17 00:00:00 2001 From: gengjiaan Date: Tue, 17 Nov 2020 11:14:15 +0800 Subject: [PATCH 34/35] Revert some code --- .../apache/spark/sql/SQLQueryTestSuite.scala | 3 +-- .../sql/test/DataFrameReaderWriterSuite.scala | 24 ------------------- 2 files changed, 1 insertion(+), 26 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala index 5c7feec003eb4..36e55c0994f18 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala @@ -563,8 +563,7 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession with SQLHelper // Filter out test files with invalid extensions such as temp files created // by vi (.swp), Mac (.DS_Store) etc. val filteredFiles = files.filter(_.getName.endsWith(validFileExtensions)) - (filteredFiles ++ dirs.flatMap(listFilesRecursively)) - .filter(_.getName.equals("window_part1.sql")) + filteredFiles ++ dirs.flatMap(listFilesRecursively) } /** Load built-in test tables into the SparkSession. */ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala index 84553fe5ae4c8..4e61dba4955af 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala @@ -1033,30 +1033,6 @@ class DataFrameReaderWriterSuite extends QueryTest with SharedSparkSession with } } - test("abc2") { - spark.sql("create table SPARK_33045(id string) using parquet") - val values = Range(1, 90000) - spark.sql(s"select concat_ws(${values.mkString(", ")})").show - } - - test("abc1") { - spark.sql("create table SPARK_33045(id string) using parquet") - val values = Range(1, 9000) - spark.sql(s"select * from SPARK_33045 where id in (${values.mkString(", ")}, id)").show - } - - test("abc") { - spark.sql("create table SPARK_33045(id string) using parquet") - val values = Range(1, 9000) - spark.sql(s"select * from SPARK_33045 where id like all (${values.mkString(", ")})").show - } - - test("concat") { - spark.sql("create table SPARK_33045(id int) using parquet") - val values = Range(1, 900) - spark.sql(s"select concat(${values.mkString(", ")}, id) from SPARK_33045").show - } - test("Insert overwrite table command should output correct schema: basic") { withTable("tbl", "tbl2") { withView("view1") { From 001eb38f603267c6a6f4e1c25430b8900644f5b7 Mon Sep 17 00:00:00 2001 From: gengjiaan Date: Thu, 19 Nov 2020 14:47:33 +0800 Subject: [PATCH 35/35] Optimize code. --- .../expressions/regexpExpressions.scala | 35 +++++++++---------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 1d23b8d8070b8..b4d9921488d5f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -200,20 +200,21 @@ abstract class LikeAllBase extends UnaryExpression with ImplicitCastInputTypes w private lazy val cache = patterns.filterNot(_ == null) .map(s => Pattern.compile(StringUtils.escapeLikeRegex(s.toString, '\\'))) + private lazy val matchFunc = if (isNotLikeAll) { + (p: Pattern, inputValue: String) => !p.matcher(inputValue).matches() + } else { + (p: Pattern, inputValue: String) => p.matcher(inputValue).matches() + } + override def eval(input: InternalRow): Any = { val exprValue = child.eval(input) if (exprValue == null) { null } else { - val allMatched = if (isNotLikeAll) { - !cache.exists(p => p.matcher(exprValue.toString).matches()) - } else { - cache.forall(p => p.matcher(exprValue.toString).matches()) - } - if (allMatched && hasNull) { - null + if (cache.forall(matchFunc(_, exprValue.toString))) { + if (hasNull) null else true } else { - allMatched + false } } } @@ -223,12 +224,10 @@ abstract class LikeAllBase extends UnaryExpression with ImplicitCastInputTypes w val patternClass = classOf[Pattern].getName val javaDataType = CodeGenerator.javaType(child.dataType) val pattern = ctx.freshName("pattern") - val allMatched = ctx.freshName("allMatched") - val valueIsNull = ctx.freshName("valueIsNull") val valueArg = ctx.freshName("valueArg") val patternCache = ctx.addReferenceObj("patternCache", cache.asJava) - val matchCode = if (isNotLikeAll) { + val checkNotMatchCode = if (isNotLikeAll) { s"$pattern.matcher($valueArg.toString()).matches()" } else { s"!$pattern.matcher($valueArg.toString()).matches()" @@ -237,20 +236,20 @@ abstract class LikeAllBase extends UnaryExpression with ImplicitCastInputTypes w ev.copy(code = code""" |${eval.code} - |boolean $allMatched = true; - |boolean $valueIsNull = false; + |boolean ${ev.isNull} = false; + |boolean ${ev.value} = true; |if (${eval.isNull}) { - | $valueIsNull = true; + | ${ev.isNull} = true; |} else { | $javaDataType $valueArg = ${eval.value}; | for ($patternClass $pattern: $patternCache) { - | if ($matchCode) { - | $allMatched = false; + | if ($checkNotMatchCode) { + | ${ev.value} = false; + | break; | } | } + | if (${ev.value} && $hasNull) ${ev.isNull} = true; |} - |final boolean ${ev.isNull} = $valueIsNull || ($allMatched && $hasNull); - |final boolean ${ev.value} = $allMatched; """.stripMargin) } }