From efb2f4a78503cd9da3cc88ad0b35c81dc2560078 Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Thu, 19 Sep 2024 23:33:35 -0700 Subject: [PATCH 01/20] add support for duplicate keys in from_json(_, 'variant') --- .../expressions/jsonExpressions.scala | 5 ++- .../spark/sql/VariantEndToEndSuite.scala | 32 +++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index 7005d663a3f96..84ed14dcd09a4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -717,9 +717,12 @@ case class JsonToStructs( override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression = copy(timeZoneId = Option(timeZoneId)) + private val variantAllowDuplicateKeys = SQLConf.get.getConf(SQLConf.VARIANT_ALLOW_DUPLICATE_KEYS) + override def nullSafeEval(json: Any): Any = nullableSchema match { case _: VariantType => - VariantExpressionEvalUtils.parseJson(json.asInstanceOf[UTF8String]) + VariantExpressionEvalUtils.parseJson(json.asInstanceOf[UTF8String], + allowDuplicateKeys = variantAllowDuplicateKeys) case _ => converter(parser.parse(json.asInstanceOf[UTF8String])) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/VariantEndToEndSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/VariantEndToEndSuite.scala index 3224baf42f3e5..19d4ac23709b6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/VariantEndToEndSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/VariantEndToEndSuite.scala @@ -16,6 +16,7 @@ */ package org.apache.spark.sql +import org.apache.spark.SparkThrowable import org.apache.spark.sql.QueryTest.sameRows import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Cast, Literal} @@ -28,6 +29,7 @@ import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types._ import org.apache.spark.sql.vectorized.ColumnarArray import org.apache.spark.types.variant.VariantBuilder +import org.apache.spark.types.variant.VariantUtil._ import org.apache.spark.unsafe.types.VariantVal class VariantEndToEndSuite extends QueryTest with SharedSparkSession { @@ -37,8 +39,10 @@ class VariantEndToEndSuite extends QueryTest with SharedSparkSession { def check(input: String, output: String = null): Unit = { val df = Seq(input).toDF("v") val variantDF = df.select(to_json(parse_json(col("v")))) + val variantDF2 = df.select(to_json(from_json(col("v"), VariantType))) val expected = if (output != null) output else input checkAnswer(variantDF, Seq(Row(expected))) + checkAnswer(variantDF2, Seq(Row(expected))) } check("null") @@ -339,4 +343,32 @@ class VariantEndToEndSuite extends QueryTest with SharedSparkSession { } } } + + test("from_json(_, 'variant') with duplicate keys") { + val json: String = """{"a": 1, "b": 2, "c": "3", "a": 4}""" + withSQLConf(SQLConf.VARIANT_ALLOW_DUPLICATE_KEYS.key -> "true") { + val df = Seq(json).toDF("j") + .selectExpr("from_json(j,'variant')") + val actual = df.collect().head(0).asInstanceOf[VariantVal] + val expectedValue: Array[Byte] = Array(objectHeader(false, 1, 1), + /* size */ 3, + /* id list */ 0, 1, 2, + /* offset list */ 4, 0, 2, 6, + /* field data */ primitiveHeader(INT1), 2, shortStrHeader(1), '3', + primitiveHeader(INT1), 4) + val expectedMetadata: Array[Byte] = Array(VERSION, 3, 0, 1, 2, 3, 'a', 'b', 'c') + assert(actual === new VariantVal(expectedValue, expectedMetadata)) + } + withSQLConf(SQLConf.VARIANT_ALLOW_DUPLICATE_KEYS.key -> "false") { + val df = Seq(json).toDF("j") + .selectExpr("from_json(j,'variant')") + checkError( + exception = intercept[SparkThrowable] { + df.collect() + }, + condition = "MALFORMED_RECORD_IN_PARSING.WITHOUT_SUGGESTION", + parameters = Map("badRecord" -> json, "failFastMode" -> "FAILFAST") + ) + } + } } From 867288371a8ca885a6132d807279ee30e415679a Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Fri, 20 Sep 2024 00:55:23 -0700 Subject: [PATCH 02/20] Addressed @MaxGekk's comment --- .../sql/catalyst/expressions/jsonExpressions.scala | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index 84ed14dcd09a4..ea5c2ac388a42 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -632,7 +632,8 @@ case class JsonToStructs( schema: DataType, options: Map[String, String], child: Expression, - timeZoneId: Option[String] = None) + timeZoneId: Option[String] = None, + variantAllowDuplicateKeys: Boolean = SQLConf.get.getConf(SQLConf.VARIANT_ALLOW_DUPLICATE_KEYS)) extends UnaryExpression with TimeZoneAwareExpression with CodegenFallback @@ -717,8 +718,6 @@ case class JsonToStructs( override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression = copy(timeZoneId = Option(timeZoneId)) - private val variantAllowDuplicateKeys = SQLConf.get.getConf(SQLConf.VARIANT_ALLOW_DUPLICATE_KEYS) - override def nullSafeEval(json: Any): Any = nullableSchema match { case _: VariantType => VariantExpressionEvalUtils.parseJson(json.asInstanceOf[UTF8String], @@ -740,6 +739,12 @@ case class JsonToStructs( copy(child = newChild) } +object JsonToStructs { + def unapply( + j: JsonToStructs): Option[(DataType, Map[String, String], Expression, Option[String])] = + Some((j.schema, j.options, j.child, j.timeZoneId)) +} + /** * Converts a [[StructType]], [[ArrayType]] or [[MapType]] to a JSON output string. */ From 57a71aa5e8a0dd930460e2eb72a6717d51a93b28 Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Fri, 20 Sep 2024 12:26:11 -0700 Subject: [PATCH 03/20] regenerated golden files --- .../function_from_json.explain | 2 +- .../function_from_json_orphaned.explain | 2 +- ...unction_from_json_with_json_schema.explain | 2 +- .../analyzer-results/ansi/date.sql.out | 2 +- .../analyzer-results/ansi/interval.sql.out | 6 ++-- .../ansi/parse-schema-string.sql.out | 4 +-- .../analyzer-results/ansi/timestamp.sql.out | 2 +- .../sql-tests/analyzer-results/date.sql.out | 2 +- .../analyzer-results/datetime-legacy.sql.out | 4 +-- .../analyzer-results/interval.sql.out | 6 ++-- .../analyzer-results/json-functions.sql.out | 34 +++++++++---------- .../parse-schema-string.sql.out | 4 +-- .../sql-session-variables.sql.out | 2 +- .../subexp-elimination.sql.out | 10 +++--- .../analyzer-results/timestamp.sql.out | 2 +- .../timestampNTZ/timestamp-ansi.sql.out | 2 +- .../timestampNTZ/timestamp.sql.out | 2 +- .../native/stringCastAndExpressions.sql.out | 2 +- 18 files changed, 45 insertions(+), 45 deletions(-) diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_json.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_json.explain index 1219f11d4696e..8d1d122d156ff 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_json.explain +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_json.explain @@ -1,2 +1,2 @@ -Project [from_json(StructField(id,LongType,true), StructField(a,IntegerType,true), StructField(b,DoubleType,true), g#0, Some(America/Los_Angeles)) AS from_json(g)#0] +Project [from_json(StructField(id,LongType,true), StructField(a,IntegerType,true), StructField(b,DoubleType,true), g#0, Some(America/Los_Angeles), false) AS from_json(g)#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_json_orphaned.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_json_orphaned.explain index 1219f11d4696e..8d1d122d156ff 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_json_orphaned.explain +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_json_orphaned.explain @@ -1,2 +1,2 @@ -Project [from_json(StructField(id,LongType,true), StructField(a,IntegerType,true), StructField(b,DoubleType,true), g#0, Some(America/Los_Angeles)) AS from_json(g)#0] +Project [from_json(StructField(id,LongType,true), StructField(a,IntegerType,true), StructField(b,DoubleType,true), g#0, Some(America/Los_Angeles), false) AS from_json(g)#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_json_with_json_schema.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_json_with_json_schema.explain index 1219f11d4696e..8d1d122d156ff 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_json_with_json_schema.explain +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_from_json_with_json_schema.explain @@ -1,2 +1,2 @@ -Project [from_json(StructField(id,LongType,true), StructField(a,IntegerType,true), StructField(b,DoubleType,true), g#0, Some(America/Los_Angeles)) AS from_json(g)#0] +Project [from_json(StructField(id,LongType,true), StructField(a,IntegerType,true), StructField(b,DoubleType,true), g#0, Some(America/Los_Angeles), false) AS from_json(g)#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/date.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/date.sql.out index fd927b99c6456..0e4d2d4e99e26 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/date.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/date.sql.out @@ -736,7 +736,7 @@ Project [to_date(26/October/2015, Some(dd/MMMMM/yyyy), Some(America/Los_Angeles) -- !query select from_json('{"d":"26/October/2015"}', 'd Date', map('dateFormat', 'dd/MMMMM/yyyy')) -- !query analysis -Project [from_json(StructField(d,DateType,true), (dateFormat,dd/MMMMM/yyyy), {"d":"26/October/2015"}, Some(America/Los_Angeles)) AS from_json({"d":"26/October/2015"})#x] +Project [from_json(StructField(d,DateType,true), (dateFormat,dd/MMMMM/yyyy), {"d":"26/October/2015"}, Some(America/Los_Angeles), false) AS from_json({"d":"26/October/2015"})#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/interval.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/interval.sql.out index 12756576ded9b..4ed020c46e38b 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/interval.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/interval.sql.out @@ -2074,7 +2074,7 @@ SELECT to_csv(named_struct('a', interval 32 year, 'b', interval 10 month)), from_csv(to_csv(named_struct('a', interval 32 year, 'b', interval 10 month)), 'a interval year, b interval month') -- !query analysis -Project [from_json(StructField(a,CalendarIntervalType,true), {"a":"1 days"}, Some(America/Los_Angeles)) AS from_json({"a":"1 days"})#x, from_csv(StructField(a,IntegerType,true), StructField(b,YearMonthIntervalType(0,0),true), 1, 1, Some(America/Los_Angeles), None) AS from_csv(1, 1)#x, to_json(from_json(StructField(a,CalendarIntervalType,true), {"a":"1 days"}, Some(America/Los_Angeles)), Some(America/Los_Angeles)) AS to_json(from_json({"a":"1 days"}))#x, to_csv(from_csv(StructField(a,IntegerType,true), StructField(b,YearMonthIntervalType(0,0),true), 1, 1, Some(America/Los_Angeles), None), Some(America/Los_Angeles)) AS to_csv(from_csv(1, 1))#x, to_csv(named_struct(a, INTERVAL '32' YEAR, b, INTERVAL '10' MONTH), Some(America/Los_Angeles)) AS to_csv(named_struct(a, INTERVAL '32' YEAR, b, INTERVAL '10' MONTH))#x, from_csv(StructField(a,YearMonthIntervalType(0,0),true), StructField(b,YearMonthIntervalType(1,1),true), to_csv(named_struct(a, INTERVAL '32' YEAR, b, INTERVAL '10' MONTH), Some(America/Los_Angeles)), Some(America/Los_Angeles), None) AS from_csv(to_csv(named_struct(a, INTERVAL '32' YEAR, b, INTERVAL '10' MONTH)))#x] +Project [from_json(StructField(a,CalendarIntervalType,true), {"a":"1 days"}, Some(America/Los_Angeles), false) AS from_json({"a":"1 days"})#x, from_csv(StructField(a,IntegerType,true), StructField(b,YearMonthIntervalType(0,0),true), 1, 1, Some(America/Los_Angeles), None) AS from_csv(1, 1)#x, to_json(from_json(StructField(a,CalendarIntervalType,true), {"a":"1 days"}, Some(America/Los_Angeles), false), Some(America/Los_Angeles)) AS to_json(from_json({"a":"1 days"}))#x, to_csv(from_csv(StructField(a,IntegerType,true), StructField(b,YearMonthIntervalType(0,0),true), 1, 1, Some(America/Los_Angeles), None), Some(America/Los_Angeles)) AS to_csv(from_csv(1, 1))#x, to_csv(named_struct(a, INTERVAL '32' YEAR, b, INTERVAL '10' MONTH), Some(America/Los_Angeles)) AS to_csv(named_struct(a, INTERVAL '32' YEAR, b, INTERVAL '10' MONTH))#x, from_csv(StructField(a,YearMonthIntervalType(0,0),true), StructField(b,YearMonthIntervalType(1,1),true), to_csv(named_struct(a, INTERVAL '32' YEAR, b, INTERVAL '10' MONTH), Some(America/Los_Angeles)), Some(America/Los_Angeles), None) AS from_csv(to_csv(named_struct(a, INTERVAL '32' YEAR, b, INTERVAL '10' MONTH)))#x] +- OneRowRelation @@ -2085,7 +2085,7 @@ SELECT to_json(map('a', interval 100 day 130 minute)), from_json(to_json(map('a', interval 100 day 130 minute)), 'a interval day to minute') -- !query analysis -Project [from_json(StructField(a,DayTimeIntervalType(0,0),true), {"a":"1"}, Some(America/Los_Angeles)) AS from_json({"a":"1"})#x, to_json(from_json(StructField(a,DayTimeIntervalType(0,0),true), {"a":"1"}, Some(America/Los_Angeles)), Some(America/Los_Angeles)) AS to_json(from_json({"a":"1"}))#x, to_json(map(a, INTERVAL '100 02:10' DAY TO MINUTE), Some(America/Los_Angeles)) AS to_json(map(a, INTERVAL '100 02:10' DAY TO MINUTE))#x, from_json(StructField(a,DayTimeIntervalType(0,2),true), to_json(map(a, INTERVAL '100 02:10' DAY TO MINUTE), Some(America/Los_Angeles)), Some(America/Los_Angeles)) AS from_json(to_json(map(a, INTERVAL '100 02:10' DAY TO MINUTE)))#x] +Project [from_json(StructField(a,DayTimeIntervalType(0,0),true), {"a":"1"}, Some(America/Los_Angeles), false) AS from_json({"a":"1"})#x, to_json(from_json(StructField(a,DayTimeIntervalType(0,0),true), {"a":"1"}, Some(America/Los_Angeles), false), Some(America/Los_Angeles)) AS to_json(from_json({"a":"1"}))#x, to_json(map(a, INTERVAL '100 02:10' DAY TO MINUTE), Some(America/Los_Angeles)) AS to_json(map(a, INTERVAL '100 02:10' DAY TO MINUTE))#x, from_json(StructField(a,DayTimeIntervalType(0,2),true), to_json(map(a, INTERVAL '100 02:10' DAY TO MINUTE), Some(America/Los_Angeles)), Some(America/Los_Angeles), false) AS from_json(to_json(map(a, INTERVAL '100 02:10' DAY TO MINUTE)))#x] +- OneRowRelation @@ -2096,7 +2096,7 @@ SELECT to_json(map('a', interval 32 year 10 month)), from_json(to_json(map('a', interval 32 year 10 month)), 'a interval year to month') -- !query analysis -Project [from_json(StructField(a,YearMonthIntervalType(0,0),true), {"a":"1"}, Some(America/Los_Angeles)) AS from_json({"a":"1"})#x, to_json(from_json(StructField(a,YearMonthIntervalType(0,0),true), {"a":"1"}, Some(America/Los_Angeles)), Some(America/Los_Angeles)) AS to_json(from_json({"a":"1"}))#x, to_json(map(a, INTERVAL '32-10' YEAR TO MONTH), Some(America/Los_Angeles)) AS to_json(map(a, INTERVAL '32-10' YEAR TO MONTH))#x, from_json(StructField(a,YearMonthIntervalType(0,1),true), to_json(map(a, INTERVAL '32-10' YEAR TO MONTH), Some(America/Los_Angeles)), Some(America/Los_Angeles)) AS from_json(to_json(map(a, INTERVAL '32-10' YEAR TO MONTH)))#x] +Project [from_json(StructField(a,YearMonthIntervalType(0,0),true), {"a":"1"}, Some(America/Los_Angeles), false) AS from_json({"a":"1"})#x, to_json(from_json(StructField(a,YearMonthIntervalType(0,0),true), {"a":"1"}, Some(America/Los_Angeles), false), Some(America/Los_Angeles)) AS to_json(from_json({"a":"1"}))#x, to_json(map(a, INTERVAL '32-10' YEAR TO MONTH), Some(America/Los_Angeles)) AS to_json(map(a, INTERVAL '32-10' YEAR TO MONTH))#x, from_json(StructField(a,YearMonthIntervalType(0,1),true), to_json(map(a, INTERVAL '32-10' YEAR TO MONTH), Some(America/Los_Angeles)), Some(America/Los_Angeles), false) AS from_json(to_json(map(a, INTERVAL '32-10' YEAR TO MONTH)))#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/parse-schema-string.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/parse-schema-string.sql.out index 45fc3bd03a782..ae8e47ed3665c 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/parse-schema-string.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/parse-schema-string.sql.out @@ -16,12 +16,12 @@ Project [from_csv(StructField(cube,IntegerType,true), 1, Some(America/Los_Angele -- !query select from_json('{"create":1}', 'create INT') -- !query analysis -Project [from_json(StructField(create,IntegerType,true), {"create":1}, Some(America/Los_Angeles)) AS from_json({"create":1})#x] +Project [from_json(StructField(create,IntegerType,true), {"create":1}, Some(America/Los_Angeles), false) AS from_json({"create":1})#x] +- OneRowRelation -- !query select from_json('{"cube":1}', 'cube INT') -- !query analysis -Project [from_json(StructField(cube,IntegerType,true), {"cube":1}, Some(America/Los_Angeles)) AS from_json({"cube":1})#x] +Project [from_json(StructField(cube,IntegerType,true), {"cube":1}, Some(America/Los_Angeles), false) AS from_json({"cube":1})#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/timestamp.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/timestamp.sql.out index bf34490d657e3..560974d28c545 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/timestamp.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/timestamp.sql.out @@ -730,7 +730,7 @@ Project [unix_timestamp(22 05 2020 Friday, dd MM yyyy EEEEE, Some(America/Los_An -- !query select from_json('{"t":"26/October/2015"}', 't Timestamp', map('timestampFormat', 'dd/MMMMM/yyyy')) -- !query analysis -Project [from_json(StructField(t,TimestampType,true), (timestampFormat,dd/MMMMM/yyyy), {"t":"26/October/2015"}, Some(America/Los_Angeles)) AS from_json({"t":"26/October/2015"})#x] +Project [from_json(StructField(t,TimestampType,true), (timestampFormat,dd/MMMMM/yyyy), {"t":"26/October/2015"}, Some(America/Los_Angeles), false) AS from_json({"t":"26/October/2015"})#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/date.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/date.sql.out index 48137e06467e8..88c7d7b4e7d72 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/date.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/date.sql.out @@ -811,7 +811,7 @@ Project [to_date(26/October/2015, Some(dd/MMMMM/yyyy), Some(America/Los_Angeles) -- !query select from_json('{"d":"26/October/2015"}', 'd Date', map('dateFormat', 'dd/MMMMM/yyyy')) -- !query analysis -Project [from_json(StructField(d,DateType,true), (dateFormat,dd/MMMMM/yyyy), {"d":"26/October/2015"}, Some(America/Los_Angeles)) AS from_json({"d":"26/October/2015"})#x] +Project [from_json(StructField(d,DateType,true), (dateFormat,dd/MMMMM/yyyy), {"d":"26/October/2015"}, Some(America/Los_Angeles), false) AS from_json({"d":"26/October/2015"})#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/datetime-legacy.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/datetime-legacy.sql.out index 1e49f4df8267a..4221db822d024 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/datetime-legacy.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/datetime-legacy.sql.out @@ -811,7 +811,7 @@ Project [to_date(26/October/2015, Some(dd/MMMMM/yyyy), Some(America/Los_Angeles) -- !query select from_json('{"d":"26/October/2015"}', 'd Date', map('dateFormat', 'dd/MMMMM/yyyy')) -- !query analysis -Project [from_json(StructField(d,DateType,true), (dateFormat,dd/MMMMM/yyyy), {"d":"26/October/2015"}, Some(America/Los_Angeles)) AS from_json({"d":"26/October/2015"})#x] +Project [from_json(StructField(d,DateType,true), (dateFormat,dd/MMMMM/yyyy), {"d":"26/October/2015"}, Some(America/Los_Angeles), false) AS from_json({"d":"26/October/2015"})#x] +- OneRowRelation @@ -1833,7 +1833,7 @@ Project [unix_timestamp(22 05 2020 Friday, dd MM yyyy EEEEE, Some(America/Los_An -- !query select from_json('{"t":"26/October/2015"}', 't Timestamp', map('timestampFormat', 'dd/MMMMM/yyyy')) -- !query analysis -Project [from_json(StructField(t,TimestampType,true), (timestampFormat,dd/MMMMM/yyyy), {"t":"26/October/2015"}, Some(America/Los_Angeles)) AS from_json({"t":"26/October/2015"})#x] +Project [from_json(StructField(t,TimestampType,true), (timestampFormat,dd/MMMMM/yyyy), {"t":"26/October/2015"}, Some(America/Los_Angeles), false) AS from_json({"t":"26/October/2015"})#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/interval.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/interval.sql.out index 290e55052931d..d778f9983b78a 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/interval.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/interval.sql.out @@ -2074,7 +2074,7 @@ SELECT to_csv(named_struct('a', interval 32 year, 'b', interval 10 month)), from_csv(to_csv(named_struct('a', interval 32 year, 'b', interval 10 month)), 'a interval year, b interval month') -- !query analysis -Project [from_json(StructField(a,CalendarIntervalType,true), {"a":"1 days"}, Some(America/Los_Angeles)) AS from_json({"a":"1 days"})#x, from_csv(StructField(a,IntegerType,true), StructField(b,YearMonthIntervalType(0,0),true), 1, 1, Some(America/Los_Angeles), None) AS from_csv(1, 1)#x, to_json(from_json(StructField(a,CalendarIntervalType,true), {"a":"1 days"}, Some(America/Los_Angeles)), Some(America/Los_Angeles)) AS to_json(from_json({"a":"1 days"}))#x, to_csv(from_csv(StructField(a,IntegerType,true), StructField(b,YearMonthIntervalType(0,0),true), 1, 1, Some(America/Los_Angeles), None), Some(America/Los_Angeles)) AS to_csv(from_csv(1, 1))#x, to_csv(named_struct(a, INTERVAL '32' YEAR, b, INTERVAL '10' MONTH), Some(America/Los_Angeles)) AS to_csv(named_struct(a, INTERVAL '32' YEAR, b, INTERVAL '10' MONTH))#x, from_csv(StructField(a,YearMonthIntervalType(0,0),true), StructField(b,YearMonthIntervalType(1,1),true), to_csv(named_struct(a, INTERVAL '32' YEAR, b, INTERVAL '10' MONTH), Some(America/Los_Angeles)), Some(America/Los_Angeles), None) AS from_csv(to_csv(named_struct(a, INTERVAL '32' YEAR, b, INTERVAL '10' MONTH)))#x] +Project [from_json(StructField(a,CalendarIntervalType,true), {"a":"1 days"}, Some(America/Los_Angeles), false) AS from_json({"a":"1 days"})#x, from_csv(StructField(a,IntegerType,true), StructField(b,YearMonthIntervalType(0,0),true), 1, 1, Some(America/Los_Angeles), None) AS from_csv(1, 1)#x, to_json(from_json(StructField(a,CalendarIntervalType,true), {"a":"1 days"}, Some(America/Los_Angeles), false), Some(America/Los_Angeles)) AS to_json(from_json({"a":"1 days"}))#x, to_csv(from_csv(StructField(a,IntegerType,true), StructField(b,YearMonthIntervalType(0,0),true), 1, 1, Some(America/Los_Angeles), None), Some(America/Los_Angeles)) AS to_csv(from_csv(1, 1))#x, to_csv(named_struct(a, INTERVAL '32' YEAR, b, INTERVAL '10' MONTH), Some(America/Los_Angeles)) AS to_csv(named_struct(a, INTERVAL '32' YEAR, b, INTERVAL '10' MONTH))#x, from_csv(StructField(a,YearMonthIntervalType(0,0),true), StructField(b,YearMonthIntervalType(1,1),true), to_csv(named_struct(a, INTERVAL '32' YEAR, b, INTERVAL '10' MONTH), Some(America/Los_Angeles)), Some(America/Los_Angeles), None) AS from_csv(to_csv(named_struct(a, INTERVAL '32' YEAR, b, INTERVAL '10' MONTH)))#x] +- OneRowRelation @@ -2085,7 +2085,7 @@ SELECT to_json(map('a', interval 100 day 130 minute)), from_json(to_json(map('a', interval 100 day 130 minute)), 'a interval day to minute') -- !query analysis -Project [from_json(StructField(a,DayTimeIntervalType(0,0),true), {"a":"1"}, Some(America/Los_Angeles)) AS from_json({"a":"1"})#x, to_json(from_json(StructField(a,DayTimeIntervalType(0,0),true), {"a":"1"}, Some(America/Los_Angeles)), Some(America/Los_Angeles)) AS to_json(from_json({"a":"1"}))#x, to_json(map(a, INTERVAL '100 02:10' DAY TO MINUTE), Some(America/Los_Angeles)) AS to_json(map(a, INTERVAL '100 02:10' DAY TO MINUTE))#x, from_json(StructField(a,DayTimeIntervalType(0,2),true), to_json(map(a, INTERVAL '100 02:10' DAY TO MINUTE), Some(America/Los_Angeles)), Some(America/Los_Angeles)) AS from_json(to_json(map(a, INTERVAL '100 02:10' DAY TO MINUTE)))#x] +Project [from_json(StructField(a,DayTimeIntervalType(0,0),true), {"a":"1"}, Some(America/Los_Angeles), false) AS from_json({"a":"1"})#x, to_json(from_json(StructField(a,DayTimeIntervalType(0,0),true), {"a":"1"}, Some(America/Los_Angeles), false), Some(America/Los_Angeles)) AS to_json(from_json({"a":"1"}))#x, to_json(map(a, INTERVAL '100 02:10' DAY TO MINUTE), Some(America/Los_Angeles)) AS to_json(map(a, INTERVAL '100 02:10' DAY TO MINUTE))#x, from_json(StructField(a,DayTimeIntervalType(0,2),true), to_json(map(a, INTERVAL '100 02:10' DAY TO MINUTE), Some(America/Los_Angeles)), Some(America/Los_Angeles), false) AS from_json(to_json(map(a, INTERVAL '100 02:10' DAY TO MINUTE)))#x] +- OneRowRelation @@ -2096,7 +2096,7 @@ SELECT to_json(map('a', interval 32 year 10 month)), from_json(to_json(map('a', interval 32 year 10 month)), 'a interval year to month') -- !query analysis -Project [from_json(StructField(a,YearMonthIntervalType(0,0),true), {"a":"1"}, Some(America/Los_Angeles)) AS from_json({"a":"1"})#x, to_json(from_json(StructField(a,YearMonthIntervalType(0,0),true), {"a":"1"}, Some(America/Los_Angeles)), Some(America/Los_Angeles)) AS to_json(from_json({"a":"1"}))#x, to_json(map(a, INTERVAL '32-10' YEAR TO MONTH), Some(America/Los_Angeles)) AS to_json(map(a, INTERVAL '32-10' YEAR TO MONTH))#x, from_json(StructField(a,YearMonthIntervalType(0,1),true), to_json(map(a, INTERVAL '32-10' YEAR TO MONTH), Some(America/Los_Angeles)), Some(America/Los_Angeles)) AS from_json(to_json(map(a, INTERVAL '32-10' YEAR TO MONTH)))#x] +Project [from_json(StructField(a,YearMonthIntervalType(0,0),true), {"a":"1"}, Some(America/Los_Angeles), false) AS from_json({"a":"1"})#x, to_json(from_json(StructField(a,YearMonthIntervalType(0,0),true), {"a":"1"}, Some(America/Los_Angeles), false), Some(America/Los_Angeles)) AS to_json(from_json({"a":"1"}))#x, to_json(map(a, INTERVAL '32-10' YEAR TO MONTH), Some(America/Los_Angeles)) AS to_json(map(a, INTERVAL '32-10' YEAR TO MONTH))#x, from_json(StructField(a,YearMonthIntervalType(0,1),true), to_json(map(a, INTERVAL '32-10' YEAR TO MONTH), Some(America/Los_Angeles)), Some(America/Los_Angeles), false) AS from_json(to_json(map(a, INTERVAL '32-10' YEAR TO MONTH)))#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/json-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/json-functions.sql.out index 0d7c6b2056231..fef9d0c5b6250 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/json-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/json-functions.sql.out @@ -118,14 +118,14 @@ org.apache.spark.sql.AnalysisException -- !query select from_json('{"a":1}', 'a INT') -- !query analysis -Project [from_json(StructField(a,IntegerType,true), {"a":1}, Some(America/Los_Angeles)) AS from_json({"a":1})#x] +Project [from_json(StructField(a,IntegerType,true), {"a":1}, Some(America/Los_Angeles), false) AS from_json({"a":1})#x] +- OneRowRelation -- !query select from_json('{"time":"26/08/2015"}', 'time Timestamp', map('timestampFormat', 'dd/MM/yyyy')) -- !query analysis -Project [from_json(StructField(time,TimestampType,true), (timestampFormat,dd/MM/yyyy), {"time":"26/08/2015"}, Some(America/Los_Angeles)) AS from_json({"time":"26/08/2015"})#x] +Project [from_json(StructField(time,TimestampType,true), (timestampFormat,dd/MM/yyyy), {"time":"26/08/2015"}, Some(America/Los_Angeles), false) AS from_json({"time":"26/08/2015"})#x] +- OneRowRelation @@ -279,14 +279,14 @@ DropTempViewCommand jsonTable -- !query select from_json('{"a":1, "b":2}', 'map') -- !query analysis -Project [from_json(MapType(StringType,IntegerType,true), {"a":1, "b":2}, Some(America/Los_Angeles)) AS entries#x] +Project [from_json(MapType(StringType,IntegerType,true), {"a":1, "b":2}, Some(America/Los_Angeles), false) AS entries#x] +- OneRowRelation -- !query select from_json('{"a":1, "b":"2"}', 'struct') -- !query analysis -Project [from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), {"a":1, "b":"2"}, Some(America/Los_Angeles)) AS from_json({"a":1, "b":"2"})#x] +Project [from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), {"a":1, "b":"2"}, Some(America/Los_Angeles), false) AS from_json({"a":1, "b":"2"})#x] +- OneRowRelation @@ -300,70 +300,70 @@ Project [schema_of_json({"c1":0, "c2":[1]}) AS schema_of_json({"c1":0, "c2":[1]} -- !query select from_json('{"c1":[1, 2, 3]}', schema_of_json('{"c1":[0]}')) -- !query analysis -Project [from_json(StructField(c1,ArrayType(LongType,true),true), {"c1":[1, 2, 3]}, Some(America/Los_Angeles)) AS from_json({"c1":[1, 2, 3]})#x] +Project [from_json(StructField(c1,ArrayType(LongType,true),true), {"c1":[1, 2, 3]}, Some(America/Los_Angeles), false) AS from_json({"c1":[1, 2, 3]})#x] +- OneRowRelation -- !query select from_json('[1, 2, 3]', 'array') -- !query analysis -Project [from_json(ArrayType(IntegerType,true), [1, 2, 3], Some(America/Los_Angeles)) AS from_json([1, 2, 3])#x] +Project [from_json(ArrayType(IntegerType,true), [1, 2, 3], Some(America/Los_Angeles), false) AS from_json([1, 2, 3])#x] +- OneRowRelation -- !query select from_json('[1, "2", 3]', 'array') -- !query analysis -Project [from_json(ArrayType(IntegerType,true), [1, "2", 3], Some(America/Los_Angeles)) AS from_json([1, "2", 3])#x] +Project [from_json(ArrayType(IntegerType,true), [1, "2", 3], Some(America/Los_Angeles), false) AS from_json([1, "2", 3])#x] +- OneRowRelation -- !query select from_json('[1, 2, null]', 'array') -- !query analysis -Project [from_json(ArrayType(IntegerType,true), [1, 2, null], Some(America/Los_Angeles)) AS from_json([1, 2, null])#x] +Project [from_json(ArrayType(IntegerType,true), [1, 2, null], Some(America/Los_Angeles), false) AS from_json([1, 2, null])#x] +- OneRowRelation -- !query select from_json('[{"a": 1}, {"a":2}]', 'array>') -- !query analysis -Project [from_json(ArrayType(StructType(StructField(a,IntegerType,true)),true), [{"a": 1}, {"a":2}], Some(America/Los_Angeles)) AS from_json([{"a": 1}, {"a":2}])#x] +Project [from_json(ArrayType(StructType(StructField(a,IntegerType,true)),true), [{"a": 1}, {"a":2}], Some(America/Los_Angeles), false) AS from_json([{"a": 1}, {"a":2}])#x] +- OneRowRelation -- !query select from_json('{"a": 1}', 'array>') -- !query analysis -Project [from_json(ArrayType(StructType(StructField(a,IntegerType,true)),true), {"a": 1}, Some(America/Los_Angeles)) AS from_json({"a": 1})#x] +Project [from_json(ArrayType(StructType(StructField(a,IntegerType,true)),true), {"a": 1}, Some(America/Los_Angeles), false) AS from_json({"a": 1})#x] +- OneRowRelation -- !query select from_json('[null, {"a":2}]', 'array>') -- !query analysis -Project [from_json(ArrayType(StructType(StructField(a,IntegerType,true)),true), [null, {"a":2}], Some(America/Los_Angeles)) AS from_json([null, {"a":2}])#x] +Project [from_json(ArrayType(StructType(StructField(a,IntegerType,true)),true), [null, {"a":2}], Some(America/Los_Angeles), false) AS from_json([null, {"a":2}])#x] +- OneRowRelation -- !query select from_json('[{"a": 1}, {"b":2}]', 'array>') -- !query analysis -Project [from_json(ArrayType(MapType(StringType,IntegerType,true),true), [{"a": 1}, {"b":2}], Some(America/Los_Angeles)) AS from_json([{"a": 1}, {"b":2}])#x] +Project [from_json(ArrayType(MapType(StringType,IntegerType,true),true), [{"a": 1}, {"b":2}], Some(America/Los_Angeles), false) AS from_json([{"a": 1}, {"b":2}])#x] +- OneRowRelation -- !query select from_json('[{"a": 1}, 2]', 'array>') -- !query analysis -Project [from_json(ArrayType(MapType(StringType,IntegerType,true),true), [{"a": 1}, 2], Some(America/Los_Angeles)) AS from_json([{"a": 1}, 2])#x] +Project [from_json(ArrayType(MapType(StringType,IntegerType,true),true), [{"a": 1}, 2], Some(America/Los_Angeles), false) AS from_json([{"a": 1}, 2])#x] +- OneRowRelation -- !query select from_json('{"d": "2012-12-15", "t": "2012-12-15 15:15:15"}', 'd date, t timestamp') -- !query analysis -Project [from_json(StructField(d,DateType,true), StructField(t,TimestampType,true), {"d": "2012-12-15", "t": "2012-12-15 15:15:15"}, Some(America/Los_Angeles)) AS from_json({"d": "2012-12-15", "t": "2012-12-15 15:15:15"})#x] +Project [from_json(StructField(d,DateType,true), StructField(t,TimestampType,true), {"d": "2012-12-15", "t": "2012-12-15 15:15:15"}, Some(America/Los_Angeles), false) AS from_json({"d": "2012-12-15", "t": "2012-12-15 15:15:15"})#x] +- OneRowRelation @@ -373,7 +373,7 @@ select from_json( 'd date, t timestamp', map('dateFormat', 'MM/dd yyyy', 'timestampFormat', 'MM/dd yyyy HH:mm:ss')) -- !query analysis -Project [from_json(StructField(d,DateType,true), StructField(t,TimestampType,true), (dateFormat,MM/dd yyyy), (timestampFormat,MM/dd yyyy HH:mm:ss), {"d": "12/15 2012", "t": "12/15 2012 15:15:15"}, Some(America/Los_Angeles)) AS from_json({"d": "12/15 2012", "t": "12/15 2012 15:15:15"})#x] +Project [from_json(StructField(d,DateType,true), StructField(t,TimestampType,true), (dateFormat,MM/dd yyyy), (timestampFormat,MM/dd yyyy HH:mm:ss), {"d": "12/15 2012", "t": "12/15 2012 15:15:15"}, Some(America/Los_Angeles), false) AS from_json({"d": "12/15 2012", "t": "12/15 2012 15:15:15"})#x] +- OneRowRelation @@ -383,7 +383,7 @@ select from_json( 'd date', map('dateFormat', 'MM-dd')) -- !query analysis -Project [from_json(StructField(d,DateType,true), (dateFormat,MM-dd), {"d": "02-29"}, Some(America/Los_Angeles)) AS from_json({"d": "02-29"})#x] +Project [from_json(StructField(d,DateType,true), (dateFormat,MM-dd), {"d": "02-29"}, Some(America/Los_Angeles), false) AS from_json({"d": "02-29"})#x] +- OneRowRelation @@ -393,7 +393,7 @@ select from_json( 't timestamp', map('timestampFormat', 'MM-dd')) -- !query analysis -Project [from_json(StructField(t,TimestampType,true), (timestampFormat,MM-dd), {"t": "02-29"}, Some(America/Los_Angeles)) AS from_json({"t": "02-29"})#x] +Project [from_json(StructField(t,TimestampType,true), (timestampFormat,MM-dd), {"t": "02-29"}, Some(America/Los_Angeles), false) AS from_json({"t": "02-29"})#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/parse-schema-string.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/parse-schema-string.sql.out index 45fc3bd03a782..ae8e47ed3665c 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/parse-schema-string.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/parse-schema-string.sql.out @@ -16,12 +16,12 @@ Project [from_csv(StructField(cube,IntegerType,true), 1, Some(America/Los_Angele -- !query select from_json('{"create":1}', 'create INT') -- !query analysis -Project [from_json(StructField(create,IntegerType,true), {"create":1}, Some(America/Los_Angeles)) AS from_json({"create":1})#x] +Project [from_json(StructField(create,IntegerType,true), {"create":1}, Some(America/Los_Angeles), false) AS from_json({"create":1})#x] +- OneRowRelation -- !query select from_json('{"cube":1}', 'cube INT') -- !query analysis -Project [from_json(StructField(cube,IntegerType,true), {"cube":1}, Some(America/Los_Angeles)) AS from_json({"cube":1})#x] +Project [from_json(StructField(cube,IntegerType,true), {"cube":1}, Some(America/Los_Angeles), false) AS from_json({"cube":1})#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/sql-session-variables.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/sql-session-variables.sql.out index eb48f0d9a28f0..7c8908711a8eb 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/sql-session-variables.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/sql-session-variables.sql.out @@ -2151,7 +2151,7 @@ CreateVariable defaultvalueexpression(cast(a INT as string), 'a INT'), true -- !query SELECT from_json('{"a": 1}', var1) -- !query analysis -Project [from_json(StructField(a,IntegerType,true), {"a": 1}, Some(America/Los_Angeles)) AS from_json({"a": 1})#x] +Project [from_json(StructField(a,IntegerType,true), {"a": 1}, Some(America/Los_Angeles), false) AS from_json({"a": 1})#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subexp-elimination.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subexp-elimination.sql.out index 94073f2751b3e..754b05bfa6fed 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subexp-elimination.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subexp-elimination.sql.out @@ -15,7 +15,7 @@ AS testData(a, b), false, true, LocalTempView, UNSUPPORTED, true -- !query SELECT from_json(a, 'struct').a, from_json(a, 'struct').b, from_json(b, 'array>')[0].a, from_json(b, 'array>')[0].b FROM testData -- !query analysis -Project [from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles)).a AS from_json(a).a#x, from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles)).b AS from_json(a).b#x, from_json(ArrayType(StructType(StructField(a,IntegerType,true),StructField(b,IntegerType,true)),true), b#x, Some(America/Los_Angeles))[0].a AS from_json(b)[0].a#x, from_json(ArrayType(StructType(StructField(a,IntegerType,true),StructField(b,IntegerType,true)),true), b#x, Some(America/Los_Angeles))[0].b AS from_json(b)[0].b#x] +Project [from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles), false).a AS from_json(a).a#x, from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles), false).b AS from_json(a).b#x, from_json(ArrayType(StructType(StructField(a,IntegerType,true),StructField(b,IntegerType,true)),true), b#x, Some(America/Los_Angeles), false)[0].a AS from_json(b)[0].a#x, from_json(ArrayType(StructType(StructField(a,IntegerType,true),StructField(b,IntegerType,true)),true), b#x, Some(America/Los_Angeles), false)[0].b AS from_json(b)[0].b#x] +- SubqueryAlias testdata +- View (`testData`, [a#x, b#x]) +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] @@ -27,7 +27,7 @@ Project [from_json(StructField(a,IntegerType,true), StructField(b,StringType,tru -- !query SELECT if(from_json(a, 'struct').a > 1, from_json(b, 'array>')[0].a, from_json(b, 'array>')[0].a + 1) FROM testData -- !query analysis -Project [if ((from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles)).a > 1)) from_json(ArrayType(StructType(StructField(a,IntegerType,true),StructField(b,IntegerType,true)),true), b#x, Some(America/Los_Angeles))[0].a else (from_json(ArrayType(StructType(StructField(a,IntegerType,true),StructField(b,IntegerType,true)),true), b#x, Some(America/Los_Angeles))[0].a + 1) AS (IF((from_json(a).a > 1), from_json(b)[0].a, (from_json(b)[0].a + 1)))#x] +Project [if ((from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles), false).a > 1)) from_json(ArrayType(StructType(StructField(a,IntegerType,true),StructField(b,IntegerType,true)),true), b#x, Some(America/Los_Angeles), false)[0].a else (from_json(ArrayType(StructType(StructField(a,IntegerType,true),StructField(b,IntegerType,true)),true), b#x, Some(America/Los_Angeles), false)[0].a + 1) AS (IF((from_json(a).a > 1), from_json(b)[0].a, (from_json(b)[0].a + 1)))#x] +- SubqueryAlias testdata +- View (`testData`, [a#x, b#x]) +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] @@ -39,7 +39,7 @@ Project [if ((from_json(StructField(a,IntegerType,true), StructField(b,StringTyp -- !query SELECT if(isnull(from_json(a, 'struct').a), from_json(b, 'array>')[0].b + 1, from_json(b, 'array>')[0].b) FROM testData -- !query analysis -Project [if (isnull(from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles)).a)) (from_json(ArrayType(StructType(StructField(a,IntegerType,true),StructField(b,IntegerType,true)),true), b#x, Some(America/Los_Angeles))[0].b + 1) else from_json(ArrayType(StructType(StructField(a,IntegerType,true),StructField(b,IntegerType,true)),true), b#x, Some(America/Los_Angeles))[0].b AS (IF((from_json(a).a IS NULL), (from_json(b)[0].b + 1), from_json(b)[0].b))#x] +Project [if (isnull(from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles), false).a)) (from_json(ArrayType(StructType(StructField(a,IntegerType,true),StructField(b,IntegerType,true)),true), b#x, Some(America/Los_Angeles), false)[0].b + 1) else from_json(ArrayType(StructType(StructField(a,IntegerType,true),StructField(b,IntegerType,true)),true), b#x, Some(America/Los_Angeles), false)[0].b AS (IF((from_json(a).a IS NULL), (from_json(b)[0].b + 1), from_json(b)[0].b))#x] +- SubqueryAlias testdata +- View (`testData`, [a#x, b#x]) +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] @@ -51,7 +51,7 @@ Project [if (isnull(from_json(StructField(a,IntegerType,true), StructField(b,Str -- !query SELECT case when from_json(a, 'struct').a > 5 then from_json(a, 'struct').b when from_json(a, 'struct').a > 4 then from_json(a, 'struct').b + 1 else from_json(a, 'struct').b + 2 end FROM testData -- !query analysis -Project [CASE WHEN (from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles)).a > 5) THEN from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles)).b WHEN (from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles)).a > 4) THEN cast((cast(from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles)).b as double) + cast(1 as double)) as string) ELSE cast((cast(from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles)).b as double) + cast(2 as double)) as string) END AS CASE WHEN (from_json(a).a > 5) THEN from_json(a).b WHEN (from_json(a).a > 4) THEN (from_json(a).b + 1) ELSE (from_json(a).b + 2) END#x] +Project [CASE WHEN (from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles), false).a > 5) THEN from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles), false).b WHEN (from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles), false).a > 4) THEN cast((cast(from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles), false).b as double) + cast(1 as double)) as string) ELSE cast((cast(from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles), false).b as double) + cast(2 as double)) as string) END AS CASE WHEN (from_json(a).a > 5) THEN from_json(a).b WHEN (from_json(a).a > 4) THEN (from_json(a).b + 1) ELSE (from_json(a).b + 2) END#x] +- SubqueryAlias testdata +- View (`testData`, [a#x, b#x]) +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] @@ -63,7 +63,7 @@ Project [CASE WHEN (from_json(StructField(a,IntegerType,true), StructField(b,Str -- !query SELECT case when from_json(a, 'struct').a > 5 then from_json(b, 'array>')[0].b when from_json(a, 'struct').a > 4 then from_json(b, 'array>')[0].b + 1 else from_json(b, 'array>')[0].b + 2 end FROM testData -- !query analysis -Project [CASE WHEN (from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles)).a > 5) THEN from_json(ArrayType(StructType(StructField(a,IntegerType,true),StructField(b,IntegerType,true)),true), b#x, Some(America/Los_Angeles))[0].b WHEN (from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles)).a > 4) THEN (from_json(ArrayType(StructType(StructField(a,IntegerType,true),StructField(b,IntegerType,true)),true), b#x, Some(America/Los_Angeles))[0].b + 1) ELSE (from_json(ArrayType(StructType(StructField(a,IntegerType,true),StructField(b,IntegerType,true)),true), b#x, Some(America/Los_Angeles))[0].b + 2) END AS CASE WHEN (from_json(a).a > 5) THEN from_json(b)[0].b WHEN (from_json(a).a > 4) THEN (from_json(b)[0].b + 1) ELSE (from_json(b)[0].b + 2) END#x] +Project [CASE WHEN (from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles), false).a > 5) THEN from_json(ArrayType(StructType(StructField(a,IntegerType,true),StructField(b,IntegerType,true)),true), b#x, Some(America/Los_Angeles), false)[0].b WHEN (from_json(StructField(a,IntegerType,true), StructField(b,StringType,true), a#x, Some(America/Los_Angeles), false).a > 4) THEN (from_json(ArrayType(StructType(StructField(a,IntegerType,true),StructField(b,IntegerType,true)),true), b#x, Some(America/Los_Angeles), false)[0].b + 1) ELSE (from_json(ArrayType(StructType(StructField(a,IntegerType,true),StructField(b,IntegerType,true)),true), b#x, Some(America/Los_Angeles), false)[0].b + 2) END AS CASE WHEN (from_json(a).a > 5) THEN from_json(b)[0].b WHEN (from_json(a).a > 4) THEN (from_json(b)[0].b + 1) ELSE (from_json(b)[0].b + 2) END#x] +- SubqueryAlias testdata +- View (`testData`, [a#x, b#x]) +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/timestamp.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/timestamp.sql.out index 6ca35b8b141dc..dcfd783b648f8 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/timestamp.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/timestamp.sql.out @@ -802,7 +802,7 @@ Project [unix_timestamp(22 05 2020 Friday, dd MM yyyy EEEEE, Some(America/Los_An -- !query select from_json('{"t":"26/October/2015"}', 't Timestamp', map('timestampFormat', 'dd/MMMMM/yyyy')) -- !query analysis -Project [from_json(StructField(t,TimestampType,true), (timestampFormat,dd/MMMMM/yyyy), {"t":"26/October/2015"}, Some(America/Los_Angeles)) AS from_json({"t":"26/October/2015"})#x] +Project [from_json(StructField(t,TimestampType,true), (timestampFormat,dd/MMMMM/yyyy), {"t":"26/October/2015"}, Some(America/Los_Angeles), false) AS from_json({"t":"26/October/2015"})#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/timestampNTZ/timestamp-ansi.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/timestampNTZ/timestamp-ansi.sql.out index e50c860270563..ec227afc87fe1 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/timestampNTZ/timestamp-ansi.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/timestampNTZ/timestamp-ansi.sql.out @@ -745,7 +745,7 @@ Project [unix_timestamp(22 05 2020 Friday, dd MM yyyy EEEEE, Some(America/Los_An -- !query select from_json('{"t":"26/October/2015"}', 't Timestamp', map('timestampFormat', 'dd/MMMMM/yyyy')) -- !query analysis -Project [from_json(StructField(t,TimestampNTZType,true), (timestampFormat,dd/MMMMM/yyyy), {"t":"26/October/2015"}, Some(America/Los_Angeles)) AS from_json({"t":"26/October/2015"})#x] +Project [from_json(StructField(t,TimestampNTZType,true), (timestampFormat,dd/MMMMM/yyyy), {"t":"26/October/2015"}, Some(America/Los_Angeles), false) AS from_json({"t":"26/October/2015"})#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/timestampNTZ/timestamp.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/timestampNTZ/timestamp.sql.out index 098abfb3852cf..7475f837250d5 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/timestampNTZ/timestamp.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/timestampNTZ/timestamp.sql.out @@ -805,7 +805,7 @@ Project [unix_timestamp(22 05 2020 Friday, dd MM yyyy EEEEE, Some(America/Los_An -- !query select from_json('{"t":"26/October/2015"}', 't Timestamp', map('timestampFormat', 'dd/MMMMM/yyyy')) -- !query analysis -Project [from_json(StructField(t,TimestampNTZType,true), (timestampFormat,dd/MMMMM/yyyy), {"t":"26/October/2015"}, Some(America/Los_Angeles)) AS from_json({"t":"26/October/2015"})#x] +Project [from_json(StructField(t,TimestampNTZType,true), (timestampFormat,dd/MMMMM/yyyy), {"t":"26/October/2015"}, Some(America/Los_Angeles), false) AS from_json({"t":"26/October/2015"})#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/stringCastAndExpressions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/stringCastAndExpressions.sql.out index 009e91f7ffacf..22e60d0606382 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/stringCastAndExpressions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/stringCastAndExpressions.sql.out @@ -370,7 +370,7 @@ Project [c0#x] -- !query select from_json(a, 'a INT') from t -- !query analysis -Project [from_json(StructField(a,IntegerType,true), a#x, Some(America/Los_Angeles)) AS from_json(a)#x] +Project [from_json(StructField(a,IntegerType,true), a#x, Some(America/Los_Angeles), false) AS from_json(a)#x] +- SubqueryAlias t +- View (`t`, [a#x]) +- Project [cast(a#x as string) AS a#x] From 651f9479c5a26a33e743d7cbbc95590fb10461b3 Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Mon, 7 Oct 2024 18:47:33 -0700 Subject: [PATCH 04/20] added bug fix --- .../expressions/regexpExpressions.scala | 20 ++++++++++++++-- .../spark/sql/CollationSQLRegexpSuite.scala | 23 +++++++++++++++++++ 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 970397c76a1cd..4cd6ca5720cc0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -25,6 +25,7 @@ import scala.jdk.CollectionConverters._ import org.apache.commons.text.StringEscapeUtils +import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{DataTypeMismatch, TypeCheckSuccess} @@ -700,7 +701,14 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio m.region(position, source.length) result.delete(0, result.length()) while (m.find) { - m.appendReplacement(result, lastReplacement) + try { + m.appendReplacement(result, lastReplacement) + } catch { + case e: Exception => + throw new SparkException(s"Could not perform regexp_replace for " + + s"""`input = "$s"`, `pattern = "$p"`, `replacement = "$r"` """ + + s"""and `position = $i`""", e) + } } m.appendTail(result) UTF8String.fromString(result.toString) @@ -719,6 +727,7 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio val termResult = ctx.freshName("termResult") val classNameStringBuffer = classOf[java.lang.StringBuffer].getCanonicalName + val classNameSparkException = classOf[SparkException].getCanonicalName val matcher = ctx.freshName("matcher") val source = ctx.freshName("source") @@ -748,7 +757,14 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio $matcher.region($position, $source.length()); while ($matcher.find()) { - $matcher.appendReplacement($termResult, $termLastReplacement); + try { + $matcher.appendReplacement($termResult, $termLastReplacement); + } catch (Exception e) { + throw new $classNameSparkException(java.text.MessageFormat.format( + "Could not perform regexp_replace for " + + "`input = \\"{0}\\"`, `pattern = \\"{1}\\"`, `replacement = \\"{2}\\"` and " + + "`position = {3}`", $source, $regexp, $rep, $pos), e); + } } $matcher.appendTail($termResult); ${ev.value} = UTF8String.fromString($termResult.toString()); diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala index 87dbbc65a3936..37ef622bc764b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql +import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.Project import org.apache.spark.sql.internal.SqlApiConf @@ -96,6 +97,28 @@ class CollationSQLRegexpSuite } } + test("RegExpReplace throws the right exception when replace fails on a particular row") { + val tableName = "regexpReplaceException" + withTable(tableName) { + Seq("NO_CODEGEN", "CODEGEN_ONLY").foreach { codegenMode => + withSQLConf("spark.sql.codegen.factoryMode" -> codegenMode) { + sql(s"CREATE TABLE IF NOT EXISTS $tableName(s STRING)") + sql(s"INSERT INTO $tableName VALUES('first last')") + val query = s"SELECT regexp_replace(s, '(?[a-zA-Z]+) (?[a-zA-Z]+)', " + + s"'$$3 $$1') FROM $tableName" + + val exception = intercept[SparkException] { + sql(query).collect() + } + assert(exception.getMessage.contains("""Could not perform regexp_replace for """ + + """`input = "first last"`, `pattern = "(?[a-zA-Z]+) (?[a-zA-Z]+)"`, """ + + """`replacement = "$3 $1"` and `position = 1`""")) + assert(exception.getMessage.contains("No group 3")) + } + } + } + } + test("Like simplification should work with collated strings (for default collation)") { val tableNameBinary = "T_BINARY" withTable(tableNameBinary) { From 3e2fffe02e32746423c77ab44676cdcda8e09c15 Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Mon, 7 Oct 2024 18:57:12 -0700 Subject: [PATCH 05/20] minor change --- .../spark/sql/catalyst/expressions/regexpExpressions.scala | 4 ++-- .../scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 4cd6ca5720cc0..28caab6271bce 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -706,7 +706,7 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio } catch { case e: Exception => throw new SparkException(s"Could not perform regexp_replace for " + - s"""`input = "$s"`, `pattern = "$p"`, `replacement = "$r"` """ + + s"""`source = "$s"`, `pattern = "$p"`, `replacement = "$r"` """ + s"""and `position = $i`""", e) } } @@ -762,7 +762,7 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio } catch (Exception e) { throw new $classNameSparkException(java.text.MessageFormat.format( "Could not perform regexp_replace for " + - "`input = \\"{0}\\"`, `pattern = \\"{1}\\"`, `replacement = \\"{2}\\"` and " + + "`source = \\"{0}\\"`, `pattern = \\"{1}\\"`, `replacement = \\"{2}\\"` and " + "`position = {3}`", $source, $regexp, $rep, $pos), e); } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala index 37ef622bc764b..54084c2cb3582 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala @@ -111,7 +111,7 @@ class CollationSQLRegexpSuite sql(query).collect() } assert(exception.getMessage.contains("""Could not perform regexp_replace for """ + - """`input = "first last"`, `pattern = "(?[a-zA-Z]+) (?[a-zA-Z]+)"`, """ + + """`source = "first last"`, `pattern = "(?[a-zA-Z]+) (?[a-zA-Z]+)"`, """ + """`replacement = "$3 $1"` and `position = 1`""")) assert(exception.getMessage.contains("No group 3")) } From f538a5153d8cc49036bbff9a459975713b9babb5 Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Mon, 7 Oct 2024 19:07:14 -0700 Subject: [PATCH 06/20] added codegen check --- .../org/apache/spark/sql/CollationSQLRegexpSuite.scala | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala index 54084c2cb3582..a5ae5edcf8962 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.Project +import org.apache.spark.sql.execution.WholeStageCodegenExec import org.apache.spark.sql.internal.SqlApiConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.{ArrayType, BooleanType, IntegerType, StringType} @@ -106,9 +107,11 @@ class CollationSQLRegexpSuite sql(s"INSERT INTO $tableName VALUES('first last')") val query = s"SELECT regexp_replace(s, '(?[a-zA-Z]+) (?[a-zA-Z]+)', " + s"'$$3 $$1') FROM $tableName" - + val df = sql(query) + val plan = df.queryExecution.executedPlan + assert(plan.isInstanceOf[WholeStageCodegenExec] == (codegenMode == "CODEGEN_ONLY")) val exception = intercept[SparkException] { - sql(query).collect() + df.collect() } assert(exception.getMessage.contains("""Could not perform regexp_replace for """ + """`source = "first last"`, `pattern = "(?[a-zA-Z]+) (?[a-zA-Z]+)"`, """ + From d331e57dcec78ddb7de15d01d556736aff444596 Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Thu, 10 Oct 2024 15:38:33 -0700 Subject: [PATCH 07/20] Incorporated @Max Gekk's suggestion --- .../main/resources/error/error-conditions.json | 6 ++++++ .../expressions/regexpExpressions.scala | 13 ++++--------- .../sql/errors/QueryExecutionErrors.scala | 18 ++++++++++++++++++ .../spark/sql/CollationSQLRegexpSuite.scala | 9 +++++---- 4 files changed, 33 insertions(+), 13 deletions(-) diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json index e83202d9e5ee3..49ff93741ec17 100644 --- a/common/utils/src/main/resources/error/error-conditions.json +++ b/common/utils/src/main/resources/error/error-conditions.json @@ -3189,6 +3189,12 @@ ], "sqlState" : "42K09" }, + "INVALID_REGEXP_REPLACE" : { + "message" : [ + "Could not perform regexp_replace for `source = \"\"`, `pattern = \"\"`, `replacement = \"\"` and `position = `." + ], + "sqlState" : "22023" + }, "INVALID_VARIANT_CAST" : { "message" : [ "The variant value `` cannot be cast into ``. Please use `try_variant_get` instead." diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 28caab6271bce..46ee7e9d2f0fb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -25,7 +25,6 @@ import scala.jdk.CollectionConverters._ import org.apache.commons.text.StringEscapeUtils -import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{DataTypeMismatch, TypeCheckSuccess} @@ -705,9 +704,8 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio m.appendReplacement(result, lastReplacement) } catch { case e: Exception => - throw new SparkException(s"Could not perform regexp_replace for " + - s"""`source = "$s"`, `pattern = "$p"`, `replacement = "$r"` """ + - s"""and `position = $i`""", e) + throw QueryExecutionErrors.invalidRegexpReplaceError(s.toString, + p.toString, r.toString, i.asInstanceOf[Int], e) } } m.appendTail(result) @@ -727,7 +725,6 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio val termResult = ctx.freshName("termResult") val classNameStringBuffer = classOf[java.lang.StringBuffer].getCanonicalName - val classNameSparkException = classOf[SparkException].getCanonicalName val matcher = ctx.freshName("matcher") val source = ctx.freshName("source") @@ -760,10 +757,8 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio try { $matcher.appendReplacement($termResult, $termLastReplacement); } catch (Exception e) { - throw new $classNameSparkException(java.text.MessageFormat.format( - "Could not perform regexp_replace for " + - "`source = \\"{0}\\"`, `pattern = \\"{1}\\"`, `replacement = \\"{2}\\"` and " + - "`position = {3}`", $source, $regexp, $rep, $pos), e); + throw QueryExecutionErrors.invalidRegexpReplaceError($source, $regexp.toString(), + $rep.toString(), $pos, e); } } $matcher.appendTail($termResult); diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala index 4a23e9766fc5d..aebdf1160d808 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala @@ -362,6 +362,24 @@ private[sql] object QueryExecutionErrors extends QueryErrorsBase with ExecutionE "groupIndex" -> groupIndex.toString())) } + def invalidRegexpReplaceError( + source: String, + pattern: String, + replacement: String, + position: Int, + cause: Throwable): RuntimeException = { + new SparkRuntimeException( + errorClass = "INVALID_REGEXP_REPLACE", + messageParameters = Map( + "source" -> source, + "pattern" -> pattern, + "replacement" -> replacement, + "position" -> position.toString + ), + cause = cause + ) + } + def invalidUrlError(url: UTF8String, e: URISyntaxException): SparkIllegalArgumentException = { new SparkIllegalArgumentException( errorClass = "INVALID_URL", diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala index a5ae5edcf8962..8e422e0234788 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql -import org.apache.spark.SparkException +import org.apache.spark.SparkRuntimeException import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.Project import org.apache.spark.sql.execution.WholeStageCodegenExec @@ -110,13 +110,14 @@ class CollationSQLRegexpSuite val df = sql(query) val plan = df.queryExecution.executedPlan assert(plan.isInstanceOf[WholeStageCodegenExec] == (codegenMode == "CODEGEN_ONLY")) - val exception = intercept[SparkException] { + val exception = intercept[SparkRuntimeException] { df.collect() } + assert(exception.getErrorClass == "INVALID_REGEXP_REPLACE") assert(exception.getMessage.contains("""Could not perform regexp_replace for """ + """`source = "first last"`, `pattern = "(?[a-zA-Z]+) (?[a-zA-Z]+)"`, """ + - """`replacement = "$3 $1"` and `position = 1`""")) - assert(exception.getMessage.contains("No group 3")) + """`replacement = "$3 $1"` and `position = 1`.""")) + assert(exception.getCause.getMessage.contains("No group 3")) } } } From aefcf704dd2f2143b0c381d70b4bd98f7b9d7512 Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Mon, 14 Oct 2024 13:59:58 -0700 Subject: [PATCH 08/20] fix --- .../src/main/resources/error/error-conditions.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json index 49ff93741ec17..365795c745d9e 100644 --- a/common/utils/src/main/resources/error/error-conditions.json +++ b/common/utils/src/main/resources/error/error-conditions.json @@ -2907,6 +2907,12 @@ ], "sqlState" : "42613" }, + "INVALID_REGEXP_REPLACE" : { + "message" : [ + "Could not perform regexp_replace for `source = \"\"`, `pattern = \"\"`, `replacement = \"\"` and `position = `." + ], + "sqlState" : "22023" + }, "INVALID_SAVE_MODE" : { "message" : [ "The specified save mode is invalid. Valid save modes include \"append\", \"overwrite\", \"ignore\", \"error\", \"errorifexists\", and \"default\"." @@ -3189,12 +3195,6 @@ ], "sqlState" : "42K09" }, - "INVALID_REGEXP_REPLACE" : { - "message" : [ - "Could not perform regexp_replace for `source = \"\"`, `pattern = \"\"`, `replacement = \"\"` and `position = `." - ], - "sqlState" : "22023" - }, "INVALID_VARIANT_CAST" : { "message" : [ "The variant value `` cannot be cast into ``. Please use `try_variant_get` instead." From cf722df9e98fc912cb4f7430c4ca16c60e9a512f Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Mon, 14 Oct 2024 16:03:06 -0700 Subject: [PATCH 09/20] Update CollationSQLRegexpSuite.scala --- .../scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala index 8e422e0234788..4ee35ce4d09a2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala @@ -113,7 +113,7 @@ class CollationSQLRegexpSuite val exception = intercept[SparkRuntimeException] { df.collect() } - assert(exception.getErrorClass == "INVALID_REGEXP_REPLACE") + assert(exception.getCondition == "INVALID_REGEXP_REPLACE") assert(exception.getMessage.contains("""Could not perform regexp_replace for """ + """`source = "first last"`, `pattern = "(?[a-zA-Z]+) (?[a-zA-Z]+)"`, """ + """`replacement = "$3 $1"` and `position = 1`.""")) From 361a5fd23ba35b91f39b4c109a31151549acf30b Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Wed, 16 Oct 2024 15:11:49 -0700 Subject: [PATCH 10/20] improvements recommended by @MaxGekk --- .../src/main/resources/error/error-conditions.json | 2 +- .../apache/spark/sql/CollationSQLRegexpSuite.scala | 13 +++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json index 365795c745d9e..920c1922c522c 100644 --- a/common/utils/src/main/resources/error/error-conditions.json +++ b/common/utils/src/main/resources/error/error-conditions.json @@ -2909,7 +2909,7 @@ }, "INVALID_REGEXP_REPLACE" : { "message" : [ - "Could not perform regexp_replace for `source = \"\"`, `pattern = \"\"`, `replacement = \"\"` and `position = `." + "Could not perform regexp_replace for source = \"\", pattern = \"\", replacement = \"\" and position = ." ], "sqlState" : "22023" }, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala index 4ee35ce4d09a2..360882d79e49d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala @@ -113,10 +113,15 @@ class CollationSQLRegexpSuite val exception = intercept[SparkRuntimeException] { df.collect() } - assert(exception.getCondition == "INVALID_REGEXP_REPLACE") - assert(exception.getMessage.contains("""Could not perform regexp_replace for """ + - """`source = "first last"`, `pattern = "(?[a-zA-Z]+) (?[a-zA-Z]+)"`, """ + - """`replacement = "$3 $1"` and `position = 1`.""")) + checkError( + exception = exception, + condition = "INVALID_REGEXP_REPLACE", + parameters = Map( + "source" -> "first last", + "pattern" -> "(?[a-zA-Z]+) (?[a-zA-Z]+)", + "replacement" -> "$3 $1", + "position" -> "1") + ) assert(exception.getCause.getMessage.contains("No group 3")) } } From 613dfbb1a061f2f4ec08900c0e709337bbfc8d6c Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Thu, 17 Oct 2024 14:30:35 -0700 Subject: [PATCH 11/20] added @MaxGekk's recommendations --- .../org/apache/spark/sql/CollationSQLRegexpSuite.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala index 360882d79e49d..2ee7456deebb9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala @@ -21,7 +21,7 @@ import org.apache.spark.SparkRuntimeException import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.Project import org.apache.spark.sql.execution.WholeStageCodegenExec -import org.apache.spark.sql.internal.SqlApiConf +import org.apache.spark.sql.internal.{SqlApiConf, SQLConf} import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.{ArrayType, BooleanType, IntegerType, StringType} @@ -101,10 +101,10 @@ class CollationSQLRegexpSuite test("RegExpReplace throws the right exception when replace fails on a particular row") { val tableName = "regexpReplaceException" withTable(tableName) { + sql(s"CREATE TABLE IF NOT EXISTS $tableName(s STRING)") + sql(s"INSERT INTO $tableName VALUES('first last')") Seq("NO_CODEGEN", "CODEGEN_ONLY").foreach { codegenMode => - withSQLConf("spark.sql.codegen.factoryMode" -> codegenMode) { - sql(s"CREATE TABLE IF NOT EXISTS $tableName(s STRING)") - sql(s"INSERT INTO $tableName VALUES('first last')") + withSQLConf(SQLConf.CODEGEN_FACTORY_MODE.key -> codegenMode) { val query = s"SELECT regexp_replace(s, '(?[a-zA-Z]+) (?[a-zA-Z]+)', " + s"'$$3 $$1') FROM $tableName" val df = sql(query) From 390af7af8456eee60ada4318a27e0078393e05ba Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Thu, 17 Oct 2024 14:50:32 -0700 Subject: [PATCH 12/20] catching nonfatal --- .../sql/catalyst/expressions/regexpExpressions.scala | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 46ee7e9d2f0fb..156a2aa4446da 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -22,6 +22,7 @@ import java.util.regex.{Matcher, MatchResult, Pattern, PatternSyntaxException} import scala.collection.mutable.ArrayBuffer import scala.jdk.CollectionConverters._ +import scala.util.control.NonFatal import org.apache.commons.text.StringEscapeUtils @@ -703,7 +704,7 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio try { m.appendReplacement(result, lastReplacement) } catch { - case e: Exception => + case NonFatal(e) => throw QueryExecutionErrors.invalidRegexpReplaceError(s.toString, p.toString, r.toString, i.asInstanceOf[Int], e) } @@ -757,8 +758,12 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio try { $matcher.appendReplacement($termResult, $termLastReplacement); } catch (Exception e) { - throw QueryExecutionErrors.invalidRegexpReplaceError($source, $regexp.toString(), - $rep.toString(), $pos, e); + if (scala.util.control.NonFatal.apply(e)) { + throw QueryExecutionErrors.invalidRegexpReplaceError($source, $regexp.toString(), + $rep.toString(), $pos, e); + } else { + throw e; + } } } $matcher.appendTail($termResult); From 9aec6d96eace0eea28dd27614460de984b178c40 Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Thu, 17 Oct 2024 14:54:11 -0700 Subject: [PATCH 13/20] minor improvement --- .../spark/sql/catalyst/expressions/regexpExpressions.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 156a2aa4446da..aaf86e8b2ca2c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -733,6 +733,7 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio val termLastReplacement = ctx.addMutableState("String", "lastReplacement") val termLastReplacementInUTF8 = ctx.addMutableState("UTF8String", "lastReplacementInUTF8") + val nonFatal = ctx.addMutableState("scala.util.control.NonFatal", "nonFatal") val setEvNotNull = if (nullable) { s"${ev.isNull} = false;" @@ -758,7 +759,7 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio try { $matcher.appendReplacement($termResult, $termLastReplacement); } catch (Exception e) { - if (scala.util.control.NonFatal.apply(e)) { + if ($nonFatal.apply(e)) { throw QueryExecutionErrors.invalidRegexpReplaceError($source, $regexp.toString(), $rep.toString(), $pos, e); } else { From 8c27240bc6b6e8645a630dab5adf42d3413917d4 Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Thu, 17 Oct 2024 23:03:34 -0700 Subject: [PATCH 14/20] Update regexpExpressions.scala --- .../spark/sql/catalyst/expressions/regexpExpressions.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index aaf86e8b2ca2c..156a2aa4446da 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -733,7 +733,6 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio val termLastReplacement = ctx.addMutableState("String", "lastReplacement") val termLastReplacementInUTF8 = ctx.addMutableState("UTF8String", "lastReplacementInUTF8") - val nonFatal = ctx.addMutableState("scala.util.control.NonFatal", "nonFatal") val setEvNotNull = if (nullable) { s"${ev.isNull} = false;" @@ -759,7 +758,7 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio try { $matcher.appendReplacement($termResult, $termLastReplacement); } catch (Exception e) { - if ($nonFatal.apply(e)) { + if (scala.util.control.NonFatal.apply(e)) { throw QueryExecutionErrors.invalidRegexpReplaceError($source, $regexp.toString(), $rep.toString(), $pos, e); } else { From e6e5a7960af49d2ec8b18528866d3f8b8fce320a Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Mon, 21 Oct 2024 13:33:19 -0700 Subject: [PATCH 15/20] minor fix --- .../spark/sql/CollationSQLRegexpSuite.scala | 34 +------------------ .../spark/sql/StringFunctionsSuite.scala | 32 ++++++++++++++++- 2 files changed, 32 insertions(+), 34 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala index 2ee7456deebb9..5027efab91840 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala @@ -17,11 +17,9 @@ package org.apache.spark.sql -import org.apache.spark.SparkRuntimeException import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.Project -import org.apache.spark.sql.execution.WholeStageCodegenExec -import org.apache.spark.sql.internal.{SqlApiConf, SQLConf} +import org.apache.spark.sql.internal.{SqlApiConf} import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.{ArrayType, BooleanType, IntegerType, StringType} @@ -98,36 +96,6 @@ class CollationSQLRegexpSuite } } - test("RegExpReplace throws the right exception when replace fails on a particular row") { - val tableName = "regexpReplaceException" - withTable(tableName) { - sql(s"CREATE TABLE IF NOT EXISTS $tableName(s STRING)") - sql(s"INSERT INTO $tableName VALUES('first last')") - Seq("NO_CODEGEN", "CODEGEN_ONLY").foreach { codegenMode => - withSQLConf(SQLConf.CODEGEN_FACTORY_MODE.key -> codegenMode) { - val query = s"SELECT regexp_replace(s, '(?[a-zA-Z]+) (?[a-zA-Z]+)', " + - s"'$$3 $$1') FROM $tableName" - val df = sql(query) - val plan = df.queryExecution.executedPlan - assert(plan.isInstanceOf[WholeStageCodegenExec] == (codegenMode == "CODEGEN_ONLY")) - val exception = intercept[SparkRuntimeException] { - df.collect() - } - checkError( - exception = exception, - condition = "INVALID_REGEXP_REPLACE", - parameters = Map( - "source" -> "first last", - "pattern" -> "(?[a-zA-Z]+) (?[a-zA-Z]+)", - "replacement" -> "$3 $1", - "position" -> "1") - ) - assert(exception.getCause.getMessage.contains("No group 3")) - } - } - } - } - test("Like simplification should work with collated strings (for default collation)") { val tableNameBinary = "T_BINARY" withTable(tableNameBinary) { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala index ec240d71b851f..ca47073f4ae4e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql import org.apache.spark.{SPARK_DOC_ROOT, SparkIllegalArgumentException, SparkRuntimeException} import org.apache.spark.sql.catalyst.expressions.Cast._ -import org.apache.spark.sql.execution.FormattedMode +import org.apache.spark.sql.execution.{FormattedMode, WholeStageCodegenExec} import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession @@ -1356,4 +1356,34 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession { } } } + + test("RegExpReplace throws the right exception when replace fails on a particular row") { + val tableName = "regexpReplaceException" + withTable(tableName) { + sql(s"CREATE TABLE IF NOT EXISTS $tableName(s STRING)") + sql(s"INSERT INTO $tableName VALUES('first last')") + Seq("NO_CODEGEN", "CODEGEN_ONLY").foreach { codegenMode => + withSQLConf(SQLConf.CODEGEN_FACTORY_MODE.key -> codegenMode) { + val query = s"SELECT regexp_replace(s, '(?[a-zA-Z]+) (?[a-zA-Z]+)', " + + s"'$$3 $$1') FROM $tableName" + val df = sql(query) + val plan = df.queryExecution.executedPlan + assert(plan.isInstanceOf[WholeStageCodegenExec] == (codegenMode == "CODEGEN_ONLY")) + val exception = intercept[SparkRuntimeException] { + df.collect() + } + checkError( + exception = exception, + condition = "INVALID_REGEXP_REPLACE", + parameters = Map( + "source" -> "first last", + "pattern" -> "(?[a-zA-Z]+) (?[a-zA-Z]+)", + "replacement" -> "$3 $1", + "position" -> "1") + ) + assert(exception.getCause.getMessage.contains("No group 3")) + } + } + } + } } From 59809fa0cbb03ca80367240216371de829e9f958 Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Mon, 21 Oct 2024 15:26:21 -0700 Subject: [PATCH 16/20] Empty Commit From d2f2c475265434ca2c9c4bc7e8b2cb22ab2fcbda Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Mon, 21 Oct 2024 15:31:35 -0700 Subject: [PATCH 17/20] Empty Commit From 7617e2bbc0559a6806ef83c1475d01dffef7844a Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Mon, 21 Oct 2024 16:26:22 -0700 Subject: [PATCH 18/20] empty commit From 83dab1831f7d9cefd9556983c8235d36da803d35 Mon Sep 17 00:00:00 2001 From: Harsh Motwani Date: Mon, 21 Oct 2024 21:37:47 -0700 Subject: [PATCH 19/20] Update regexpExpressions.scala --- .../spark/sql/catalyst/expressions/regexpExpressions.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 156a2aa4446da..52460533efbe1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -757,7 +757,7 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio while ($matcher.find()) { try { $matcher.appendReplacement($termResult, $termLastReplacement); - } catch (Exception e) { + } catch (Throwable e) { if (scala.util.control.NonFatal.apply(e)) { throw QueryExecutionErrors.invalidRegexpReplaceError($source, $regexp.toString(), $rep.toString(), $pos, e); From c488a055d3176b67cab0584ffa0a9032d18907fa Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Wed, 23 Oct 2024 11:55:29 +0200 Subject: [PATCH 20/20] Update sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala --- .../scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala index 5027efab91840..87dbbc65a3936 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.Project -import org.apache.spark.sql.internal.{SqlApiConf} +import org.apache.spark.sql.internal.SqlApiConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.{ArrayType, BooleanType, IntegerType, StringType}