From 5b7fad93edef8f02bbe96824dd73036eaddc1ec5 Mon Sep 17 00:00:00 2001
From: Aman Omer
Date: Wed, 6 Nov 2019 18:39:46 +0900
Subject: [PATCH 1/3] [SPARK-29462] The data type of "array()" should be
 array<null>

### What changes were proposed in this pull request?
During creation of an array, if `CreateArray` does not get any children from which to set the array's data type, it creates an array of null type.

### Why are the changes needed?
When an empty array is created, it should be declared as `array<null>`.

### Does this PR introduce any user-facing change?
No

### How was this patch tested?
Tested manually

Closes #26324 from amanomer/29462.

Authored-by: Aman Omer
Signed-off-by: HyukjinKwon
---
 docs/sql-migration-guide.md                        |  2 ++
 .../catalyst/expressions/complexTypeCreator.scala  |  2 +-
 .../apache/spark/sql/DataFrameFunctionsSuite.scala | 12 ++++++++----
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md
index be0fe32ded99b..295c26b60f9bc 100644
--- a/docs/sql-migration-guide.md
+++ b/docs/sql-migration-guide.md
@@ -215,6 +215,8 @@ license: |
   For example `SELECT timestamp 'tomorrow';`.
 
   - Since Spark 3.0, the `size` function returns `NULL` for the `NULL` input. In Spark version 2.4 and earlier, this function gives `-1` for the same input. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.sizeOfNull` to `true`.
+
+  - Since Spark 3.0, when `array` function is called without parameters, it returns an empty array with `NullType` data type. In Spark version 2.4 and earlier, the data type of the result is `StringType`.
 
   - Since Spark 3.0, the interval literal syntax does not allow multiple from-to units anymore. For example, `SELECT INTERVAL '1-1' YEAR TO MONTH '2-2' YEAR TO MONTH'` throws parser exception.
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
index 9ce87a4922c01..f7de6787993b7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
@@ -47,7 +47,7 @@ case class CreateArray(children: Seq[Expression]) extends Expression {
   override def dataType: ArrayType = {
     ArrayType(
       TypeCoercion.findCommonTypeDifferentOnlyInNullFlags(children.map(_.dataType))
-        .getOrElse(StringType),
+        .getOrElse(NullType),
       containsNull = children.exists(_.nullable))
   }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index 7fce03658fc16..30e84e2c5c84d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -3499,12 +3499,9 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession {
     ).foreach(assertValuesDoNotChangeAfterCoalesceOrUnion(_))
   }
 
-  test("SPARK-21281 use string types by default if array and map have no argument") {
+  test("SPARK-21281 use string types by default if map have no argument") {
     val ds = spark.range(1)
     var expectedSchema = new StructType()
-      .add("x", ArrayType(StringType, containsNull = false), nullable = false)
-    assert(ds.select(array().as("x")).schema == expectedSchema)
-    expectedSchema = new StructType()
       .add("x", MapType(StringType, StringType, valueContainsNull = false), nullable = false)
     assert(ds.select(map().as("x")).schema == expectedSchema)
   }
@@ -3577,6 +3574,13 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession {
}.getMessage assert(nonFoldableError.contains("The 'escape' parameter must be a string literal")) } + + test("SPARK-29462: Use null type by default if array have no argument") { + val ds = spark.range(1) + var expectedSchema = new StructType() + .add("x", ArrayType(NullType, containsNull = false), nullable = false) + assert(ds.select(array().as("x")).schema == expectedSchema) + } } object DataFrameFunctionsSuite { From 130f808c43b392db3ee045e8a5cbc836ab5b48dc Mon Sep 17 00:00:00 2001 From: HyukjinKwon Date: Mon, 10 Feb 2020 19:49:28 +0900 Subject: [PATCH 2/3] Address comments --- docs/sql-migration-guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index 295c26b60f9bc..cd804c2a40e66 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -216,7 +216,7 @@ license: | - Since Spark 3.0, the `size` function returns `NULL` for the `NULL` input. In Spark version 2.4 and earlier, this function gives `-1` for the same input. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.sizeOfNull` to `true`. - - Since Spark 3.0, when `array` function is called without parameters, it returns an empty array with `NullType` data type. In Spark version 2.4 and earlier, the data type of the result is `StringType`. + - Since Spark 3.0, when `array` function is called without parameters, it returns an empty array with `NullType` data type. In Spark version 2.4 and earlier, it returns an empty array with string type. - Since Spark 3.0, the interval literal syntax does not allow multiple from-to units anymore. For example, `SELECT INTERVAL '1-1' YEAR TO MONTH '2-2' YEAR TO MONTH'` throws parser exception. From 90b46609be928eef436d9ac09a7e7c2bafd954c5 Mon Sep 17 00:00:00 2001 From: HyukjinKwon Date: Tue, 11 Feb 2020 10:59:09 +0900 Subject: [PATCH 3/3] Address comments and add a legacy configuration --- docs/sql-migration-guide.md | 2 +- .../catalyst/expressions/complexTypeCreator.scala | 11 ++++++++++- .../org/apache/spark/sql/internal/SQLConf.scala | 9 +++++++++ .../spark/sql/DataFrameFunctionsSuite.scala | 15 ++++++++++----- 4 files changed, 30 insertions(+), 7 deletions(-) diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index cd804c2a40e66..6a6824b60ec8b 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -216,7 +216,7 @@ license: | - Since Spark 3.0, the `size` function returns `NULL` for the `NULL` input. In Spark version 2.4 and earlier, this function gives `-1` for the same input. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.sizeOfNull` to `true`. - - Since Spark 3.0, when `array` function is called without parameters, it returns an empty array with `NullType` data type. In Spark version 2.4 and earlier, it returns an empty array with string type. + - Since Spark 3.0, when the `array` function is called without any parameters, it returns an empty array of `NullType`. In Spark version 2.4 and earlier, it returns an empty array of string type. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.arrayDefaultToStringType.enabled` to `true`. - Since Spark 3.0, the interval literal syntax does not allow multiple from-to units anymore. For example, `SELECT INTERVAL '1-1' YEAR TO MONTH '2-2' YEAR TO MONTH'` throws parser exception. 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala index f7de6787993b7..7335e305bfe55 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala @@ -23,6 +23,7 @@ import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.expressions.codegen.Block._ import org.apache.spark.sql.catalyst.util._ +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -44,10 +45,18 @@ case class CreateArray(children: Seq[Expression]) extends Expression { TypeUtils.checkForSameTypeInputExpr(children.map(_.dataType), s"function $prettyName") } + private val defaultElementType: DataType = { + if (SQLConf.get.getConf(SQLConf.LEGACY_ARRAY_DEFAULT_TO_STRING)) { + StringType + } else { + NullType + } + } + override def dataType: ArrayType = { ArrayType( TypeCoercion.findCommonTypeDifferentOnlyInNullFlags(children.map(_.dataType)) - .getOrElse(NullType), + .getOrElse(defaultElementType), containsNull = children.exists(_.nullable)) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 64c613611c861..d86f8693e0655 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -2007,6 +2007,15 @@ object SQLConf { .booleanConf .createWithDefault(false) + val LEGACY_ARRAY_DEFAULT_TO_STRING = + buildConf("spark.sql.legacy.arrayDefaultToStringType.enabled") + .internal() + .doc("When set to true, it returns an empty array of string type when the `array` " + + "function is called without any parameters. 
Otherwise, it returns an empty " + + "array of `NullType`") + .booleanConf + .createWithDefault(false) + val TRUNCATE_TABLE_IGNORE_PERMISSION_ACL = buildConf("spark.sql.truncateTable.ignorePermissionAcl.enabled") .internal() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala index 30e84e2c5c84d..9e9d8c3e9a7c5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala @@ -3575,11 +3575,16 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { assert(nonFoldableError.contains("The 'escape' parameter must be a string literal")) } - test("SPARK-29462: Use null type by default if array have no argument") { - val ds = spark.range(1) - var expectedSchema = new StructType() - .add("x", ArrayType(NullType, containsNull = false), nullable = false) - assert(ds.select(array().as("x")).schema == expectedSchema) + test("SPARK-29462: Empty array of NullType for array function with no arguments") { + Seq((true, StringType), (false, NullType)).foreach { + case (arrayDefaultToString, expectedType) => + withSQLConf(SQLConf.LEGACY_ARRAY_DEFAULT_TO_STRING.key -> arrayDefaultToString.toString) { + val schema = spark.range(1).select(array()).schema + assert(schema.nonEmpty && schema.head.dataType.isInstanceOf[ArrayType]) + val actualType = schema.head.dataType.asInstanceOf[ArrayType].elementType + assert(actualType === expectedType) + } + } } }
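
For reference, below is a minimal standalone sketch (not part of the patch series) of the user-visible behavior the three patches above describe. It assumes a Spark build that includes all three patches; the object name, master, and app name are illustrative only.

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.array
import org.apache.spark.sql.types.{ArrayType, NullType, StringType}

object EmptyArrayTypeDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[1]")
      .appName("SPARK-29462 demo")
      .getOrCreate()

    // New default (PATCH 1/3): array() with no arguments is typed array<null>,
    // and containsNull is false because there are no nullable children.
    val newType = spark.range(1).select(array()).schema.head.dataType
    assert(newType == ArrayType(NullType, containsNull = false))

    // Legacy flag (PATCH 3/3): restores the Spark 2.4 string-typed result.
    // CreateArray reads SQLConf.get when the expression is constructed, so the
    // setting can be toggled on a live session and affects subsequent queries.
    spark.conf.set("spark.sql.legacy.arrayDefaultToStringType.enabled", "true")
    val legacyType = spark.range(1).select(array()).schema.head.dataType
    assert(legacyType == ArrayType(StringType, containsNull = false))

    spark.stop()
  }
}
```

The `NullType` default matters because a null-typed element can be coerced to other element types during analysis, whereas the old `StringType` default pinned an otherwise untyped empty array to `array<string>`.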