From 395eee801dab323b6f32b25b43a4e92d99027551 Mon Sep 17 00:00:00 2001 From: iRakson Date: Tue, 11 Feb 2020 21:14:12 +0530 Subject: [PATCH 1/4] [SPARK-30790]map() should return map [SPARK-30790]map() should return map --- docs/sql-migration-guide.md | 2 ++ .../expressions/complexTypeCreator.scala | 12 ++++++++++-- .../org/apache/spark/sql/internal/SQLConf.scala | 9 +++++++++ .../spark/sql/DataFrameFunctionsSuite.scala | 16 +++++++++++----- 4 files changed, 32 insertions(+), 7 deletions(-) diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index f98fab5b4c56b..e631d1c046aab 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -218,6 +218,8 @@ license: | - Since Spark 3.0, when the `array` function is called without any parameters, it returns an empty array of `NullType`. In Spark version 2.4 and earlier, it returns an empty array of string type. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.arrayDefaultToStringType.enabled` to `true`. + - Since Spark 3.0, when the `map` function is called without any parameters, it returns an empty map of `NullType`. In Spark version 2.4 and earlier, it returns an empty map of string type. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.mapDefaultToStringType.enabled` to `true`. + - Since Spark 3.0, the interval literal syntax does not allow multiple from-to units anymore. For example, `SELECT INTERVAL '1-1' YEAR TO MONTH '2-2' YEAR TO MONTH'` throws parser exception. - Since Spark 3.0, when casting interval values to string type, there is no "interval" prefix, e.g. `1 days 2 hours`. In Spark version 2.4 and earlier, the string contains the "interval" prefix like `interval 1 days 2 hours`. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala index 7335e305bfe55..cac9df6513d39 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala @@ -145,6 +145,14 @@ case class CreateMap(children: Seq[Expression]) extends Expression { lazy val keys = children.indices.filter(_ % 2 == 0).map(children) lazy val values = children.indices.filter(_ % 2 != 0).map(children) + private val defaultElementType: DataType = { + if (SQLConf.get.getConf(SQLConf.LEGACY_MAP_DEFAULT_TO_STRING)) { + StringType + } else { + NullType + } + } + override def foldable: Boolean = children.forall(_.foldable) override def checkInputDataTypes(): TypeCheckResult = { @@ -167,9 +175,9 @@ case class CreateMap(children: Seq[Expression]) extends Expression { override lazy val dataType: MapType = { MapType( keyType = TypeCoercion.findCommonTypeDifferentOnlyInNullFlags(keys.map(_.dataType)) - .getOrElse(StringType), + .getOrElse(defaultElementType), valueType = TypeCoercion.findCommonTypeDifferentOnlyInNullFlags(values.map(_.dataType)) - .getOrElse(StringType), + .getOrElse(defaultElementType), valueContainsNull = values.exists(_.nullable)) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index d86f8693e0655..997f531c160bf 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -2016,6 +2016,15 @@ 
object SQLConf { .booleanConf .createWithDefault(false) + val LEGACY_MAP_DEFAULT_TO_STRING = + buildConf("spark.sql.legacy.mapDefaultToStringType.enabled") + .internal() + .doc("When set to true, it returns an empty map of string type when the `map` " + + "function is called without any parameters. Otherwise, it returns an empty " + + "map of `NullType`") + .booleanConf + .createWithDefault(false) + val TRUNCATE_TABLE_IGNORE_PERMISSION_ACL = buildConf("spark.sql.truncateTable.ignorePermissionAcl.enabled") .internal() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala index 9e9d8c3e9a7c5..6a12f76d4747d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala @@ -3499,11 +3499,17 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { ).foreach(assertValuesDoNotChangeAfterCoalesceOrUnion(_)) } - test("SPARK-21281 use string types by default if map have no argument") { - val ds = spark.range(1) - var expectedSchema = new StructType() - .add("x", MapType(StringType, StringType, valueContainsNull = false), nullable = false) - assert(ds.select(map().as("x")).schema == expectedSchema) + test("SPARK-30790: Empty map of for map function with no arguments") { + Seq((true, StringType), (false, NullType)).foreach { + case (mapDefaultToString, expectedType) => + withSQLConf(SQLConf.LEGACY_MAP_DEFAULT_TO_STRING.key -> mapDefaultToString.toString) { + val schema = spark.range(1).select(map()).schema + assert(schema.nonEmpty && schema.head.dataType.isInstanceOf[MapType]) + val actualKeyType = schema.head.dataType.asInstanceOf[MapType].keyType + val actualValueType = schema.head.dataType.asInstanceOf[MapType].valueType + assert(actualKeyType === expectedType && actualValueType === expectedType) + } + } } test("SPARK-21281 fails if functions have no argument") { From 3319d3ab13262e90764c06006b9528844592890b Mon Sep 17 00:00:00 2001 From: iRakson Date: Wed, 12 Feb 2020 15:16:53 +0530 Subject: [PATCH 2/4] Fix --- docs/sql-migration-guide.md | 4 +-- .../expressions/complexTypeCreator.scala | 4 +-- .../catalyst/util/ArrayBasedMapBuilder.scala | 5 ++- .../apache/spark/sql/internal/SQLConf.scala | 20 ++++-------- .../spark/sql/DataFrameFunctionsSuite.scala | 31 ++++++++++--------- 5 files changed, 29 insertions(+), 35 deletions(-) diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index e631d1c046aab..f38e0c5673581 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -216,9 +216,9 @@ license: | - Since Spark 3.0, the `size` function returns `NULL` for the `NULL` input. In Spark version 2.4 and earlier, this function gives `-1` for the same input. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.sizeOfNull` to `true`. - - Since Spark 3.0, when the `array` function is called without any parameters, it returns an empty array of `NullType`. In Spark version 2.4 and earlier, it returns an empty array of string type. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.arrayDefaultToStringType.enabled` to `true`. + - Since Spark 3.0, when the `array` function is called without any parameters, it returns an empty array of `NullType`. In Spark version 2.4 and earlier, it returns an empty array of string type. 
To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.createEmptyCollectionUsingStringType.enabled` to `true`. - - Since Spark 3.0, when the `map` function is called without any parameters, it returns an empty map of `NullType`. In Spark version 2.4 and earlier, it returns an empty map of string type. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.mapDefaultToStringType.enabled` to `true`. + - Since Spark 3.0, when the `map` function is called without any parameters, it returns an empty map with `NullType` as key/value type. In Spark version 2.4 and earlier, it returns an empty map of string type. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.createEmptyCollectionUsingStringType.enabled` to `true`. - Since Spark 3.0, the interval literal syntax does not allow multiple from-to units anymore. For example, `SELECT INTERVAL '1-1' YEAR TO MONTH '2-2' YEAR TO MONTH'` throws parser exception. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala index cac9df6513d39..4bd85d304ded2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala @@ -46,7 +46,7 @@ case class CreateArray(children: Seq[Expression]) extends Expression { } private val defaultElementType: DataType = { - if (SQLConf.get.getConf(SQLConf.LEGACY_ARRAY_DEFAULT_TO_STRING)) { + if (SQLConf.get.getConf(SQLConf.LEGACY_CREATE_EMPTY_COLLECTION_USING_STRING_TYPE)) { StringType } else { NullType @@ -146,7 +146,7 @@ case class CreateMap(children: Seq[Expression]) extends Expression { lazy val values = children.indices.filter(_ % 2 != 0).map(children) private val defaultElementType: DataType = { - if (SQLConf.get.getConf(SQLConf.LEGACY_MAP_DEFAULT_TO_STRING)) { + if (SQLConf.get.getConf(SQLConf.LEGACY_CREATE_EMPTY_COLLECTION_USING_STRING_TYPE)) { StringType } else { NullType diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilder.scala index 98934368205ec..37d65309e2b89 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilder.scala @@ -29,12 +29,11 @@ import org.apache.spark.unsafe.array.ByteArrayMethods */ class ArrayBasedMapBuilder(keyType: DataType, valueType: DataType) extends Serializable { assert(!keyType.existsRecursively(_.isInstanceOf[MapType]), "key of map cannot be/contain map") - assert(keyType != NullType, "map key cannot be null type.") private lazy val keyToIndex = keyType match { // Binary type data is `byte[]`, which can't use `==` to check equality. - case _: AtomicType | _: CalendarIntervalType if !keyType.isInstanceOf[BinaryType] => - new java.util.HashMap[Any, Int]() + case _: AtomicType | _: CalendarIntervalType | _: NullType + if !keyType.isInstanceOf[BinaryType] => new java.util.HashMap[Any, Int]() case _ => // for complex types, use interpreted ordering to be able to compare unsafe data with safe // data, e.g. UnsafeRow vs GenericInternalRow. 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 997f531c160bf..e14f7a4451988 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -2007,21 +2007,13 @@ object SQLConf { .booleanConf .createWithDefault(false) - val LEGACY_ARRAY_DEFAULT_TO_STRING = - buildConf("spark.sql.legacy.arrayDefaultToStringType.enabled") + val LEGACY_CREATE_EMPTY_COLLECTION_USING_STRING_TYPE = + buildConf("spark.sql.legacy.createEmptyCollectionUsingStringType.enabled") .internal() - .doc("When set to true, it returns an empty array of string type when the `array` " + - "function is called without any parameters. Otherwise, it returns an empty " + - "array of `NullType`") - .booleanConf - .createWithDefault(false) - - val LEGACY_MAP_DEFAULT_TO_STRING = - buildConf("spark.sql.legacy.mapDefaultToStringType.enabled") - .internal() - .doc("When set to true, it returns an empty map of string type when the `map` " + - "function is called without any parameters. Otherwise, it returns an empty " + - "map of `NullType`") + .doc("When set to true, it returns an empty array of string type and an empty map with " + + "string type as key/value type when `array` and `map` functions are called without any " + + "parameters, respectively. Otherwise, it returns an empty array of `NullType` and " + + "empty map with `NullType` as key/value type, respectively. ") .booleanConf .createWithDefault(false) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala index 6a12f76d4747d..5f6c1c00b6700 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala @@ -3499,19 +3499,6 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { ).foreach(assertValuesDoNotChangeAfterCoalesceOrUnion(_)) } - test("SPARK-30790: Empty map of for map function with no arguments") { - Seq((true, StringType), (false, NullType)).foreach { - case (mapDefaultToString, expectedType) => - withSQLConf(SQLConf.LEGACY_MAP_DEFAULT_TO_STRING.key -> mapDefaultToString.toString) { - val schema = spark.range(1).select(map()).schema - assert(schema.nonEmpty && schema.head.dataType.isInstanceOf[MapType]) - val actualKeyType = schema.head.dataType.asInstanceOf[MapType].keyType - val actualValueType = schema.head.dataType.asInstanceOf[MapType].valueType - assert(actualKeyType === expectedType && actualValueType === expectedType) - } - } - } - test("SPARK-21281 fails if functions have no argument") { val df = Seq(1).toDF("a") @@ -3584,7 +3571,8 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { test("SPARK-29462: Empty array of NullType for array function with no arguments") { Seq((true, StringType), (false, NullType)).foreach { case (arrayDefaultToString, expectedType) => - withSQLConf(SQLConf.LEGACY_ARRAY_DEFAULT_TO_STRING.key -> arrayDefaultToString.toString) { + withSQLConf(SQLConf.LEGACY_CREATE_EMPTY_COLLECTION_USING_STRING_TYPE.key -> + arrayDefaultToString.toString) { val schema = spark.range(1).select(array()).schema assert(schema.nonEmpty && schema.head.dataType.isInstanceOf[ArrayType]) val actualType = schema.head.dataType.asInstanceOf[ArrayType].elementType @@ -3592,6 +3580,21 @@ class 
DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { } } } + + test("SPARK-30790: Empty map with NullType as key/value type for map function with no argument") { + Seq((true, StringType), (false, NullType)).foreach { + case (mapDefaultToString, expectedType) => + withSQLConf(SQLConf.LEGACY_CREATE_EMPTY_COLLECTION_USING_STRING_TYPE.key -> + mapDefaultToString.toString) { + val schema = spark.range(1).select(map()).schema + assert(schema.nonEmpty && schema.head.dataType.isInstanceOf[MapType]) + val actualKeyType = schema.head.dataType.asInstanceOf[MapType].keyType + val actualValueType = schema.head.dataType.asInstanceOf[MapType].valueType + assert(actualKeyType === expectedType) + assert(actualValueType === expectedType) + } + } + } } object DataFrameFunctionsSuite { From 1a066e1ec71fe7eca329781c76ab75c18de2a0cc Mon Sep 17 00:00:00 2001 From: iRakson Date: Wed, 12 Feb 2020 16:34:39 +0530 Subject: [PATCH 3/4] Fix migration guide --- docs/sql-migration-guide.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index f38e0c5673581..9bfa792f36079 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -216,9 +216,7 @@ license: | - Since Spark 3.0, the `size` function returns `NULL` for the `NULL` input. In Spark version 2.4 and earlier, this function gives `-1` for the same input. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.sizeOfNull` to `true`. - - Since Spark 3.0, when the `array` function is called without any parameters, it returns an empty array of `NullType`. In Spark version 2.4 and earlier, it returns an empty array of string type. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.createEmptyCollectionUsingStringType.enabled` to `true`. - - - Since Spark 3.0, when the `map` function is called without any parameters, it returns an empty map with `NullType` as key/value type. In Spark version 2.4 and earlier, it returns an empty map of string type. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.createEmptyCollectionUsingStringType.enabled` to `true`. + - Since Spark 3.0, when the `array`/`map` function is called without any parameters, it returns an empty collection with `NullType` as element type. In Spark version 2.4 and earlier, it returns an empty collection with `StringType` as element type. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.createEmptyCollectionUsingStringType.enabled` to `true`. - Since Spark 3.0, the interval literal syntax does not allow multiple from-to units anymore. For example, `SELECT INTERVAL '1-1' YEAR TO MONTH '2-2' YEAR TO MONTH'` throws parser exception. From e873527032ffaed583b211d81f2c9aafebe9fe07 Mon Sep 17 00:00:00 2001 From: iRakson Date: Wed, 12 Feb 2020 23:32:24 +0530 Subject: [PATCH 4/4] Review comment fix --- docs/sql-migration-guide.md | 2 +- .../scala/org/apache/spark/sql/internal/SQLConf.scala | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index 9bfa792f36079..46b741687363f 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -216,7 +216,7 @@ license: | - Since Spark 3.0, the `size` function returns `NULL` for the `NULL` input. In Spark version 2.4 and earlier, this function gives `-1` for the same input. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.sizeOfNull` to `true`. 
- - Since Spark 3.0, when the `array`/`map` function is called without any parameters, it returns an empty collection with `NullType` as element type. In Spark version 2.4 and earlier, it returns an empty collection with `StringType` as element type. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.createEmptyCollectionUsingStringType.enabled` to `true`. + - Since Spark 3.0, when the `array`/`map` function is called without any parameters, it returns an empty collection with `NullType` as element type. In Spark version 2.4 and earlier, it returns an empty collection with `StringType` as element type. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.createEmptyCollectionUsingStringType` to `true`. - Since Spark 3.0, the interval literal syntax does not allow multiple from-to units anymore. For example, `SELECT INTERVAL '1-1' YEAR TO MONTH '2-2' YEAR TO MONTH'` throws parser exception. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index e14f7a4451988..95b5b3afc3933 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -2008,12 +2008,11 @@ object SQLConf { .createWithDefault(false) val LEGACY_CREATE_EMPTY_COLLECTION_USING_STRING_TYPE = - buildConf("spark.sql.legacy.createEmptyCollectionUsingStringType.enabled") + buildConf("spark.sql.legacy.createEmptyCollectionUsingStringType") .internal() - .doc("When set to true, it returns an empty array of string type and an empty map with " + - "string type as key/value type when `array` and `map` functions are called without any " + - "parameters, respectively. Otherwise, it returns an empty array of `NullType` and " + - "empty map with `NullType` as key/value type, respectively. ") + .doc("When set to true, Spark returns an empty collection with `StringType` as element " + + "type if the `array`/`map` function is called without any parameters. Otherwise, Spark " + + "returns an empty collection with `NullType` as element type.") .booleanConf .createWithDefault(false)
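
A minimal spark-shell sketch of the behavior this patch series documents, using the final config name from patch 4/4. It is illustrative only: the exact types printed are the expected outcome under these patches, not output captured from a build.

    import org.apache.spark.sql.functions.map
    import org.apache.spark.sql.types.MapType

    // New Spark 3.0 default: map() with no arguments yields MapType(NullType, NullType).
    val newDefault = spark.range(1).select(map()).schema.head.dataType.asInstanceOf[MapType]
    println(newDefault.keyType)    // expected: NullType

    // Legacy Spark 2.4 behavior, restored via the flag introduced here (internal config).
    spark.conf.set("spark.sql.legacy.createEmptyCollectionUsingStringType", "true")
    val legacy = spark.range(1).select(map()).schema.head.dataType.asInstanceOf[MapType]
    println(legacy.keyType)        // expected: StringType

    // Clean up so later queries use the new default again.
    spark.conf.unset("spark.sql.legacy.createEmptyCollectionUsingStringType")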