[SPARK-24605][SQL] size(null) returns null instead of -1 #21598
Changes from all commits
@@ -67,37 +67,61 @@ trait BinaryArrayExpressionWithImplicitCast extends BinaryExpression

 /**
- * Given an array or map, returns its size. Returns -1 if null.
+ * Given an array or map, returns total number of elements in it.
  */
 @ExpressionDescription(
-  usage = "_FUNC_(expr) - Returns the size of an array or a map. Returns -1 if null.",
+  usage = """
+    _FUNC_(expr) - Returns the size of an array or a map.
+    The function returns -1 if its input is null and spark.sql.legacy.sizeOfNull is set to true.
+    If spark.sql.legacy.sizeOfNull is set to false, the function returns null for null input.
+    By default, the spark.sql.legacy.sizeOfNull parameter is set to true.
+  """,
   examples = """
     Examples:
      > SELECT _FUNC_(array('b', 'd', 'c', 'a'));
       4
      > SELECT _FUNC_(map('a', 1, 'b', 2));
       2
      > SELECT _FUNC_(NULL);
       -1
  """)
-case class Size(child: Expression) extends UnaryExpression with ExpectsInputTypes {
+case class Size(
+    child: Expression,
+    legacySizeOfNull: Boolean)
+  extends UnaryExpression with ExpectsInputTypes {
+
+  def this(child: Expression) =
+    this(
+      child,
+      legacySizeOfNull = SQLConf.get.getConf(SQLConf.LEGACY_SIZE_OF_NULL))
Contributor: Since we can now access the conf on the executor side as well, do we need these changes? Can't we just get this value as a …

Member (author): If it works now, I will try to read the config on the executor side. I was struggling with an issue in tests for another PR where SQL configs were not propagated to executors (for example, lines 52 to 54 in a40ffc6). I also see other places where configs are read by passing them to constructors: spark/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala, line 534 in b8f27ae.

Contributor: It was made possible in #21376.
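For reference, a minimal sketch of the alternative raised in this thread: reading the flag through SQLConf.get at use time instead of capturing it in the constructor. It assumes the conf propagation added in #21376 and is not the code this PR commits; `ConfReadSketch` is only a placeholder container for illustration:

    import org.apache.spark.sql.internal.SQLConf

    object ConfReadSketch {
      // Inside the Size expression this would be a lazy val member rather than a
      // constructor parameter; it is evaluated on first use, driver- or executor-side.
      lazy val legacySizeOfNull: Boolean =
        SQLConf.get.getConf(SQLConf.LEGACY_SIZE_OF_NULL)
    }

As the thread notes, the diff above keeps the constructor-captured flag because of the propagation issues the author ran into in tests.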
 
   override def dataType: DataType = IntegerType
   override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(ArrayType, MapType))
-  override def nullable: Boolean = false
+  override def nullable: Boolean = if (legacySizeOfNull) false else super.nullable
 
   override def eval(input: InternalRow): Any = {
     val value = child.eval(input)
     if (value == null) {
-      -1
+      if (legacySizeOfNull) -1 else null
     } else child.dataType match {
       case _: ArrayType => value.asInstanceOf[ArrayData].numElements()
       case _: MapType => value.asInstanceOf[MapData].numElements()
       case other => throw new UnsupportedOperationException(
         s"The size function doesn't support the operand type ${other.getClass.getCanonicalName}")
     }
   }
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
-    val childGen = child.genCode(ctx)
-    ev.copy(code = code"""
+    if (legacySizeOfNull) {
+      val childGen = child.genCode(ctx)
+      ev.copy(code = code"""
       boolean ${ev.isNull} = false;
       ${childGen.code}
       ${CodeGenerator.javaType(dataType)} ${ev.value} = ${childGen.isNull} ? -1 :
         (${childGen.value}).numElements();""", isNull = FalseLiteral)
+    } else {
+      defineCodeGen(ctx, ev, c => s"($c).numElements()")
+    }
   }
 }
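To make the new semantics concrete, here is a small interpreter-level check of the behavior added above. It is a sketch against the code in this diff (catalyst internals on the classpath, e.g. a spark-shell or a catalyst test built with this PR), not part of the change itself:

    import org.apache.spark.sql.catalyst.expressions.{Literal, Size}
    import org.apache.spark.sql.types.{ArrayType, IntegerType}

    val nullArray = Literal.create(null, ArrayType(IntegerType))

    // Legacy (Hive-compatible) behavior: size of a null collection is -1.
    Size(nullArray, legacySizeOfNull = true).eval()   // -1

    // New behavior with spark.sql.legacy.sizeOfNull=false: null in, null out.
    Size(nullArray, legacySizeOfNull = false).eval()  // null

    // Non-null input is unaffected by the flag.
    Size(Literal.create(Seq(1, 2, 3), ArrayType(IntegerType)), legacySizeOfNull = false).eval()  // 3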
@@ -1324,6 +1324,12 @@ object SQLConf {
       "Other column values can be ignored during parsing even if they are malformed.")
     .booleanConf
     .createWithDefault(true)
 
+  val LEGACY_SIZE_OF_NULL = buildConf("spark.sql.legacy.sizeOfNull")
Member: Sounds more consistent to have …

Member: Do we plan to fix other things accordingly too?

Member (author): Yes, we would like to change/improve external behavior under flags in the …

Member: Removing is fine. I am good to have such a prefix, but I wonder what changed after #21427 (comment). It sounds basically similar to what I suggested. Where did that discussion happen?

Contributor: I've created https://issues.apache.org/jira/browse/SPARK-24625 to track it. It's similar to #21427 (comment), but as I replied in that PR, having a version-specific config is overkill, while …

Member: That's basically the same, except that the postfix includes a specific version, which was just a rough idea.
+    .doc("If it is set to true, size of null returns -1. This behavior was inherited from Hive. " +
+      "The size function returns null for null input if the flag is disabled.")
|
Contributor: Perhaps you should say this will be updated to false in Spark 3.0?
+    .booleanConf
+    .createWithDefault(true)
 }
 
 /**

@@ -1686,6 +1692,8 @@ class SQLConf extends Serializable with Logging {
 
   def csvColumnPruning: Boolean = getConf(SQLConf.CSV_PARSER_COLUMN_PRUNING)
 
+  def legacySizeOfNull: Boolean = getConf(SQLConf.LEGACY_SIZE_OF_NULL)
 
   /** ********************** SQLConf functionality methods ************ */
 
   /** Set Spark SQL configuration properties. */
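Taken together with the expression change above, this conf is what users toggle. A hedged, spark-shell style illustration of the resulting behavior (assumes a build containing this PR and the shell's `spark` session; not part of the diff):

    // Default after this PR: spark.sql.legacy.sizeOfNull=true, i.e. Hive-compatible.
    spark.sql("SELECT size(NULL)").head.getInt(0)    // -1

    // Turn the legacy flag off to get null for null input.
    spark.conf.set("spark.sql.legacy.sizeOfNull", "false")
    spark.sql("SELECT size(NULL)").head.isNullAt(0)  // true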
Review comment: Update the description of this function?