-
Notifications
You must be signed in to change notification settings - Fork 28.9k
[SPARK-16213][SQL] Reduce runtime overhead of a program that creates an primitive array in DataFrame #13909
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-16213][SQL] Reduce runtime overhead of a program that creates an primitive array in DataFrame #13909
Changes from all commits
12b591e
909d210
d481cb0
66454f3
03e0cfa
5620733
2906c74
d5b3a8a
d29bb97
88daf42
da82efe
7914230
69e0eed
597dc72
a971336
be01d91
1c7f972
360d139
b66d0f6
08262b1
438944b
f418062
d24c7b1
c159f03
0af0828
f6e9a83
327c8ac
7a7e9c3
ee237b4
28df09f
293b344
69d5e33
2556ba5
34bff15
4a0409a
dcce4c5
2f67ac2
c986361
cfe2e3d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,11 +19,12 @@ package org.apache.spark.sql.catalyst.expressions | |
|
|
||
| import org.apache.spark.sql.catalyst.InternalRow | ||
| import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder | ||
| import org.apache.spark.sql.catalyst.analysis.Star | ||
| import org.apache.spark.sql.catalyst.analysis.TypeCheckResult | ||
| import org.apache.spark.sql.catalyst.expressions.codegen._ | ||
| import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData, TypeUtils} | ||
| import org.apache.spark.sql.types._ | ||
| import org.apache.spark.unsafe.Platform | ||
| import org.apache.spark.unsafe.array.ByteArrayMethods | ||
| import org.apache.spark.unsafe.types.UTF8String | ||
|
|
||
| /** | ||
|
|
@@ -43,7 +44,7 @@ case class CreateArray(children: Seq[Expression]) extends Expression { | |
| override def checkInputDataTypes(): TypeCheckResult = | ||
| TypeUtils.checkForSameTypeInputExpr(children.map(_.dataType), "function array") | ||
|
|
||
| override def dataType: DataType = { | ||
| override def dataType: ArrayType = { | ||
| ArrayType( | ||
| children.headOption.map(_.dataType).getOrElse(NullType), | ||
| containsNull = children.exists(_.nullable)) | ||
|
|
@@ -56,33 +57,99 @@ case class CreateArray(children: Seq[Expression]) extends Expression { | |
| } | ||
|
|
||
| override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { | ||
| val arrayClass = classOf[GenericArrayData].getName | ||
| val values = ctx.freshName("values") | ||
| ctx.addMutableState("Object[]", values, s"this.$values = null;") | ||
|
|
||
| ev.copy(code = s""" | ||
| this.$values = new Object[${children.size}];""" + | ||
| ctx.splitExpressions( | ||
| ctx.INPUT_ROW, | ||
| children.zipWithIndex.map { case (e, i) => | ||
| val eval = e.genCode(ctx) | ||
| eval.code + s""" | ||
| if (${eval.isNull}) { | ||
| $values[$i] = null; | ||
| } else { | ||
| $values[$i] = ${eval.value}; | ||
| } | ||
| """ | ||
| }) + | ||
| s""" | ||
| final ArrayData ${ev.value} = new $arrayClass($values); | ||
| this.$values = null; | ||
| """, isNull = "false") | ||
| val et = dataType.elementType | ||
| val evals = children.map(e => e.genCode(ctx)) | ||
| val (preprocess, assigns, postprocess, arrayData) = | ||
| GenArrayData.genCodeToCreateArrayData(ctx, et, evals, false) | ||
| ev.copy( | ||
| code = preprocess + ctx.splitExpressions(ctx.INPUT_ROW, assigns) + postprocess, | ||
| value = arrayData, | ||
| isNull = "false") | ||
| } | ||
|
|
||
| override def prettyName: String = "array" | ||
| } | ||
|
|
||
| private [sql] object GenArrayData { | ||
| /** | ||
| * Return Java code pieces based on DataType and isPrimitive to allocate ArrayData class | ||
| * | ||
| * @param ctx a [[CodegenContext]] | ||
| * @param elementType data type of underlying array elements | ||
| * @param elementsCode a set of [[ExprCode]] for each element of an underlying array | ||
| * @param isMapKey if true, throw an exception when the element is null | ||
| * @return (code pre-assignments, assignments to each array elements, code post-assignments, | ||
| * arrayData name) | ||
| */ | ||
| def genCodeToCreateArrayData( | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why we need 2 methods....
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i see. I will do it tonight |
||
| ctx: CodegenContext, | ||
| elementType: DataType, | ||
| elementsCode: Seq[ExprCode], | ||
| isMapKey: Boolean): (String, Seq[String], String, String) = { | ||
| val arrayName = ctx.freshName("array") | ||
| val arrayDataName = ctx.freshName("arrayData") | ||
| val numElements = elementsCode.length | ||
|
|
||
| if (!ctx.isPrimitiveType(elementType)) { | ||
| val genericArrayClass = classOf[GenericArrayData].getName | ||
| ctx.addMutableState("Object[]", arrayName, | ||
| s"this.$arrayName = new Object[${numElements}];") | ||
|
|
||
| val assignments = elementsCode.zipWithIndex.map { case (eval, i) => | ||
| val isNullAssignment = if (!isMapKey) { | ||
| s"$arrayName[$i] = null;" | ||
| } else { | ||
| "throw new RuntimeException(\"Cannot use null as map key!\");" | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This message seems strange. This is not only for map key. |
||
| } | ||
| eval.code + s""" | ||
| if (${eval.isNull}) { | ||
| $isNullAssignment | ||
| } else { | ||
| $arrayName[$i] = ${eval.value}; | ||
| } | ||
| """ | ||
| } | ||
|
|
||
| ("", | ||
| assignments, | ||
| s"final ArrayData $arrayDataName = new $genericArrayClass($arrayName);", | ||
| arrayDataName) | ||
| } else { | ||
| val unsafeArraySizeInBytes = | ||
| UnsafeArrayData.calculateHeaderPortionInBytes(numElements) + | ||
| ByteArrayMethods.roundNumberOfBytesToNearestWord(elementType.defaultSize * numElements) | ||
| val baseOffset = Platform.BYTE_ARRAY_OFFSET | ||
| ctx.addMutableState("UnsafeArrayData", arrayDataName, ""); | ||
|
|
||
| val primitiveValueTypeName = ctx.primitiveTypeName(elementType) | ||
| val assignments = elementsCode.zipWithIndex.map { case (eval, i) => | ||
| val isNullAssignment = if (!isMapKey) { | ||
| s"$arrayDataName.setNullAt($i);" | ||
| } else { | ||
| "throw new RuntimeException(\"Cannot use null as map key!\");" | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ditto. |
||
| } | ||
| eval.code + s""" | ||
| if (${eval.isNull}) { | ||
| $isNullAssignment | ||
| } else { | ||
| $arrayDataName.set$primitiveValueTypeName($i, ${eval.value}); | ||
| } | ||
| """ | ||
| } | ||
|
|
||
| (s""" | ||
| byte[] $arrayName = new byte[$unsafeArraySizeInBytes]; | ||
| $arrayDataName = new UnsafeArrayData(); | ||
| Platform.putLong($arrayName, $baseOffset, $numElements); | ||
| $arrayDataName.pointTo($arrayName, $baseOffset, $unsafeArraySizeInBytes); | ||
| """, | ||
| assignments, | ||
| "", | ||
| arrayDataName) | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Returns a catalyst Map containing the evaluation of all children expressions as keys and values. | ||
| * The children are a flatted sequence of kv pairs, e.g. (key1, value1, key2, value2, ...) | ||
|
|
@@ -133,49 +200,26 @@ case class CreateMap(children: Seq[Expression]) extends Expression { | |
| } | ||
|
|
||
| override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { | ||
| val arrayClass = classOf[GenericArrayData].getName | ||
| val mapClass = classOf[ArrayBasedMapData].getName | ||
| val keyArray = ctx.freshName("keyArray") | ||
| val valueArray = ctx.freshName("valueArray") | ||
| ctx.addMutableState("Object[]", keyArray, s"this.$keyArray = null;") | ||
| ctx.addMutableState("Object[]", valueArray, s"this.$valueArray = null;") | ||
|
|
||
| val keyData = s"new $arrayClass($keyArray)" | ||
| val valueData = s"new $arrayClass($valueArray)" | ||
| ev.copy(code = s""" | ||
| $keyArray = new Object[${keys.size}]; | ||
| $valueArray = new Object[${values.size}];""" + | ||
| ctx.splitExpressions( | ||
| ctx.INPUT_ROW, | ||
| keys.zipWithIndex.map { case (key, i) => | ||
| val eval = key.genCode(ctx) | ||
| s""" | ||
| ${eval.code} | ||
| if (${eval.isNull}) { | ||
| throw new RuntimeException("Cannot use null as map key!"); | ||
| } else { | ||
| $keyArray[$i] = ${eval.value}; | ||
| } | ||
| """ | ||
| }) + | ||
| ctx.splitExpressions( | ||
| ctx.INPUT_ROW, | ||
| values.zipWithIndex.map { case (value, i) => | ||
| val eval = value.genCode(ctx) | ||
| s""" | ||
| ${eval.code} | ||
| if (${eval.isNull}) { | ||
| $valueArray[$i] = null; | ||
| } else { | ||
| $valueArray[$i] = ${eval.value}; | ||
| } | ||
| """ | ||
| }) + | ||
| val MapType(keyDt, valueDt, _) = dataType | ||
| val evalKeys = keys.map(e => e.genCode(ctx)) | ||
| val evalValues = values.map(e => e.genCode(ctx)) | ||
| val (preprocessKeyData, assignKeys, postprocessKeyData, keyArrayData) = | ||
| GenArrayData.genCodeToCreateArrayData(ctx, keyDt, evalKeys, true) | ||
| val (preprocessValueData, assignValues, postprocessValueData, valueArrayData) = | ||
| GenArrayData.genCodeToCreateArrayData(ctx, valueDt, evalValues, false) | ||
| val code = | ||
| s""" | ||
| final MapData ${ev.value} = new $mapClass($keyData, $valueData); | ||
| this.$keyArray = null; | ||
| this.$valueArray = null; | ||
| """, isNull = "false") | ||
| final boolean ${ev.isNull} = false; | ||
| $preprocessKeyData | ||
| ${ctx.splitExpressions(ctx.INPUT_ROW, assignKeys)} | ||
| $postprocessKeyData | ||
| $preprocessValueData | ||
| ${ctx.splitExpressions(ctx.INPUT_ROW, assignValues)} | ||
| $postprocessValueData | ||
| final MapData ${ev.value} = new $mapClass($keyArrayData, $valueArrayData); | ||
| """ | ||
| ev.copy(code = code) | ||
| } | ||
|
|
||
| override def prettyName: String = "map" | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No param doc for
allowNull.