[SPARK-23736][SQL] Extending the concat function to support array columns #20858
Conversation
…tenating multiple array columns into one.
| expression[MapValues]("map_values"), | ||
| expression[Size]("size"), | ||
| expression[SortArray]("sort_array"), | ||
| expression[ConcatArrays]("concat_arrays"), |
Why not reuse concat?
concat(array1, array2, ..., arrayN) -> array ?
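The proposed semantics can be sketched in plain Python (illustrative only, not Spark code; the function name mirrors the SQL signature above): flatten the inputs left to right, and return NULL if any input array is NULL.

```python
def concat_arrays(*arrays):
    """Sketch of concat(array1, ..., arrayN) on arrays: NULL (None) in
    -> NULL out, otherwise left-to-right flattening."""
    if any(a is None for a in arrays):
        return None  # null-intolerant at the array level
    result = []
    for a in arrays:
        result.extend(a)
    return result

print(concat_arrays([1, 2, 3], [4, 5], [6]))  # mirrors the SQL example in this PR
```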
I've already played with this option in my mind, but I'm not sure how concat would be categorized.
Currently, concat is defined as a pure string operation:
/**
Whereas the functionality in this PR belongs rather to the collection_funcs group.
Having just one function for both expressions would be elegant, but can you advise what group should be assigned to concat?
How about moving it to collection functions?
Ok, will merge the functions into one. Do you find it ok to have one expression class per concatenation type?
I'm afraid that if I incorporate all the logic into one expression class, the code will become messy, since each codegen and evaluation has a different nature.
|
Thanks for this work! One question: why do you think we need to support this API natively in Spark? Do other libraries support this as first-class? |
|
@maropu What other libraries do you mean? I'm not aware of any library providing this functionality on top of Spark SQL. When using Spark SQL as an ETL tool for structured and nested data, people are forced to use UDFs for transforming arrays since the current API for array columns is lacking. This approach brings several drawbacks:
So my colleagues and I decided to extend the current Spark SQL API with well-known collection functions like concat, flatten, zipWithIndex, etc. We don't want to keep this functionality just in our fork of Spark, but would like to share it with others. |
| * @param f a function that accepts a sequence of non-null evaluation result names of children | ||
| * and returns Java code to compute the output. | ||
| */ | ||
| protected def nullSafeCodeGen( |
This method looks almost the same as the one in BinaryExpression. Can you avoid the code duplication?
We will combine it with concat
@WeichenXu123 I do agree that there are strong similarities in the code.
If you take a look at UnaryExpression, BinaryExpression, and TernaryExpression, you will see that the methods responsible for null-safe evaluation and code generation are the same except for the number of parameters. My intention has been to generalize these methods into the NullSafeEvaluation trait and remove the original methods in a different PR once the trait is in. I didn't want to create a big-bang PR because of one additional function in the API.
I feel it's ok to discuss this in follow-up activities since it's less related to this PR. So, can you make this PR as minimal as possible?
Ok, will try.
|
ok, I'll check later! |
| trait UserDefinedExpression | ||
|
|
||
| /** | ||
| * The trait covers logic for performing null save evaluation and code generation. |
typo: null safe.
| * override this. | ||
| */ | ||
| override def eval(input: InternalRow): Any = | ||
| { |
Spark usually uses a style like:
override def eval(input: InternalRow): Any = {
val values = children.map(_.eval(input))
if (values.contains(null)) {
null
} else {
nullSafeEval(values)
}
}
You could follow the style of the other code.
There are other places where the brace {} style doesn't follow the rest of the Spark codebase. We should keep the code style consistent.
I think I fixed all the style differences.
It seems the style fix was missed here.
| */ | ||
| override def eval(input: InternalRow): Any = | ||
| { | ||
| val values = children.map(_.eval(input)) |
We probably don't need to evaluate all children. Once any child expression is null, we can just return null.
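The reviewer's suggestion can be sketched in Python (a hypothetical model where each child expression is a callable over the input row; this is not the Spark implementation itself): bail out as soon as any child yields null, instead of evaluating every child and then scanning for null.

```python
def eval_short_circuit(children, row):
    """Stop as soon as any child evaluates to None instead of
    evaluating all children first and checking afterwards."""
    values = []
    for child in children:
        v = child(row)
        if v is None:
            return None  # short-circuit: remaining children never run
        values.append(v)
    return values
```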
|
|
||
| override def checkInputDataTypes(): TypeCheckResult = { | ||
| val arrayCheck = checkInputDataTypesAreArrays | ||
| if(arrayCheck.isFailure) arrayCheck |
Style issue:
if (...) {
...
} else {
...
}
| /** | ||
| * The trait covers logic for performing null save evaluation and code generation. | ||
| */ | ||
| trait NullSafeEvaluation extends Expression |
Do we need to bring in NullSafeEvaluation? If only ConcatArray uses it, we may not need to add this.
| $resultCode | ||
| """ /: children.zip(gens)) { | ||
| case (acc, (child, gen)) => | ||
| gen.code + ctx.nullSafeExec(child.nullable, gen.isNull)(acc) |
For example, for a binary expression, doesn't this generate code like:
rightGen.code + ctx.nullSafeExec(right.nullable, rightGen.isNull) {
leftGen.code + ctx.nullSafeExec(left.nullable, leftGen.isNull) {
${ev.isNull} = false; // resultCode could change nullability.
$resultCode
}
}
The evaluation order doesn't matter for deterministic expressions, but for non-deterministic ones I'm a little concerned that it may cause an unexpected change.
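The guard-nesting question can be made concrete with a rough Python sketch (my own simplified naming, not Spark's codegen API): each child contributes one null guard, and whichever child is folded last ends up outermost, i.e. runs first. Folding over the children in reverse therefore keeps left-to-right evaluation order.

```python
def gen_null_safe(child_names, result_code):
    """Nest per-child null guards by folding over the children.
    Folding in reverse puts the FIRST child's code outermost, so the
    generated code still evaluates children left to right."""
    code = result_code
    for name in reversed(child_names):
        code = f"{name}_code;\nif (!{name}_isNull) {{\n{code}\n}}"
    return code

snippet = gen_null_safe(["left", "right"], "resultCode")
# left_code is emitted before (outside) right_code
```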
| * Concatenates multiple arrays into one. | ||
| */ | ||
| @ExpressionDescription( | ||
| usage = "_FUNC_(expr, ...) - Concatenates multiple arrays into one.", |
The usage should state that the element types of the arrays must be the same.
| val primitiveValueTypeName = CodeGenerator.primitiveTypeName(elementType) | ||
| val assignments = elements.map { el => | ||
| s""" | ||
| |for(int z = 0; z < $el.numElements(); z++) { |
Style: for (
| val assignments = elements.map { el => | ||
| s""" | ||
| |for(int z = 0; z < $el.numElements(); z++) { | ||
| | if($el.isNullAt(z)) { |
Style: if ().
| |Object[] $arrayName = new Object[$numElemName]; | ||
| |int $counter = 0; | ||
| |$assignments | ||
| |$arrayDataName = new $genericArrayClass($arrayName); |
Can't we concatenate complex elements into UnsafeArrayData?
+1, can we reuse the UnsafeArrayWriter logic for this case?
Really like this idea! I think it would require moving the complex-type insertion logic from InterpretedUnsafeProjection directly into UnsafeDataWriter and thereby introducing write methods for complex-type fields. I'm not sure whether this big refactoring task is still in the scope of this PR.
Also, I see that we could improve the codegen of CreateArray in the same way.
You couldn't use UnsafeArrayData in the complex case?
Yeah, currently there are no write methods on UnsafeArrayWriter or set methods on UnsafeArrayData that we could leverage for complex types. In theory, we could follow the same approach as in InterpretedUnsafeProjection and write each complex type to a byte array, then insert the produced byte array into the target UnsafeArrayData. Since this logic could be utilized from more places (e.g. CreateArray), it should be encapsulated into UnsafeArrayWriter or UnsafeArrayData first. What do you think?
| * @group collection_funcs | ||
| * @since 2.4.0 | ||
| */ | ||
| def concat_arrays(columns: Column*): Column = withExpr { ConcatArrays(columns.map(_.expr)) } |
Do we need to add this function in sql/functions here? It seems we might recommend users to use these kinds of functions via selectExpr, so is it okay to add this only in FunctionRegistry in terms of code simplicity and maintainability? Thoughts? @viirya @gatorsmile
|
Should we handle differently (but compatibly) typed arrays in this function? Also, could you add more tests for this case in |
| > SELECT _FUNC_(array(1, 2, 3), array(4, 5), array(6)); | ||
| [1,2,3,4,5,6] | ||
| """) | ||
| case class ConcatArrays(children: Seq[Expression]) extends Expression with NullSafeEvaluation { |
Can we add a common base class (e.g., ConcatLike) for handling nested ConcatArrays in the optimizer (CombineConcats)?
spark/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala
Line 649 in e4bec7c
| object CombineConcats extends Rule[LogicalPlan] { |
|
Also, |
| Examples: | ||
| > SELECT _FUNC_(array(1, 2, 3), array(4, 5), array(6)); | ||
| [1,2,3,4,5,6] | ||
| """) |
Shall we add since too?
...
[1,2,3,4,5,6]
""",
since = "2.4.0")
| else TypeUtils.checkForSameTypeInputExpr(children.map(_.dataType), s"function $prettyName") | ||
| } | ||
|
|
||
| private def checkInputDataTypesAreArrays(): TypeCheckResult = |
Can we just put this in checkInputDataTypes?
| override def dataType: ArrayType = | ||
| children | ||
| .headOption.map(_.dataType.asInstanceOf[ArrayType]) | ||
| .getOrElse(ArrayType.defaultConcreteType.asInstanceOf[ArrayType]) |
Should we allow empty children? I can't think of a use case for now and we should better disallow it first.
Definitely share your opinion, but I think we should be consistent across the whole Spark SQL API. Functions like concat and concat_ws accept empty children as well.
Hm .. but then this is array<null> when the children are empty. Seems CreateArray's type is array<string> in this case.
Ok, changing the return type to array<string> when no children are provided. Also, I've created the JIRA ticket SPARK-23798 since I don't see any reason why it couldn't return a default concrete type in this case. Hope I didn't miss anything.
python/pyspark/sql/functions.py
| Collection function: Concatenates multiple arrays into one. | ||
| :param cols: list of column names (string) or list of :class:`Column` expressions that have | ||
| the same data type. |
Shall we note cols are expected to be array type?
|
Merged concat and concat_arrays functions into one via an unresolved expression and subsequent resolution. Do you have any objections to this approach? |
|
ok to test |
|
@mn-mikke Could you update the PR title? |
|
Test build #88596 has finished for PR 20858 at commit
|
|
retest please |
|
Test build #88598 has finished for PR 20858 at commit
|
|
retest please |
|
Test build #88605 has finished for PR 20858 at commit
|
| [Row(arr=[1, 2, 3, 4, 5]), Row(arr=None)] | ||
| """ | ||
| sc = SparkContext._active_spark_context | ||
| return Column(sc._jvm.functions.concat(_to_seq(sc, cols, _to_java_column))) |
Why did we move this down?
The whole file is divided into sections according to groups of functions. Based on @gatorsmile's suggestion, the concat function should be categorized as a collection function, so I moved it to comply with the file structure.
|
It seems that we experienced the same problem with failing "RateSourceV2Suite.basic microbatch execution" test reported here |
|
Test build #89286 has finished for PR 20858 at commit
|
|
Test build #89302 has finished for PR 20858 at commit
|
|
Test build #89323 has finished for PR 20858 at commit
|
| ("UnsafeArrayData", arrayData), | ||
| ("int[]", counter))) | ||
|
|
||
| s"""new Object() { |
nit:
s"""
|new Object() {
...
| if (inputs.contains(null)) { | ||
| null | ||
| } else { | ||
| val elements = inputs.flatMap(_.asInstanceOf[ArrayData].toObjectArray(elementType)) |
Can we always allocate a concatenated array? I think the total array element count may overflow in some cases.
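The overflow concern can be made concrete with a small sketch: on the JVM the concatenated array length must fit in a signed 32-bit int, so the total should be checked before allocation. This is illustrative Python only; the bound used here is an assumption, not the exact constant Spark uses.

```python
# Illustrative bound: JVM array lengths must fit in a signed 32-bit int,
# minus a small header allowance (the exact Spark limit may differ).
MAX_JVM_ARRAY_LENGTH = 2**31 - 1 - 8

def checked_total_elements(arrays):
    """Sum element counts with unbounded Python ints and fail fast
    before an allocation that would overflow a JVM int."""
    total = sum(len(a) for a in arrays)
    if total > MAX_JVM_ARRAY_LENGTH:
        raise ValueError(f"cannot allocate a concatenated array of {total} elements")
    return total
```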
| arguments = Seq( | ||
| (s"${javaType}[]", "args"), | ||
| ("UnsafeArrayData", arrayData), | ||
| ("int[]", counter))) |
I guess we can simply use for-loop here?
for (int $idx = 0; $idx < ${children.length}; $idx++) {
for (int z = 0; z < args[$idx].numElements(); z++) {
...
}
}
| |int[] $tempVariableName = new int[]{0}; | ||
| |$assignmentSection | ||
| |final int $numElementsConstant = $tempVariableName[0]; | ||
| """.stripMargin, |
I guess we can simply use for-loop here?
int $tempVariableName = 0;
for (int $idx = 0; $idx < ${children.length}; $idx++) {
$tempVariableName += args[$idx].numElements();
}
final int $numElementsConstant = $tempVariableName;
| |boolean[] $isNullVariable = new boolean[]{false}; | ||
| |$assignmentSection; | ||
| |if ($isNullVariable[0]) return null; | ||
| """.stripMargin |
I guess we can simply use for-loop here?
for (int $idx = 0; $idx < ${children.length}; $idx++) {
if (args[$idx] == null) {
return null;
}
}
We can return as soon as we find null in this case.
| val assignmentSection = ctx.splitExpressions( | ||
| expressions = assignments, | ||
| funcName = "complexArrayConcat", | ||
| arguments = Seq((s"${javaType}[]", "args"), ("Object[]", arrayData), ("int[]", counter))) |
I guess we can simply use for-loop here?
for (int $idx = 0; $idx < ${children.length}; $idx++) {
for (int z = 0; z < args[$idx].numElements(); z++) {
...
}
}
| val assignments = (0 until children.length).map { idx => | ||
| s""" | ||
| |for (int z = 0; z < args[$idx].numElements(); z++) { | ||
| | $arrayData[$counter[0]] = ${CodeGenerator.getValue(s"args[$idx]", elementType, "z")}; |
Don't we need to check for null here?
Here we operate only with non-primitive types, where null is treated as a regular value, so the null check shouldn't be necessary.
The added tests should cover this scenario.
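The distinction being made here — a null element versus a null array — can be sketched in plain Python (purely illustrative, names are mine):

```python
def concat_object_arrays(arrays):
    """For non-primitive element types a None ELEMENT is copied through
    as a regular value; only a None ARRAY nulls out the whole result."""
    if any(a is None for a in arrays):
        return None
    out = []
    for a in arrays:
        out.extend(a)  # None elements pass through untouched
    return out
```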
|
Test build #89402 has finished for PR 20858 at commit
|
|
Test build #89456 has finished for PR 20858 at commit
|
|
|
||
| override def dataType: DataType = children.map(_.dataType).headOption.getOrElse(StringType) | ||
|
|
||
| lazy val javaType: String = CodeGenerator.javaType(dataType) |
We can move this into doGenCode() method.
Good point! But I think it would be better to reuse javaType also in genCodeForPrimitiveArrays and genCodeForNonPrimitiveArrays.
| ev.copy(s""" | ||
| $initCode | ||
| $codes | ||
| ${javaType} ${ev.value} = $concatenator.concat($args); |
nit: $javaType
|
LGTM except for nits. |
|
Test build #89495 has finished for PR 20858 at commit
|
|
Test build #89504 has finished for PR 20858 at commit
|
|
Test build #89560 has finished for PR 20858 at commit
|
|
Test build #89573 has finished for PR 20858 at commit
|
|
Thanks! merging to master. |
|
|
||
| override def foldable: Boolean = children.forall(_.foldable) | ||
|
|
||
| override def eval(input: InternalRow): Any = dataType match { |
So this pattern match will probably cause a significant regression in the interpreted (non-codegen) mode, due to the way Scala pattern matching is implemented.
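The concern can be sketched as: resolve the type dispatch once when the evaluator is built, so the per-row path is a plain call with no match. Illustrative Python with made-up stand-in types, not Spark's actual dataType values:

```python
def make_concat_eval(data_type):
    """Hoist the type dispatch out of the per-row path: the branch on
    data_type runs once here, and the returned closure is what gets
    called for every input row."""
    if data_type == "string":
        return lambda values: "".join(values)
    # array case: flatten one level
    return lambda values: [x for v in values for x in v]

concat_strings = make_concat_eval("string")  # dispatch resolved once
concat_lists = make_concat_eval("array")
```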
What changes were proposed in this pull request?
The PR adds logic for the easy concatenation of multiple array columns and covers:
How was this patch tested?
New tests added into:
Codegen examples
Primitive-type elements
Result:
Non-primitive-type elements
Result: