
Commit f11aa7b

mn-mikke authored and committed
[SPARK-23821][SQL] Merging current master to the feature branch.
2 parents 9081291 + d5bec48 commit f11aa7b

11 files changed: +208 additions, −37 deletions


python/pyspark/sql/functions.py

Lines changed: 17 additions & 0 deletions
@@ -1845,6 +1845,23 @@ def array_contains(col, value):
     return Column(sc._jvm.functions.array_contains(_to_java_column(col), value))
 
 
+@since(2.4)
+def array_position(col, value):
+    """
+    Collection function: Locates the position of the first occurrence of the given value
+    in the given array. Returns null if either of the arguments are null.
+
+    .. note:: The position is not zero based, but 1 based index. Returns 0 if the given
+        value could not be found in the array.
+
+    >>> df = spark.createDataFrame([(["c", "b", "a"],), ([],)], ['data'])
+    >>> df.select(array_position(df.data, "a")).collect()
+    [Row(array_position(data, a)=3), Row(array_position(data, a)=0)]
+    """
+    sc = SparkContext._active_spark_context
+    return Column(sc._jvm.functions.array_position(_to_java_column(col), value))
+
+
 @since(1.4)
 def explode(col):
     """Returns a new row for each element in the given array or map.

python/pyspark/streaming/kafka.py

Lines changed: 2 additions & 1 deletion
@@ -104,7 +104,8 @@ def createDirectStream(ssc, topics, kafkaParams, fromOffsets=None,
         :param topics: list of topic_name to consume.
         :param kafkaParams: Additional params for Kafka.
         :param fromOffsets: Per-topic/partition Kafka offsets defining the (inclusive) starting
-                            point of the stream.
+                            point of the stream (a dictionary mapping `TopicAndPartition` to
+                            integers).
         :param keyDecoder: A function used to decode key (default is utf8_decoder).
         :param valueDecoder: A function used to decode value (default is utf8_decoder).
         :param messageHandler: A function used to convert KafkaMessageAndMetadata. You can assess
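To illustrate the clarified parameter, here is a minimal sketch of passing explicit starting offsets; the broker address, topic name, and offset are placeholders, and `ssc` is assumed to be an existing StreamingContext. fromOffsets maps each TopicAndPartition to the integer offset at which to start.

from pyspark.streaming.kafka import KafkaUtils, TopicAndPartition

# Start reading topic "logs", partition 0, at offset 0 (all values are placeholders).
fromOffsets = {TopicAndPartition("logs", 0): 0}
stream = KafkaUtils.createDirectStream(
    ssc, ["logs"], {"metadata.broker.list": "broker:9092"},
    fromOffsets=fromOffsets)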

python/pyspark/streaming/listener.py

Lines changed: 6 additions & 0 deletions
@@ -23,6 +23,12 @@ class StreamingListener(object):
     def __init__(self):
         pass
 
+    def onStreamingStarted(self, streamingStarted):
+        """
+        Called when the streaming has been started.
+        """
+        pass
+
     def onReceiverStarted(self, receiverStarted):
         """
         Called when a receiver has been started
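A small sketch of how a user-defined listener might use the new hook; the listener class is hypothetical and `ssc` is assumed to be an existing StreamingContext.

from pyspark.streaming.listener import StreamingListener

class StartTimeListener(StreamingListener):
    """Records the time reported when the StreamingContext starts."""

    def __init__(self):
        super(StartTimeListener, self).__init__()
        self.started_at = None

    def onStreamingStarted(self, streamingStarted):
        # streamingStarted.time carries the reported start time.
        self.started_at = streamingStarted.time

ssc.addStreamingListener(StartTimeListener())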

python/pyspark/streaming/tests.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -507,6 +507,10 @@ def __init__(self):
507507
self.batchInfosCompleted = []
508508
self.batchInfosStarted = []
509509
self.batchInfosSubmitted = []
510+
self.streamingStartedTime = []
511+
512+
def onStreamingStarted(self, streamingStarted):
513+
self.streamingStartedTime.append(streamingStarted.time)
510514

511515
def onBatchSubmitted(self, batchSubmitted):
512516
self.batchInfosSubmitted.append(batchSubmitted.batchInfo())
@@ -530,9 +534,12 @@ def func(dstream):
530534
batchInfosSubmitted = batch_collector.batchInfosSubmitted
531535
batchInfosStarted = batch_collector.batchInfosStarted
532536
batchInfosCompleted = batch_collector.batchInfosCompleted
537+
streamingStartedTime = batch_collector.streamingStartedTime
533538

534539
self.wait_for(batchInfosCompleted, 4)
535540

541+
self.assertEqual(len(streamingStartedTime), 1)
542+
536543
self.assertGreaterEqual(len(batchInfosSubmitted), 4)
537544
for info in batchInfosSubmitted:
538545
self.assertGreaterEqual(info.batchTime().milliseconds(), 0)

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala

Lines changed: 1 addition & 0 deletions
@@ -402,6 +402,7 @@ object FunctionRegistry {
     // collection functions
     expression[CreateArray]("array"),
     expression[ArrayContains]("array_contains"),
+    expression[ArrayPosition]("array_position"),
     expression[CreateMap]("map"),
     expression[CreateNamedStruct]("named_struct"),
     expression[MapKeys]("map_keys"),
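Registering the expression under the name array_position is what makes it callable from SQL; a minimal sketch of exercising it that way, assuming an active SparkSession `spark`:

# Mirrors the example in the ExpressionDescription below: 1 first appears at index 3.
spark.sql("SELECT array_position(array(3, 2, 1), 1)").show()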

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala

Lines changed: 56 additions & 0 deletions
@@ -508,6 +508,62 @@ case class ArrayMax(child: Expression) extends UnaryExpression with ImplicitCast
   override def prettyName: String = "array_max"
 }
 
+
+/**
+ * Returns the position of the first occurrence of element in the given array as long.
+ * Returns 0 if the given value could not be found in the array. Returns null if either of
+ * the arguments are null.
+ *
+ * NOTE: that this is not zero based, but 1-based index. The first element in the array has
+ * index 1.
+ */
+@ExpressionDescription(
+  usage = """
+    _FUNC_(array, element) - Returns the (1-based) index of the first element of the array as long.
+  """,
+  examples = """
+    Examples:
+      > SELECT _FUNC_(array(3, 2, 1), 1);
+       3
+  """,
+  since = "2.4.0")
+case class ArrayPosition(left: Expression, right: Expression)
+  extends BinaryExpression with ImplicitCastInputTypes {
+
+  override def dataType: DataType = LongType
+  override def inputTypes: Seq[AbstractDataType] =
+    Seq(ArrayType, left.dataType.asInstanceOf[ArrayType].elementType)
+
+  override def nullSafeEval(arr: Any, value: Any): Any = {
+    arr.asInstanceOf[ArrayData].foreach(right.dataType, (i, v) =>
+      if (v == value) {
+        return (i + 1).toLong
+      }
+    )
+    0L
+  }
+
+  override def prettyName: String = "array_position"
+
+  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+    nullSafeCodeGen(ctx, ev, (arr, value) => {
+      val pos = ctx.freshName("arrayPosition")
+      val i = ctx.freshName("i")
+      val getValue = CodeGenerator.getValue(arr, right.dataType, i)
+      s"""
+         |int $pos = 0;
+         |for (int $i = 0; $i < $arr.numElements(); $i ++) {
+         |  if (!$arr.isNullAt($i) && ${ctx.genEqual(right.dataType, value, getValue)}) {
+         |    $pos = $i + 1;
+         |    break;
+         |  }
+         |}
+         |${ev.value} = (long) $pos;
+       """.stripMargin
+    })
+  }
+}
+
 /**
  * Transforms an array of arrays into a single array.
  */
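The expression's three result regimes (a 1-based position on a hit, 0 when the value is absent, null when the array argument is null) can also be sketched from SQL, again assuming an active SparkSession `spark`:

# Expected columns: found = 4, missing = 0, null_input = NULL.
spark.sql("""
    SELECT array_position(array(1, 7, 2, 3), 3) AS found,
           array_position(array(1, 7, 2, 3), 5) AS missing,
           array_position(CAST(NULL AS ARRAY<INT>), 1) AS null_input
""").show()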

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayData.scala

Lines changed: 4 additions & 3 deletions
@@ -141,28 +141,29 @@ abstract class ArrayData extends SpecializedGetters with Serializable {
 
   def toArray[T: ClassTag](elementType: DataType): Array[T] = {
     val size = numElements()
+    val accessor = InternalRow.getAccessor(elementType)
     val values = new Array[T](size)
     var i = 0
     while (i < size) {
       if (isNullAt(i)) {
         values(i) = null.asInstanceOf[T]
       } else {
-        values(i) = get(i, elementType).asInstanceOf[T]
+        values(i) = accessor(this, i).asInstanceOf[T]
       }
       i += 1
     }
     values
   }
 
-  // todo: specialize this.
   def foreach(elementType: DataType, f: (Int, Any) => Unit): Unit = {
     val size = numElements()
+    val accessor = InternalRow.getAccessor(elementType)
     var i = 0
     while (i < size) {
       if (isNullAt(i)) {
         f(i, null)
       } else {
-        f(i, get(i, elementType))
+        f(i, accessor(this, i))
       }
       i += 1
     }

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala

Lines changed: 22 additions & 0 deletions
@@ -170,6 +170,28 @@ class CollectionExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper
     checkEvaluation(Reverse(aa), Seq(Seq("e"), Seq("c", "d"), Seq("a", "b")))
   }
 
+  test("Array Position") {
+    val a0 = Literal.create(Seq(1, null, 2, 3), ArrayType(IntegerType))
+    val a1 = Literal.create(Seq[String](null, ""), ArrayType(StringType))
+    val a2 = Literal.create(Seq(null), ArrayType(LongType))
+    val a3 = Literal.create(null, ArrayType(StringType))
+
+    checkEvaluation(ArrayPosition(a0, Literal(3)), 4L)
+    checkEvaluation(ArrayPosition(a0, Literal(1)), 1L)
+    checkEvaluation(ArrayPosition(a0, Literal(0)), 0L)
+    checkEvaluation(ArrayPosition(a0, Literal.create(null, IntegerType)), null)
+
+    checkEvaluation(ArrayPosition(a1, Literal("")), 2L)
+    checkEvaluation(ArrayPosition(a1, Literal("a")), 0L)
+    checkEvaluation(ArrayPosition(a1, Literal.create(null, StringType)), null)
+
+    checkEvaluation(ArrayPosition(a2, Literal(1L)), 0L)
+    checkEvaluation(ArrayPosition(a2, Literal.create(null, LongType)), null)
+
+    checkEvaluation(ArrayPosition(a3, Literal("")), null)
+    checkEvaluation(ArrayPosition(a3, Literal.create(null, StringType)), null)
+  }
+
   test("Flatten") {
     // Primitive-type test cases
     val intArrayType = ArrayType(ArrayType(IntegerType))

sql/core/src/main/scala/org/apache/spark/sql/functions.scala

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3038,6 +3038,20 @@ object functions {
30383038
ArrayContains(column.expr, Literal(value))
30393039
}
30403040

3041+
/**
3042+
* Locates the position of the first occurrence of the value in the given array as long.
3043+
* Returns null if either of the arguments are null.
3044+
*
3045+
* @note The position is not zero based, but 1 based index. Returns 0 if value
3046+
* could not be found in array.
3047+
*
3048+
* @group collection_funcs
3049+
* @since 2.4.0
3050+
*/
3051+
def array_position(column: Column, value: Any): Column = withExpr {
3052+
ArrayPosition(column.expr, Literal(value))
3053+
}
3054+
30413055
/**
30423056
* Creates a new row for each element in the given array or map column.
30433057
*

sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -535,6 +535,40 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext {
535535
}
536536
}
537537

538+
test("array position function") {
539+
val df = Seq(
540+
(Seq[Int](1, 2), "x"),
541+
(Seq[Int](), "x")
542+
).toDF("a", "b")
543+
544+
checkAnswer(
545+
df.select(array_position(df("a"), 1)),
546+
Seq(Row(1L), Row(0L))
547+
)
548+
checkAnswer(
549+
df.selectExpr("array_position(a, 1)"),
550+
Seq(Row(1L), Row(0L))
551+
)
552+
553+
checkAnswer(
554+
df.select(array_position(df("a"), null)),
555+
Seq(Row(null), Row(null))
556+
)
557+
checkAnswer(
558+
df.selectExpr("array_position(a, null)"),
559+
Seq(Row(null), Row(null))
560+
)
561+
562+
checkAnswer(
563+
df.selectExpr("array_position(array(array(1), null)[0], 1)"),
564+
Seq(Row(1L), Row(1L))
565+
)
566+
checkAnswer(
567+
df.selectExpr("array_position(array(1, null), array(1, null)[0])"),
568+
Seq(Row(1L), Row(1L))
569+
)
570+
}
571+
538572
test("flatten function") {
539573
val dummyFilter = (c: Column) => c.isNull || c.isNotNull // to switch codeGen on
540574
val oneRowDF = Seq((1, "a", Seq(1, 2, 3))).toDF("i", "s", "arr")
