Commit 29d184c

[SPARK-22861][SQL] SQLAppStatusListener handles multi-job executions.

When one execution has multiple jobs, we need to append to the set of stages, not replace it on every job start.

1 parent 9962390
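
For context, a minimal standalone sketch (Scala; not taken from the patch, names are illustrative) of why plain assignment loses stages when one execution runs several jobs, while set union accumulates them:

// Minimal sketch, not Spark code: `exec.stages` behaves like a
// mutable Set[Int] var on the live execution entity.
object StageSetDemo {
  def main(args: Array[String]): Unit = {
    var stages = Set.empty[Int]

    // Old behavior: each job start *replaces* the set,
    // so job 0's stages vanish when job 1 starts.
    stages = Set(0, 1) // job 0 starts
    stages = Set(2, 3) // job 1 starts
    assert(stages == Set(2, 3)) // stages 0 and 1 were dropped

    // Fixed behavior: `++=` takes the union, keeping all stages.
    stages = Set.empty[Int]
    stages ++= Set(0, 1) // job 0 starts
    stages ++= Set(2, 3) // job 1 starts
    assert(stages == Set(0, 1, 2, 3))
  }
}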

2 files changed: +41 −2 lines

sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala (1 addition, 2 deletions)

@@ -31,7 +31,6 @@ import org.apache.spark.sql.internal.StaticSQLConf._
 import org.apache.spark.status.{ElementTrackingStore, KVUtils, LiveEntity}
 import org.apache.spark.status.config._
 import org.apache.spark.ui.SparkUI
-import org.apache.spark.util.kvstore.KVStore
 
 private[sql] class SQLAppStatusListener(
     conf: SparkConf,
@@ -88,7 +87,7 @@ private[sql] class SQLAppStatusListener(
     }
 
     exec.jobs = exec.jobs + (jobId -> JobExecutionStatus.RUNNING)
-    exec.stages = event.stageIds.toSet
+    exec.stages ++= event.stageIds.toSet
     update(exec)
   }
 
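To see the fixed line in context, here is a simplified, self-contained paraphrase of the job-start handling (the event and execution types are stand-ins, not the real Spark classes; only the last two statements of onJobStart mirror the diff):

// Sketch with stand-in types; assumes the listener keeps one live
// execution object per SQL execution and updates it on each job start.
object ListenerSketch {
  case class JobStart(jobId: Int, stageIds: Seq[Int])

  class LiveExecution {
    var jobs: Map[Int, String] = Map.empty
    var stages: Set[Int] = Set.empty
  }

  def onJobStart(exec: LiveExecution, event: JobStart): Unit = {
    exec.jobs = exec.jobs + (event.jobId -> "RUNNING")
    exec.stages ++= event.stageIds.toSet // union, as in the fix
  }

  def main(args: Array[String]): Unit = {
    val exec = new LiveExecution
    onJobStart(exec, JobStart(0, Seq(0, 1)))
    onJobStart(exec, JobStart(1, Seq(2, 3)))
    assert(exec.stages == Set(0, 1, 2, 3)) // all four stages tracked
  }
}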

sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala (40 additions, 0 deletions)

@@ -361,6 +361,46 @@ class SQLListenerSuite extends SparkFunSuite with SharedSQLContext with JsonTest
     assertJobs(store.execution(0), failed = Seq(0))
   }
 
+  sqlStoreTest("handle one execution with multiple jobs") { (store, bus) =>
+    val executionId = 0
+    val df = createTestDataFrame
+    bus.postToAll(SparkListenerSQLExecutionStart(
+      executionId,
+      "test",
+      "test",
+      df.queryExecution.toString,
+      SparkPlanInfo.fromSparkPlan(df.queryExecution.executedPlan),
+      System.currentTimeMillis()))
+
+    var stageId = 0
+    def twoStageJob(jobId: Int): Unit = {
+      val stages = Seq(stageId, stageId + 1).map { id => createStageInfo(id, 0) }
+      stageId += 2
+      bus.postToAll(SparkListenerJobStart(
+        jobId = jobId,
+        time = System.currentTimeMillis(),
+        stageInfos = stages,
+        createProperties(executionId)))
+      stages.foreach { s =>
+        bus.postToAll(SparkListenerStageSubmitted(s))
+        bus.postToAll(SparkListenerStageCompleted(s))
+      }
+      bus.postToAll(SparkListenerJobEnd(
+        jobId = jobId,
+        time = System.currentTimeMillis(),
+        JobSucceeded
+      ))
+    }
+    // submit two jobs with the same executionId
+    twoStageJob(0)
+    twoStageJob(1)
+    bus.postToAll(SparkListenerSQLExecutionEnd(
+      executionId, System.currentTimeMillis()))
+
+    assertJobs(store.execution(0), completed = 0 to 1)
+    assert(store.execution(0).get.stages === (0 to 3).toSet)
+  }
+
   test("SPARK-11126: no memory leak when running non SQL jobs") {
     val previousStageNumber = statusStore.executionsList().size
     spark.sparkContext.parallelize(1 to 10).foreach(i => ())
