add comment

cloud-fan · cloud-fan · commit 959dda64b331 · 2019-01-29T22:42:42.000+08:00
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlan.scala
@@ -28,9 +28,9 @@ import org.apache.spark.sql.execution.{LeafExecNode, SparkPlan, SparkPlanInfo, S
 import org.apache.spark.sql.execution.ui.SparkListenerSQLAdaptiveExecutionUpdate
 
 /**
- * A root node to trigger query stages and execute the query plan adaptively. It incrementally
+ * A root node to execute the query plan adaptively. It creates query stages, and incrementally
  * updates the query plan when a query stage is materialized and provides accurate runtime
- * statistics.
+ * data statistics.
  */
 case class AdaptiveSparkPlan(initialPlan: SparkPlan, session: SparkSession)
   extends LeafExecNode{
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/InsertAdaptiveSparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/InsertAdaptiveSparkPlan.scala
@@ -22,6 +22,12 @@ import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.execution.SparkPlan
 import org.apache.spark.sql.execution.command.ExecutedCommandExec
 
+/**
+ * This rule wraps the query plan with an [[AdaptiveSparkPlan]], which executes the query plan
+ * adaptively with runtime data statistics. Note that this rule must be run after
+ * [[org.apache.spark.sql.execution.exchange.EnsureRequirements]], so that the exchange nodes are
+ * already inserted.
+ */
 case class InsertAdaptiveSparkPlan(session: SparkSession) extends Rule[SparkPlan] {
 
   override def apply(plan: SparkPlan): SparkPlan = plan match {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/QueryStage.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/QueryStage.scala
@@ -29,8 +29,9 @@ import org.apache.spark.sql.execution._
 import org.apache.spark.sql.execution.exchange._
 
 /**
- * In adaptive execution mode, an execution plan is divided into multiple QueryStages w.r.t. the
- * exchange as boundary. Each QueryStage is a sub-tree that runs in a single Spark stage.
+ * A query stage is an individual sub-tree of a query plan, which can be executed ahead and provide
+ * accurate data statistics. For example, a sub-tree under shuffle/broadcast node is a query stage.
+ * Each query stage runs in a single Spark job/stage.
  */
 abstract class QueryStage extends LeafExecNode {
 
@@ -96,7 +97,7 @@ case class ResultQueryStage(id: Int, plan: SparkPlan) extends QueryStage {
 }
 
 /**
- * A shuffle QueryStage whose child is a ShuffleExchangeExec.
+ * A shuffle QueryStage whose child is a [[ShuffleExchangeExec]].
  */
 case class ShuffleQueryStage(id: Int, plan: ShuffleExchangeExec) extends QueryStage {
 
@@ -120,7 +121,7 @@ case class ShuffleQueryStage(id: Int, plan: ShuffleExchangeExec) extends QuerySt
 }
 
 /**
- * A broadcast QueryStage whose child is a BroadcastExchangeExec.
+ * A broadcast QueryStage whose child is a [[BroadcastExchangeExec]].
  */
 case class BroadcastQueryStage(id: Int, plan: BroadcastExchangeExec) extends QueryStage {
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/QueryStageCreator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/QueryStageCreator.scala
@@ -32,7 +32,31 @@ import org.apache.spark.util.{EventLoop, ThreadUtils}
 /**
  * This class dynamically creates [[QueryStage]] bottom-up, optimize the query plan of query stages
  * and materialize them. It creates as many query stages as possible at the same time, and
- * creates/optimizes a query stage when all its child stages are materialized.
+ * materializes a query stage when all its child stages are materialized.
+ *
+ * To create query stages, we traverse the query tree bottom up. When we hit an exchange node, and
+ * all the child query stages of this exchange node are materialized, we try to create a new query
+ * stage for this exchange node.
+ *
+ * To create a new query stage, we first optimize the sub-tree of the exchange. After optimization,
+ * we check the output partitioning of the optimized sub-tree, and see if the exchange node is still
+ * necessary.
+ *
+ * If the exchange node becomes unnecessary, remove it and give up this query stage creation, and
+ * continue to traverse the query plan tree until we hit the next exchange node.
+ *
+ * If the exchange node is still needed, create the query stage and optimize its sub-tree again.
+ * It's necessary to have both the pre-creation optimization and post-creation optimization, because
+ * these 2 optimizations have different assumptions. For pre-creation optimization, the shuffle node
+ * may be removed later on and the current sub-tree may be only a part of a query stage, so we don't
+ * have the big picture of the query stage yet. For post-creation optimization, the query stage is
+ * created and we have the big picture of the query stage.
+ *
+ * After the query stage is optimized, we materialize it asynchronously, and continue to traverse
+ * the query plan tree to create more query stages.
+ *
+ * When a query stage completes materialization, we trigger the process of query stage creation and
+ * traverse the query plan tree again.
  */
 class QueryStageCreator(
     initialPlan: SparkPlan,
@@ -48,16 +72,24 @@ class QueryStageCreator(
 
   private val stageCache = mutable.HashMap.empty[StructType, mutable.Buffer[(Exchange, QueryStage)]]
 
-  private val phaseOneOptimizerRules: Seq[Rule[SparkPlan]] = Seq(
+  // The optimizer rules that will be applied to a sub-tree of the query plan before the stage is
+  // created. Note that we may end up not creating the query stage, so the rules here should not
+  // assume the given sub-plan-tree is the entire query plan of the query stage. For example, if a
+  // rule wants to collect all the child query stages, it should not be put here.
+  private val preStageCreationOptimizerRules: Seq[Rule[SparkPlan]] = Seq(
     AssertChildStagesMaterialized
   )
 
-  private val phaseTwoOptimizerRules: Seq[Rule[SparkPlan]] = Seq(
+  // The optimizer rules that will be applied to a sub-tree of the query plan after the stage is
+  // created. Note that once the stage is created, we will not remove it anymore. If a rule changes
+  // the output partitioning of the sub-plan-tree, which may help to remove the exchange node, it's
+  // better to put it in `preStageCreationOptimizerRules`, so that we may create fewer query stages.
+  private val postStageCreationOptimizerRules: Seq[Rule[SparkPlan]] = Seq(
     ReduceNumShufflePartitions(conf),
     CollapseCodegenStages(conf),
     ReuseSubquery(conf))
 
-  private var currentPlan = createBottomQueryStages(initialPlan)
+  private var currentPlan = createQueryStages(initialPlan)
 
   private implicit def executionContext: ExecutionContextExecutorService = {
     QueryStageCreator.executionContext
@@ -80,26 +112,29 @@ class QueryStageCreator(
         stop()
       } else {
         readyStages += stage.id
-        currentPlan = createBottomQueryStages(currentPlan)
+        currentPlan = createQueryStages(currentPlan)
       }
   }
 
-  private def phaseOneOptimize(plan: SparkPlan): SparkPlan = {
-    phaseOneOptimizerRules.foldLeft(plan) {
+  private def preStageCreationOptimize(plan: SparkPlan): SparkPlan = {
+    preStageCreationOptimizerRules.foldLeft(plan) {
       case (current, rule) => rule(current)
     }
   }
 
-  private def phaseTwoOptimize(plan: SparkPlan): SparkPlan = {
-    phaseTwoOptimizerRules.foldLeft(plan) {
+  private def postStageCreationOptimize(plan: SparkPlan): SparkPlan = {
+    postStageCreationOptimizerRules.foldLeft(plan) {
       case (current, rule) => rule(current)
     }
   }
 
-  private def createBottomQueryStages(plan: SparkPlan): SparkPlan = {
-    val result = tryCreateQueryStage(plan)
-    if (result.stageReady) {
-      val finalPlan = phaseTwoOptimize(phaseOneOptimize(result.newPlan))
+  /**
+   * Traverses the query plan bottom-up and creates as many query stages as possible.
+   */
+  private def createQueryStages(plan: SparkPlan): SparkPlan = {
+    val result = createQueryStages0(plan)
+    if (result.allChildStagesReady) {
+      val finalPlan = postStageCreationOptimize(preStageCreationOptimize(result.newPlan))
       post(StageReady(ResultQueryStage(currentStageId, finalPlan)))
       finalPlan
     } else {
@@ -108,57 +143,70 @@ class QueryStageCreator(
     }
   }
 
-  private def tryCreateQueryStage(plan: SparkPlan): CreateStageResult = plan match {
+  /**
+   * This method is called recursively to traverse the plan tree bottom-up. It returns two pieces of
+   * information: 1) the new plan after we insert query stages. 2) whether or not the child query
+   * stages of the new plan are all ready.
+   *
+   * If the current plan is an exchange node, and all its child query stages are ready, we try to
+   * create a new query stage.
+   */
+  private def createQueryStages0(plan: SparkPlan): CreateStageResult = plan match {
     case e: Exchange =>
       val similarStages = stageCache.getOrElseUpdate(e.schema, mutable.Buffer.empty)
       similarStages.find(_._1.sameResult(e)) match {
         case Some((_, existingStage)) if conf.exchangeReuseEnabled =>
           CreateStageResult(
             newPlan = ReusedQueryStage(existingStage, e.output),
-            stageReady = readyStages.contains(existingStage.id))
+            allChildStagesReady = readyStages.contains(existingStage.id))
 
         case _ =>
-          val result = tryCreateQueryStage(e.child)
-          if (result.stageReady) {
-            val optimizedPlan = phaseOneOptimize(result.newPlan)
+          val result = createQueryStages0(e.child)
+          // Try to create a query stage only when all the child query stages are ready.
+          if (result.allChildStagesReady) {
+            val optimizedPlan = preStageCreationOptimize(result.newPlan)
             e match {
               case s: ShuffleExchangeExec =>
                 (s.desiredPartitioning, optimizedPlan.outputPartitioning) match {
                   case (desired: HashPartitioning, actual: HashPartitioning)
                       if desired.semanticEquals(actual) =>
                     // This shuffle exchange is unnecessary now, remove it. The reason maybe:
-                    //   1. the child plan has changed its output partitioning, and makes this
-                    //      exchange node unnecessary.
+                    //   1. the child plan has changed its output partitioning after optimization,
+                    //      and makes this exchange node unnecessary.
                     //   2. this exchange node is user specified, which turns out to be unnecessary.
-                    CreateStageResult(newPlan = optimizedPlan, stageReady = true)
+                    CreateStageResult(newPlan = optimizedPlan, allChildStagesReady = true)
                   case _ =>
                     val queryStage = createQueryStage(s.copy(child = optimizedPlan))
                     similarStages.append(e -> queryStage)
-                    CreateStageResult(newPlan = queryStage, stageReady = false)
+                    // We've created a new stage, which is obviously not ready yet.
+                    CreateStageResult(newPlan = queryStage, allChildStagesReady = false)
                 }
 
               case b: BroadcastExchangeExec =>
                 val queryStage = createQueryStage(b.copy(child = optimizedPlan))
                 similarStages.append(e -> queryStage)
-                CreateStageResult(newPlan = queryStage, stageReady = false)
+                // We've created a new stage, which is obviously not ready yet.
+                CreateStageResult(newPlan = queryStage, allChildStagesReady = false)
             }
           } else {
-            CreateStageResult(newPlan = e.withNewChildren(Seq(result.newPlan)), stageReady = false)
+            CreateStageResult(
+              newPlan = e.withNewChildren(Seq(result.newPlan)),
+              allChildStagesReady = false)
           }
       }
 
     case q: QueryStage =>
-      CreateStageResult(newPlan = q, stageReady = readyStages.contains(q.id))
+      CreateStageResult(newPlan = q, allChildStagesReady = readyStages.contains(q.id))
 
     case _ =>
-      val results = plan.children.map(tryCreateQueryStage)
+      val results = plan.children.map(createQueryStages0)
       CreateStageResult(
         newPlan = plan.withNewChildren(results.map(_.newPlan)),
-        stageReady = results.forall(_.stageReady))
+        allChildStagesReady = results.forall(_.allChildStagesReady))
   }
 
   private def createQueryStage(e: Exchange): QueryStage = {
-    val optimizedPlan = phaseTwoOptimize(e.child)
+    val optimizedPlan = postStageCreationOptimize(e.child)
     val queryStage = e match {
       case s: ShuffleExchangeExec =>
         ShuffleQueryStage(currentStageId, s.copy(child = optimizedPlan))
@@ -173,7 +221,7 @@ class QueryStageCreator(
   override protected def onError(e: Throwable): Unit = callback.onError(e)
 }
 
-case class CreateStageResult(newPlan: SparkPlan, stageReady: Boolean)
+case class CreateStageResult(newPlan: SparkPlan, allChildStagesReady: Boolean)
 
 object QueryStageCreator {
   private val executionContext = ExecutionContext.fromExecutorService(