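Address review comments: rename `cleaned` to `canonicalized`, rename the cached shuffle RDD field, match exchange node names exactly, and add clarifying comments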

Davies Liu · Davies Liu · commit 7df43ca78846 · 2016-03-08T15:35:25.000-08:00
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
@@ -239,9 +239,9 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] extends TreeNode[PlanT
   override def innerChildren: Seq[PlanType] = subqueries
 
   /**
-   * Cleaned copy of this query plan.
+   * Canonicalized copy of this query plan.
    */
-  protected lazy val cleaned: PlanType = this
+  protected lazy val canonicalized: PlanType = this
 
   /**
    * Returns true when the given query plan will return the same results as this query plan.
@@ -257,8 +257,8 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] extends TreeNode[PlanT
    * can do better should override this function.
    */
   def sameResult(plan: PlanType): Boolean = {
-    val cleanLeft = this.cleaned
-    val cleanRight = plan.cleaned
+    val cleanLeft = this.canonicalized
+    val cleanRight = plan.canonicalized
     cleanLeft.getClass == cleanRight.getClass &&
       cleanLeft.children.size == cleanRight.children.size &&
       cleanLeft.cleanArgs == cleanRight.cleanArgs &&
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
@@ -114,7 +114,7 @@ abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging {
    */
   def childrenResolved: Boolean = children.forall(_.resolved)
 
-  override lazy val cleaned: LogicalPlan = EliminateSubqueryAliases(this)
+  override lazy val canonicalized: LogicalPlan = EliminateSubqueryAliases(this)
 
   /**
    * Optionally resolves the given strings to a [[NamedExpression]] using the input from all child
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanInfo.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanInfo.scala
@@ -35,6 +35,8 @@ class SparkPlanInfo(
     val metrics: Seq[SQLMetricInfo]) {
 
   override def hashCode(): Int = {
+    // hashCode of simpleString should be good enough to distinguish the plans from each other
+    // within a plan
     simpleString.hashCode
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/Exchange.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/Exchange.scala
@@ -37,8 +37,9 @@ abstract class Exchange extends UnaryNode {
 }
 
 /**
- * A wrapper for reused exchange to have different output, which is required to resolve the
- * attributes in following plans.
+ * A wrapper for a reused exchange that carries its own output attributes. Two exchanges that
+ * produce logically identical results still have distinct output attribute ids, so the original
+ * ids are preserved because they are what downstream operators expect.
  */
 case class ReusedExchange(override val output: Seq[Attribute], child: Exchange) extends LeafNode {
 
@@ -73,15 +74,15 @@ private[sql] case class ReuseExchange(sqlContext: SQLContext) extends Rule[Spark
     val exchanges = mutable.HashMap[StructType, ArrayBuffer[Exchange]]()
     plan.transformUp {
       case exchange: Exchange =>
+        // Exchanges with the same result usually also have the same schema (same column names).
         val sameSchema = exchanges.getOrElseUpdate(exchange.schema, ArrayBuffer[Exchange]())
         val samePlan = sameSchema.find { e =>
           exchange.sameResult(e)
         }
         if (samePlan.isDefined) {
           // Keep the output of this exchange, the following plans require that to resolve
           // attributes.
-          val reused = ReusedExchange(exchange.output, samePlan.get)
-          reused
+          ReusedExchange(exchange.output, samePlan.get)
         } else {
           sameSchema += exchange
           exchange
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchange.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchange.scala
@@ -104,12 +104,12 @@ case class ShuffleExchange(
   /**
    * Caches the created ShuffleRowRDD so we can reuse that.
    */
-  private var shuffleRDD: ShuffledRowRDD = null
+  private var cachedShuffleRDD: ShuffledRowRDD = null
 
   protected override def doExecute(): RDD[InternalRow] = attachTree(this, "execute") {
     // Returns the same ShuffleRowRDD if this plan is used by multiple plans.
-    if (shuffleRDD == null) {
-      shuffleRDD = coordinator match {
+    if (cachedShuffleRDD == null) {
+      cachedShuffleRDD = coordinator match {
         case Some(exchangeCoordinator) =>
           val shuffleRDD = exchangeCoordinator.postShuffleRDD(this)
           assert(shuffleRDD.partitions.length == newPartitioning.numPartitions)
@@ -119,7 +119,7 @@ case class ShuffleExchange(
           preparePostShuffleRDD(shuffleDependency)
       }
     }
-    shuffleRDD
+    cachedShuffleRDD
   }
 }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SparkPlanGraph.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SparkPlanGraph.scala
@@ -104,8 +104,7 @@ private[sql] object SparkPlanGraph {
         } else {
           subgraph.nodes += node
         }
-        // ShuffleExchange or BroadcastExchange
-        if (name.endsWith("Exchange")) {
+        if (name == "ShuffleExchange" || name == "BroadcastExchange") {
           exchanges += planInfo -> node
         }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -1341,6 +1341,7 @@ class DataFrameSuite extends QueryTest with SharedSQLContext {
     val df = sqlContext.range(100)
     val agg1 = df.groupBy().count()
     val agg2 = df.groupBy().count()
+    // two aggregates with different ExprId within them should have same result
     agg1.queryExecution.executedPlan.sameResult(agg2.queryExecution.executedPlan)
   }
 

Original file line number	Diff line number	Diff line change
`@@ -35,6 +35,8 @@ class SparkPlanInfo(`
`35`	`35`	`val metrics: Seq[SQLMetricInfo]) {`
`36`	`36`
`37`	`37`	`override def hashCode(): Int = {`
	`38`	`+ // hashCode of simpleString should be good enough to distinguish the plans from each other`
	`39`	`+ // within a plan`
`38`	`40`	`simpleString.hashCode`
`39`	`41`	`}`
`40`	`42`
Original file line number	Diff line number	Diff line change
`@@ -104,12 +104,12 @@ case class ShuffleExchange(`
`104`	`104`	`/**`
`105`	`105`	`* Caches the created ShuffleRowRDD so we can reuse that.`
`106`	`106`	`*/`
`107`		`- private var shuffleRDD: ShuffledRowRDD = null`
	`107`	`+ private var cachedShuffleRDD: ShuffledRowRDD = null`
`108`	`108`
`109`	`109`	`protected override def doExecute(): RDD[InternalRow] = attachTree(this, "execute") {`
`110`	`110`	`// Returns the same ShuffleRowRDD if this plan is used by multiple plans.`
`111`		`- if (shuffleRDD == null) {`
`112`		`- shuffleRDD = coordinator match {`
	`111`	`+ if (cachedShuffleRDD == null) {`
	`112`	`+ cachedShuffleRDD = coordinator match {`
`113`	`113`	`case Some(exchangeCoordinator) =>`
`114`	`114`	`val shuffleRDD = exchangeCoordinator.postShuffleRDD(this)`
`115`	`115`	`assert(shuffleRDD.partitions.length == newPartitioning.numPartitions)`
`@@ -119,7 +119,7 @@ case class ShuffleExchange(`
`119`	`119`	`preparePostShuffleRDD(shuffleDependency)`
`120`	`120`	`}`
`121`	`121`	`}`
`122`		`- shuffleRDD`
	`122`	`+ cachedShuffleRDD`
`123`	`123`	`}`
`124`	`124`	`}`
`125`	`125`
Original file line number	Diff line number	Diff line change
`@@ -104,8 +104,7 @@ private[sql] object SparkPlanGraph {`
`104`	`104`	`} else {`
`105`	`105`	`subgraph.nodes += node`
`106`	`106`	`}`
`107`		`- // ShuffleExchange or BroadcastExchange`
`108`		`- if (name.endsWith("Exchange")) {`
	`107`	`+ if (name == "ShuffleExchange" \|\| name == "BroadcastExchange") {`
`109`	`108`	`exchanges += planInfo -> node`
`110`	`109`	`}`
`111`	`110`
Original file line number	Diff line number	Diff line change
`@@ -1341,6 +1341,7 @@ class DataFrameSuite extends QueryTest with SharedSQLContext {`
`1341`	`1341`	`val df = sqlContext.range(100)`
`1342`	`1342`	`val agg1 = df.groupBy().count()`
`1343`	`1343`	`val agg2 = df.groupBy().count()`
	`1344`	`+ // two aggregates with different ExprId within them should have same result`
`1344`	`1345`	`agg1.queryExecution.executedPlan.sameResult(agg2.queryExecution.executedPlan)`
`1345`	`1346`	`}`
`1346`	`1347`