Commit 77ea002

fix bug

1 parent b1914de commit 77ea002

4 files changed: +82 -72 lines changed

sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala

Lines changed: 10 additions & 37 deletions
@@ -20,8 +20,7 @@ package org.apache.spark.sql.execution
 import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream}

 import scala.collection.mutable.ArrayBuffer
-import scala.concurrent.{ExecutionContext, Future}
-import scala.concurrent.duration._
+import scala.concurrent.ExecutionContext

 import org.apache.spark.{broadcast, SparkEnv}
 import org.apache.spark.internal.Logging
@@ -138,51 +137,30 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ
   }

   /**
-   * List of (uncorrelated scalar subquery, future holding the subquery result) for this plan node.
+   * List of uncorrelated scalar subqueries for this plan node.
    * This list is populated by [[prepareSubqueries]], which is called in [[prepare]].
    */
   @transient
-  private val subqueryResults = new ArrayBuffer[(ScalarSubquery, Future[Array[InternalRow]])]
+  private var allSubqueries = new ArrayBuffer[ScalarSubquery]

   /**
    * Finds scalar subquery expressions in this plan node and starts evaluating them.
-   * The list of subqueries are added to [[subqueryResults]].
+   * The subqueries are added to [[allSubqueries]].
    */
   protected def prepareSubqueries(): Unit = {
-    val allSubqueries = expressions.flatMap(_.collect {
-      case e: ScalarSubquery if !e.isExecuted => e
-    }).distinct
-    allSubqueries.asInstanceOf[Seq[ScalarSubquery]].foreach { e =>
-      e.updateExecutedState()
-      val futureResult = Future {
-        // Each subquery should return only one row (and one column). We take two here and throws
-        // an exception later if the number of rows is greater than one.
-        e.executedPlan.executeTake(2)
-      }(SparkPlan.subqueryExecutionContext)
-      subqueryResults += e -> futureResult
+    expressions.flatMap(_.collect { case e: ScalarSubquery => e }).distinct.foreach { e =>
+      e.submitSubqueryEvaluated()
+      allSubqueries += e
     }
   }

   /**
-   * Blocks the thread until all subqueries finish evaluation and update the results.
+   * Blocks the thread until all subqueries finish evaluation.
    */
   protected def waitForSubqueries(): Unit = synchronized {
-    // fill in the result of subqueries
-    subqueryResults.foreach { case (e, futureResult) =>
-      val rows = ThreadUtils.awaitResult(futureResult, Duration.Inf)
-      if (rows.length > 1) {
-        sys.error(s"more than one row returned by a subquery used as an expression:\n${e.plan}")
-      }
-      if (rows.length == 1) {
-        assert(rows(0).numFields == 1,
-          s"Expects 1 field, but got ${rows(0).numFields}; something went wrong in analysis")
-        e.updateResult(rows(0).get(0, e.dataType))
-      } else {
-        // If there is no rows returned, the result should be null.
-        e.updateResult(null)
-      }
+    allSubqueries.foreach { e =>
+      e.awaitSubqueryResult()
     }
-    subqueryResults.clear()
   }

   /**
@@ -393,11 +371,6 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ
   }
 }

-object SparkPlan {
-  private[execution] val subqueryExecutionContext = ExecutionContext.fromExecutorService(
-    ThreadUtils.newDaemonCachedThreadPool("subquery", 16))
-}
-
 private[sql] trait LeafExecNode extends SparkPlan {
   override def children: Seq[SparkPlan] = Nil
   override def producedAttributes: AttributeSet = outputSet
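Taken together, these hunks move the submit/await bookkeeping off the plan node and onto the shared `ScalarSubquery` instance itself: a subquery reused by several plan nodes is submitted once, and every node that references it can still block on the same result. Under the old scheme, a node whose `prepareSubqueries` filtered out an already-executed subquery (`if !e.isExecuted`) had no future to wait on in `waitForSubqueries`, so it could proceed with a result that was never filled in. Below is a minimal, self-contained sketch of the submit-once/await pattern; `SharedSubquery`, `compute`, and the plain cached thread pool are illustrative stand-ins, not Spark APIs.

```scala
import java.util.concurrent.Executors
import scala.concurrent.{Await, ExecutionContext, Future}
import scala.concurrent.duration.Duration

object SubmitOnceSketch {
  // Stand-in for ThreadUtils.newDaemonCachedThreadPool("subquery", 16).
  private val pool = ExecutionContext.fromExecutorService(Executors.newCachedThreadPool())

  // One instance may be shared by many plan nodes; evaluation happens once.
  final class SharedSubquery[T](compute: () => T) {
    private var future: Future[T] = _
    private var submitted = false

    // Idempotent submit: the second and later callers reuse the first Future.
    def submit(): Unit = synchronized {
      if (!submitted) {
        future = Future(compute())(pool)
        submitted = true
      }
    }

    // Every sharing caller can block on the single shared evaluation.
    def await(): T = synchronized(Await.result(future, Duration.Inf))
  }

  def main(args: Array[String]): Unit = {
    val shared = new SharedSubquery(() => { Thread.sleep(50); 3 })
    shared.submit()
    shared.submit()           // no-op: the evaluation is not re-run
    println(shared.await())   // 3, computed exactly once
    pool.shutdown()
  }
}
```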

sql/core/src/main/scala/org/apache/spark/sql/execution/subquery.scala

Lines changed: 45 additions & 8 deletions
@@ -19,6 +19,8 @@ package org.apache.spark.sql.execution

 import scala.collection.mutable
 import scala.collection.mutable.ArrayBuffer
+import scala.concurrent.{ExecutionContext, Future}
+import scala.concurrent.duration.Duration

 import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.catalyst.expressions
@@ -28,6 +30,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCo
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.types.{DataType, StructType}
+import org.apache.spark.util.ThreadUtils

 /**
  * A subquery that will return only one row and one column.
@@ -53,15 +56,10 @@ case class ScalarSubquery(
   // the first column in first row from `query`.
   @volatile private var result: Any = null
   @volatile private var updated: Boolean = false
-  @volatile private var executed: Boolean = false
+  @volatile private var evaluated: Boolean = false
+  @volatile private var futureResult: Future[Array[InternalRow]] = _

-  def isExecuted: Boolean = executed
-
-  def updateExecutedState() : Unit = {
-    executed = true
-  }
-
-  def updateResult(v: Any): Unit = {
+  private def updateResult(v: Any): Unit = {
     result = v
     updated = true
   }
@@ -76,6 +74,40 @@ case class ScalarSubquery(
     Literal.create(result, dataType).doGenCode(ctx, ev)
   }

+  /**
+   * Submits the subquery for evaluation. Does nothing if it has already been submitted.
+   */
+  def submitSubqueryEvaluated(): Unit = synchronized {
+    if (!evaluated) {
+      futureResult = Future {
+        // Each subquery should return only one row (and one column). We take two here and throw
+        // an exception later if the number of rows is greater than one.
+        executedPlan.executeTake(2)
+      }(ScalarSubquery.subqueryExecutionContext)
+      evaluated = true
+    }
+  }
+
+  /**
+   * Blocks the thread until the evaluation of the subquery has finished.
+   */
+  def awaitSubqueryResult(): Unit = synchronized {
+    if (!updated) {
+      val rows = ThreadUtils.awaitResult(futureResult, Duration.Inf)
+      if (rows.length > 1) {
+        sys.error(s"more than one row returned by a subquery used as an expression:\n${plan}")
+      }
+      if (rows.length == 1) {
+        assert(rows(0).numFields == 1,
+          s"Expects 1 field, but got ${rows(0).numFields}; something went wrong in analysis")
+        updateResult(rows(0).get(0, dataType))
+      } else {
+        // If no rows are returned, the result should be null.
+        updateResult(null)
+      }
+    }
+  }
+
   override def equals(o: Any): Boolean = o match {
     case other: ScalarSubquery => this.eq(other)
     case _ => false
@@ -84,6 +116,11 @@ case class ScalarSubquery(
   override def hashCode: Int = exprId.hashCode()
 }

+object ScalarSubquery {
+  private[execution] val subqueryExecutionContext = ExecutionContext.fromExecutorService(
+    ThreadUtils.newDaemonCachedThreadPool("subquery", 16))
+}
+
 /**
  * A wrapper for reused uncorrelated ScalarSubquery to avoid the re-computing for subqueries with
  * the same "canonical" logical plan in a query, because uncorrelated subqueries with the same
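One detail worth spelling out in `submitSubqueryEvaluated`/`awaitSubqueryResult`: fetching at most two rows via `executeTake(2)` is enough to distinguish all three cases, zero rows (the subquery evaluates to NULL), exactly one row (the scalar value), and more than one row (an error), without materializing the full result set. A compact sketch of that case analysis; `scalarFromTake2` is a hypothetical helper, and the `Option` return stands in for the patch's `updateResult`/`sys.error` calls:

```scala
// Sketch of the three-way case analysis behind executeTake(2):
// `rows` plays the role of executedPlan.executeTake(2).
def scalarFromTake2[A](rows: Seq[A]): Option[A] = rows match {
  case Seq()  => None      // no rows: the subquery evaluates to NULL
  case Seq(v) => Some(v)   // exactly one row: the scalar value
  case _      => sys.error("more than one row returned by a subquery used as an expression")
}

assert(scalarFromTake2(Seq.empty[Int]).isEmpty)
assert(scalarFromTake2(Seq(3)).contains(3))
```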

sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala

Lines changed: 0 additions & 27 deletions
@@ -2896,31 +2896,4 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
       sql(s"SELECT '$literal' AS DUMMY"),
       Row(s"$expected") :: Nil)
   }
-
-  test("SPARK-16456: Reuse the uncorrelated scalar subqueries with the same logical plan") {
-    withTempTable("t1", "t2", "t3") {
-      val df = (1 to 3).map(i => (i, i)).toDF("key", "value")
-      df.createOrReplaceTempView("t1")
-      df.createOrReplaceTempView("t2")
-      df.createOrReplaceTempView("t3")
-      checkAnswer(
-        sql(
-          """
-            |WITH max_test AS
-            |(
-            |  SELECT max(key) as max_key FROM t1
-            |),
-            |max_test2 AS
-            |(
-            |  SELECT max(key) as max_key FROM t1
-            |)
-            |SELECT key FROM t2
-            |WHERE key = (SELECT max_key FROM max_test) and value = (SELECT max_key FROM max_test)
-            |UNION ALL
-            |SELECT key FROM t3
-            |WHERE key = (SELECT max_key FROM max_test) and value = (SELECT max_key FROM max_test2)
-          """.stripMargin
-        ), Row(3) :: Row(3) :: Nil)
-    }
-  }
 }

sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala

Lines changed: 27 additions & 0 deletions
@@ -571,4 +571,31 @@ class SubquerySuite extends QueryTest with SharedSQLContext {
       Row(1.0, false) :: Row(1.0, false) :: Row(2.0, true) :: Row(2.0, true) ::
         Row(3.0, false) :: Row(5.0, true) :: Row(null, false) :: Row(null, true) :: Nil)
   }
+
+  test("SPARK-16456: Reuse the uncorrelated scalar subqueries with the same logical plan") {
+    withTempTable("t1", "t2", "t3") {
+      val df = (1 to 3).map(i => (i, i)).toDF("key", "value")
+      df.createOrReplaceTempView("t1")
+      df.createOrReplaceTempView("t2")
+      df.createOrReplaceTempView("t3")
+      checkAnswer(
+        sql(
+          """
+            |WITH max_test AS
+            |(
+            |  SELECT max(key) as max_key FROM t1
+            |),
+            |max_test2 AS
+            |(
+            |  SELECT max(key) as max_key FROM t1
+            |)
+            |SELECT key FROM t2
+            |WHERE key = (SELECT max_key FROM max_test) and value = (SELECT max_key FROM max_test)
+            |UNION ALL
+            |SELECT key FROM t3
+            |WHERE key = (SELECT max_key FROM max_test) and value = (SELECT max_key FROM max_test2)
+          """.stripMargin
+        ), Row(3) :: Row(3) :: Nil)
+    }
+  }
 }
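Note that the test is moved verbatim from `SQLQuerySuite` (deleted above) into the more specific `SubquerySuite`. All four scalar subqueries in the query reduce to the same logical plan, `SELECT max(key) FROM t1`, so the reuse machinery should evaluate it only once; with the rows (1,1) through (3,3), `max_key` is 3, each UNION ALL branch matches exactly the (3, 3) row, and the expected answer is `Row(3) :: Row(3) :: Nil`.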
