
Commit a5647d9: hide the queryExecution of DataFrame
1 parent fbfd3c5

File tree: 2 files changed, 11 additions(+), 4 deletions(-)

sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala

Lines changed: 5 additions & 3 deletions
@@ -115,7 +115,7 @@ private[sql] object DataFrame {
 @Experimental
 class DataFrame private[sql](
     @transient val sqlContext: SQLContext,
-    @DeveloperApi @transient var queryExecution: SQLContext#QueryExecution)
+    @DeveloperApi @transient private var _queryExecution: SQLContext#QueryExecution)
   extends RDDApi[Row] with Serializable {

   /**
@@ -134,6 +134,8 @@ class DataFrame private[sql](
     })
   }

+  @DeveloperApi def queryExecution: SQLContext#QueryExecution = _queryExecution
+
   @transient protected[sql] val logicalPlan: LogicalPlan = queryExecution.logical match {
     // For various commands (like DDL) and queries with side effects, we force query optimization to
     // happen right away to let these side effects take place eagerly.
@@ -1317,7 +1319,7 @@ class DataFrame private[sql](
    */
   override def persist(newLevel: StorageLevel): this.type = {
     sqlContext.cacheManager.cacheQuery(this, None, newLevel)
-    this.queryExecution = new sqlContext.QueryExecution(this.queryExecution.logical)
+    this._queryExecution = new sqlContext.QueryExecution(this.queryExecution.logical)
     this
   }

@@ -1327,7 +1329,7 @@ class DataFrame private[sql](
    */
   override def unpersist(blocking: Boolean): this.type = {
     sqlContext.cacheManager.tryUncacheQuery(this, blocking)
-    this.queryExecution = new sqlContext.QueryExecution(this.queryExecution.logical)
+    this._queryExecution = new sqlContext.QueryExecution(this.queryExecution.logical)
     this
   }
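The change above replaces the public, mutable queryExecution constructor parameter with a private _queryExecution field exposed through a @DeveloperApi read-only accessor, so persist/unpersist can still swap in a fresh QueryExecution internally while callers can no longer re-assign it. Below is a minimal sketch of that pattern using a hypothetical Holder class; the names Holder, state, and refresh are illustrative only, not Spark APIs.

// A minimal sketch (hypothetical class, not Spark code) of the pattern applied above:
// keep the mutable field private, expose only a read-only accessor, and let the
// class re-assign the field internally, as persist/unpersist do with _queryExecution.
class Holder(private var _state: String) {
  // callers get a read-only view
  def state: String = _state

  def refresh(): this.type = {
    _state = "refreshed: " + _state  // internal re-assignment is still allowed
    this
  }
}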

sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala

Lines changed: 6 additions & 1 deletion
@@ -122,15 +122,20 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils {
     import org.apache.spark.sql.types._

     val df = Seq(Tuple1(1), Tuple1(2), Tuple1(3)).toDF("index")
+    // we expect the id to be materialized only once
     def id: () => String = () => { UUID.randomUUID().toString() }

-    // Expect the ID to have materialized at this point
     val dfWithId = df.withColumn("id", callUDF(id, StringType))
+    // Make a new DataFrame (actually the same reference to the old one)
     val cached = dfWithId.cache()
+    // Trigger the cache
     val d0 = dfWithId.collect()
     val d1 = cached.collect()
     val d2 = cached.collect()

+    // Since the ID is materialized only once, all of the records should
+    // come from the cache rather than being re-computed; otherwise, the
+    // IDs would differ.
     assert(d0.map(_(0)) === d2.map(_(0)))
     assert(d0.map(_(1)) === d2.map(_(1)))
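The test attaches a non-deterministic UUID column, caches the DataFrame, and collects it repeatedly; if the later collects are served from the cache, the UUIDs are identical, whereas re-computation would produce new ones. Below is a hedged, self-contained sketch of the same idea using functions.udf rather than the callUDF overload in the test; the demo method and its argument are illustrative and not part of this change.

import java.util.UUID

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.functions.udf

// Assumes an SQLContext is already available, e.g. from a test fixture.
def demo(sqlContext: SQLContext): Unit = {
  import sqlContext.implicits._

  // Non-deterministic column: a fresh UUID every time the expression is evaluated.
  val randomId = udf(() => UUID.randomUUID().toString)

  val withId = Seq(Tuple1(1), Tuple1(2), Tuple1(3)).toDF("index").withColumn("id", randomId())
  val cached = withId.cache()

  val first  = cached.collect()
  val second = cached.collect()

  // If the second collect is served from the cache, the random ids are unchanged.
  assert(first.map(_.getString(1)).toSeq == second.map(_.getString(1)).toSeq)
}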
