
Commit 8b16630

another fix
1 parent 326c82c

4 files changed: +11, -14 lines


sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala

Lines changed: 3 additions & 1 deletion
@@ -1418,12 +1418,14 @@ class DataFrame private[sql](
   lazy val rdd: RDD[Row] = {
     // use a local variable to make sure the map closure doesn't capture the whole DataFrame
     val schema = this.schema
-    queryExecution.executedPlan.execute().mapPartitions { rows =>
+    internalRowRdd.mapPartitions { rows =>
       val converter = CatalystTypeConverters.createToScalaConverter(schema)
       rows.map(converter(_).asInstanceOf[Row])
     }
   }
 
+  private[sql] def internalRowRdd = queryExecution.executedPlan.execute()
+
   /**
    * Returns the content of the [[DataFrame]] as a [[JavaRDD]] of [[Row]]s.
    * @group rdd
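
For reference, a minimal usage sketch (not part of this commit) contrasting the two entry points. It assumes a DataFrame named df already in scope and calling code that lives inside the org.apache.spark.sql package, since internalRowRdd is private[sql]:

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row

// Public API: every row produced by the executed plan is converted into an external
// Scala Row through CatalystTypeConverters, exactly as the hunk above shows.
val externalRows: RDD[Row] = df.rdd

// New internal helper: the raw output of the executed physical plan, with no per-row
// conversion. Internal callers can aggregate over these Catalyst rows directly and
// decide only at the very end whether any conversion is needed.
val internalRows = df.internalRowRdd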

sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala

Lines changed: 5 additions & 10 deletions
@@ -22,7 +22,6 @@ import scala.collection.mutable.{Map => MutableMap}
 import org.apache.spark.Logging
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
-import org.apache.spark.sql.catalyst.CatalystTypeConverters
 import org.apache.spark.sql.types.{ArrayType, StructField, StructType}
 import org.apache.spark.sql.{Column, DataFrame}
 
@@ -91,7 +90,7 @@ private[sql] object FrequentItems extends Logging {
       (name, originalSchema.fields(index).dataType)
     }
 
-    val freqItems = df.select(cols.map(Column(_)) : _*).rdd.aggregate(countMaps)(
+    val freqItems = df.select(cols.map(Column(_)) : _*).internalRowRdd.aggregate(countMaps)(
       seqOp = (counts, row) => {
         var i = 0
         while (i < numCols) {
@@ -111,17 +110,13 @@
         baseCounts
       }
     )
-
+    val justItems = freqItems.map(m => m.baseMap.keys.toSeq)
+    val resultRow = InternalRow(justItems : _*)
     // append frequent Items to the column name for easy debugging
     val outputCols = colInfo.map { v =>
       StructField(v._1 + "_freqItems", ArrayType(v._2, false))
     }
-    val schema = StructType(outputCols)
-
-    val converter = CatalystTypeConverters.createToCatalystConverter(schema)
-    val justItems = freqItems.map(m => m.baseMap.keys.toSeq)
-    val resultRow = converter(InternalRow(justItems : _*)).asInstanceOf[InternalRow]
-
-    new DataFrame(df.sqlContext, LocalRelation(schema.toAttributes, Seq(resultRow)))
+    val schema = StructType(outputCols).toAttributes
+    new DataFrame(df.sqlContext, LocalRelation(schema, Seq(resultRow)))
   }
 }
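
The CatalystTypeConverters round-trip can be dropped here because the aggregation now runs over internalRowRdd, so the collected values are already in Catalyst's internal representation and the single result row can be handed straight to LocalRelation. A hedged sketch of that pattern as a stand-alone helper (hypothetical name and placement, not part of this commit; it relies only on calls that appear in this diff and assumes it lives under org.apache.spark.sql so the private[sql] DataFrame constructor is visible):

package org.apache.spark.sql.execution.stat

import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.sql.catalyst.expressions._   // brings InternalRow into scope, as in FrequentItems
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
import org.apache.spark.sql.types.StructType

private[sql] object SingleRowRelationSketch {
  // Builds a one-row DataFrame from values that are already in the internal format.
  // Before this commit, FrequentItems first pushed the values through
  // CatalystTypeConverters.createToCatalystConverter(schema); that step is now gone.
  def singleRowDataFrame(sqlContext: SQLContext, schema: StructType, values: Seq[Any]): DataFrame = {
    val resultRow = InternalRow(values : _*)
    new DataFrame(sqlContext, LocalRelation(schema.toAttributes, Seq(resultRow)))
  }
}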

sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala

Lines changed: 1 addition & 1 deletion
@@ -81,7 +81,7 @@ private[sql] object StatFunctions extends Logging {
         s"with dataType ${data.get.dataType} not supported.")
     }
     val columns = cols.map(n => Column(Cast(Column(n).expr, DoubleType)))
-    df.select(columns: _*).rdd.aggregate(new CovarianceCounter)(
+    df.select(columns: _*).internalRowRdd.aggregate(new CovarianceCounter)(
       seqOp = (counter, row) => {
         counter.add(row.getDouble(0), row.getDouble(1))
       },

sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala

Lines changed: 2 additions & 2 deletions
@@ -154,7 +154,7 @@ private[sql] case class InsertIntoHadoopFsRelation(
     writerContainer.driverSideSetup()
 
     try {
-      df.sqlContext.sparkContext.runJob(df.queryExecution.executedPlan.execute(), writeRows _)
+      df.sqlContext.sparkContext.runJob(df.internalRowRdd, writeRows _)
       writerContainer.commitJob()
       relation.refresh()
     } catch { case cause: Throwable =>
@@ -220,7 +220,7 @@ private[sql] case class InsertIntoHadoopFsRelation(
     writerContainer.driverSideSetup()
 
     try {
-      df.sqlContext.sparkContext.runJob(df.queryExecution.executedPlan.execute(), writeRows _)
+      df.sqlContext.sparkContext.runJob(df.internalRowRdd, writeRows _)
       writerContainer.commitJob()
       relation.refresh()
     } catch { case cause: Throwable =>
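
Both write paths now obtain the row RDD through the shared helper rather than reaching into the query execution themselves. As a hedged illustration (assuming a DataFrame df in scope inside the org.apache.spark.sql package), the two expressions below are interchangeable after this commit, since internalRowRdd is defined as exactly this plan execution; runJob then hands each partition of internal rows to writeRows as before:

// Equivalent by definition of the new helper in DataFrame.scala.
val viaHelper = df.internalRowRdd
val viaPlan   = df.queryExecution.executedPlan.execute()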
