From 5128fe27aa265b7359a914bd508ff366b9762544 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Thu, 19 Jun 2014 08:00:31 +0100 Subject: [PATCH 1/2] Make sure InsertIntoHiveTable doesn't execute each time you ask for its result. --- .../org/apache/spark/sql/hive/execution/hiveOperators.scala | 6 +++++- .../apache/spark/sql/hive/execution/HiveQuerySuite.scala | 6 ++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/hiveOperators.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/hiveOperators.scala index a839231449161..240aa0df4935a 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/hiveOperators.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/hiveOperators.scala @@ -344,12 +344,16 @@ case class InsertIntoHiveTable( writer.commitJob() } + override def execute() = result + /** * Inserts all the rows in the table into Hive. Row objects are properly serialized with the * `org.apache.hadoop.hive.serde2.SerDe` and the * `org.apache.hadoop.mapred.OutputFormat` provided by the table definition. + * + * Note: this is run once and then kept to avoid double insertions. */ - def execute() = { + private lazy val result: RDD[Row] = { val childRdd = child.execute() assert(childRdd != null) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index fe698f0fc57b8..ae9f6acbcec0e 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -28,6 +28,12 @@ import org.apache.spark.sql.{SchemaRDD, execution, Row} */ class HiveQuerySuite extends HiveComparisonTest { + test("create as table as runs once") { + hql("CREATE TABLE foo AS SELECT 1 FROM src LIMIT 1").collect() + assert(hql("SELECT COUNT(*) FROM foo").collect().head.getLong(0) === 1, + "Incorrect number of rows in created table") + } + createQueryTest("between", "SELECT * FROM src WHERE key Between 1 and 2") From 9c6d9e4218dff661b4536b09b84dc314d903983e Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Thu, 19 Jun 2014 08:03:29 +0100 Subject: [PATCH 2/2] Fix typo. --- .../org/apache/spark/sql/hive/execution/HiveQuerySuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index ae9f6acbcec0e..c5dc45cdbca4d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.{SchemaRDD, execution, Row} */ class HiveQuerySuite extends HiveComparisonTest { - test("create as table as runs once") { + test("CREATE TABLE AS runs once") { hql("CREATE TABLE foo AS SELECT 1 FROM src LIMIT 1").collect() assert(hql("SELECT COUNT(*) FROM foo").collect().head.getLong(0) === 1, "Incorrect number of rows in created table")