Commit 6bd09b1

fix
1 parent d0e9219 commit 6bd09b1

5 files changed: +29 −18 lines changed

docs/sql-data-sources-parquet.md

Lines changed: 0 additions & 12 deletions
@@ -295,18 +295,6 @@ Configuration of Parquet can be done using the `setConf` method on `SparkSession
     </p>
   </td>
 </tr>
-<tr>
-  <td><code>spark.sql.optimizer.metadataOnly</code></td>
-  <td>true</td>
-  <td>
-    <p>
-      When true, enable the metadata-only query optimization that use the table's metadata to
-      produce the partition columns instead of table scans. It applies when all the columns scanned
-      are partition columns and the query has an aggregate operator that satisfies distinct
-      semantics.
-    </p>
-  </td>
-</tr>
 <tr>
   <td><code>spark.sql.parquet.writeLegacyFormat</code></td>
   <td>false</td>
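
For context: the removed row documents when the metadata-only optimization can kick in, namely queries that scan only partition columns and aggregate them with distinct semantics. A minimal Scala sketch of such a query, assuming a local SparkSession and an illustrative partitioned table named `events` (names are not from this commit):

import org.apache.spark.sql.SparkSession

// Illustrative sketch: build a small partitioned Parquet table.
val spark = SparkSession.builder().master("local[*]").appName("metadata-only-sketch").getOrCreate()
spark.sql("CREATE TABLE events (id INT, dt STRING) USING parquet PARTITIONED BY (dt)")
spark.sql("INSERT INTO events PARTITION (dt = '2019-01-01') VALUES (1)")

// Both queries touch only the partition column `dt`, so with
// spark.sql.optimizer.metadataOnly=true they can be answered from the
// catalog's partition metadata instead of scanning data files.
spark.sql("SELECT DISTINCT dt FROM events").show()
spark.sql("SELECT MAX(dt) FROM events").show()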

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 3 additions & 2 deletions
@@ -585,9 +585,10 @@ object SQLConf {
     .doc("When true, enable the metadata-only query optimization that use the table's metadata " +
       "to produce the partition columns instead of table scans. It applies when all the columns " +
       "scanned are partition columns and the query has an aggregate operator that satisfies " +
-      "distinct semantics.")
+      "distinct semantics. By default the optimization is disabled, since it may return " +
+      "incorrect results with empty tables.")
     .booleanConf
-    .createWithDefault(true)
+    .createWithDefault(false)
 
   val COLUMN_NAME_OF_CORRUPT_RECORD = buildConf("spark.sql.columnNameOfCorruptRecord")
     .doc("The name of internal column for storing raw/un-parsed JSON and CSV records that fail " +

sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala

Lines changed: 12 additions & 1 deletion
@@ -2422,7 +2422,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
       Row(s"$expected") :: Nil)
   }
 
-  test("SPARK-15752 optimize metadata only query for datasource table") {
+  ignore("SPARK-15752 optimize metadata only query for datasource table") {
     withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") {
       withTable("srcpart_15752") {
         val data = (1 to 10).map(i => (i, s"data-$i", i % 2, if ((i % 2) == 0) "a" else "b"))
@@ -2966,6 +2966,17 @@
       }
     }
   }
+
+  test("SPARK-26709: OptimizeMetadataOnlyQuery does not handle empty records correctly") {
+    withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "false") {
+      withTable("t") {
+        sql("CREATE TABLE t (col1 INT, p1 INT) USING PARQUET PARTITIONED BY (p1)")
+        sql("INSERT INTO TABLE t PARTITION (p1 = 5) SELECT ID FROM range(1, 1)")
+        checkAnswer(sql("SELECT MAX(p1) FROM t"), Row(null))
+        checkAnswer(sql("SELECT MAX(col1) FROM t"), Row(null))
+      }
+    }
+  }
 }
 
 case class Foo(bar: Option[String])
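
A sketch of the failure mode this test guards against (illustrative, not part of the commit): the INSERT below writes zero rows, yet the static partition p1 = 5 is still registered in the catalog, so a metadata-only plan can answer the aggregate from partition metadata and report 5 where the correct answer is NULL. Assumes a SparkSession named `spark`; table and column names mirror the test above.

// The behaviour being disabled by default in this commit.
spark.sql("SET spark.sql.optimizer.metadataOnly=true")
spark.sql("CREATE TABLE t (col1 INT, p1 INT) USING PARQUET PARTITIONED BY (p1)")
spark.sql("INSERT INTO TABLE t PARTITION (p1 = 5) SELECT ID FROM range(1, 1)")  // range(1, 1) is empty
// With the optimization on, MAX(p1) may come from partition metadata and
// return 5 even though the table holds no rows; NULL is the correct result.
spark.sql("SELECT MAX(p1) FROM t").show()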

sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala

Lines changed: 2 additions & 2 deletions
@@ -58,7 +58,7 @@ class OptimizeMetadataOnlyQuerySuite extends QueryTest with SharedSQLContext {
   }
 
   private def testMetadataOnly(name: String, sqls: String*): Unit = {
-    test(name) {
+    ignore(name) {
       withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") {
         sqls.foreach { case q => assertMetadataOnlyQuery(sql(q)) }
       }
@@ -69,7 +69,7 @@
   }
 
   private def testNotMetadataOnly(name: String, sqls: String*): Unit = {
-    test(name) {
+    ignore(name) {
       withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") {
         sqls.foreach { case q => assertNotMetadataOnlyQuery(sql(q)) }
       }

sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala

Lines changed: 12 additions & 1 deletion
@@ -86,6 +86,17 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
     assert(message.contains("Table or view not found"))
   }
 
+  test("SPARK-26709: OptimizeMetadataOnlyQuery does not handle empty records correctly") {
+    withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "false") {
+      withTable("t") {
+        sql("CREATE TABLE t (col1 INT, p1 INT) USING PARQUET PARTITIONED BY (p1)")
+        sql("INSERT INTO TABLE t PARTITION (p1 = 5) SELECT ID FROM range(1, 1)")
+        checkAnswer(sql("SELECT MAX(p1) FROM t"), Row(null))
+        checkAnswer(sql("SELECT MAX(col1) FROM t"), Row(null))
+      }
+    }
+  }
+
   test("script") {
     assume(TestUtils.testCommandAvailable("/bin/bash"))
     assume(TestUtils.testCommandAvailable("echo | sed"))
@@ -1770,7 +1781,7 @@
     }
   }
 
-  test("SPARK-15752 optimize metadata only query for hive table") {
+  ignore("SPARK-15752 optimize metadata only query for hive table") {
     withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") {
       withTable("data_15752", "srcpart_15752", "srctext_15752") {
         val df = Seq((1, "2"), (3, "4")).toDF("key", "value")
