From 6bd09b1bcc0667f8bae5694a6bff293398dd168d Mon Sep 17 00:00:00 2001
From: Gengliang Wang
Date: Thu, 24 Jan 2019 17:02:45 +0800
Subject: [PATCH 1/7] fix

---
 docs/sql-data-sources-parquet.md                   | 12 ------------
 .../org/apache/spark/sql/internal/SQLConf.scala    |  5 +++--
 .../scala/org/apache/spark/sql/SQLQuerySuite.scala | 13 ++++++++++++-
 .../execution/OptimizeMetadataOnlyQuerySuite.scala |  4 ++--
 .../spark/sql/hive/execution/SQLQuerySuite.scala   | 13 ++++++++++++-
 5 files changed, 29 insertions(+), 18 deletions(-)

diff --git a/docs/sql-data-sources-parquet.md b/docs/sql-data-sources-parquet.md
index dcd2936518465..5532bf9fa3f66 100644
--- a/docs/sql-data-sources-parquet.md
+++ b/docs/sql-data-sources-parquet.md
@@ -295,18 +295,6 @@ Configuration of Parquet can be done using the `setConf` method on `SparkSession
-<tr>
-  <td><code>spark.sql.optimizer.metadataOnly</code></td>
-  <td>true</td>
-  <td>
-    When true, enable the metadata-only query optimization that use the table's metadata to
-    produce the partition columns instead of table scans. It applies when all the columns scanned
-    are partition columns and the query has an aggregate operator that satisfies distinct
-    semantics.
-  </td>
-</tr>
- - spark.sql.parquet.writeLegacyFormat false diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 6b301c3c9cb5b..f6113447a1484 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -585,9 +585,10 @@ object SQLConf { .doc("When true, enable the metadata-only query optimization that use the table's metadata " + "to produce the partition columns instead of table scans. It applies when all the columns " + "scanned are partition columns and the query has an aggregate operator that satisfies " + - "distinct semantics.") + "distinct semantics. By default the optimization is disabled, since it may return " + + "incorrect results with empty tables.") .booleanConf - .createWithDefault(true) + .createWithDefault(false) val COLUMN_NAME_OF_CORRUPT_RECORD = buildConf("spark.sql.columnNameOfCorruptRecord") .doc("The name of internal column for storing raw/un-parsed JSON and CSV records that fail " + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 806f0b2239fe6..68cd3a0f3eef3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -2422,7 +2422,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { Row(s"$expected") :: Nil) } - test("SPARK-15752 optimize metadata only query for datasource table") { + ignore("SPARK-15752 optimize metadata only query for datasource table") { withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { withTable("srcpart_15752") { val data = (1 to 10).map(i => (i, s"data-$i", i % 2, if ((i % 2) == 0) "a" else "b")) @@ -2966,6 +2966,17 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } } } + + test("SPARK-26709: OptimizeMetadataOnlyQuery does not handle empty records correctly") { + withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "false") { + withTable("t") { + sql("CREATE TABLE t (col1 INT, p1 INT) USING PARQUET PARTITIONED BY (p1)") + sql("INSERT INTO TABLE t PARTITION (p1 = 5) SELECT ID FROM range(1, 1)") + checkAnswer(sql("SELECT MAX(p1) FROM t"), Row(null)) + checkAnswer(sql("SELECT MAX(col1) FROM t"), Row(null)) + } + } + } } case class Foo(bar: Option[String]) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala index a543eb8351656..f37802309be97 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala @@ -58,7 +58,7 @@ class OptimizeMetadataOnlyQuerySuite extends QueryTest with SharedSQLContext { } private def testMetadataOnly(name: String, sqls: String*): Unit = { - test(name) { + ignore(name) { withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { sqls.foreach { case q => assertMetadataOnlyQuery(sql(q)) } } @@ -69,7 +69,7 @@ class OptimizeMetadataOnlyQuerySuite extends QueryTest with SharedSQLContext { } private def testNotMetadataOnly(name: String, sqls: String*): Unit = { - test(name) { + ignore(name) { withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { sqls.foreach { case q => assertNotMetadataOnlyQuery(sql(q)) } 
} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 70efad103d13e..df4d536a9c9c2 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -86,6 +86,17 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { assert(message.contains("Table or view not found")) } + test("SPARK-26709: OptimizeMetadataOnlyQuery does not handle empty records correctly") { + withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "false") { + withTable("t") { + sql("CREATE TABLE t (col1 INT, p1 INT) USING PARQUET PARTITIONED BY (p1)") + sql("INSERT INTO TABLE t PARTITION (p1 = 5) SELECT ID FROM range(1, 1)") + checkAnswer(sql("SELECT MAX(p1) FROM t"), Row(null)) + checkAnswer(sql("SELECT MAX(col1) FROM t"), Row(null)) + } + } + } + test("script") { assume(TestUtils.testCommandAvailable("/bin/bash")) assume(TestUtils.testCommandAvailable("echo | sed")) @@ -1770,7 +1781,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } } - test("SPARK-15752 optimize metadata only query for hive table") { + ignore("SPARK-15752 optimize metadata only query for hive table") { withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { withTable("data_15752", "srcpart_15752", "srctext_15752") { val df = Seq((1, "2"), (3, "4")).toDF("key", "value") From 1c81586d7c7b3a7a9785719c486a6d857265eb89 Mon Sep 17 00:00:00 2001 From: Gengliang Wang Date: Thu, 24 Jan 2019 22:35:14 +0800 Subject: [PATCH 2/7] revert the whole rule --- .../apache/spark/sql/internal/SQLConf.scala | 11 -- .../execution/OptimizeMetadataOnlyQuery.scala | 173 ------------------ .../spark/sql/execution/SparkOptimizer.scala | 1 - .../org/apache/spark/sql/SQLQuerySuite.scala | 47 ----- .../OptimizeMetadataOnlyQuerySuite.scala | 150 --------------- .../OptimizeHiveMetadataOnlyQuerySuite.scala | 78 -------- .../sql/hive/execution/SQLQuerySuite.scala | 100 ---------- 7 files changed, 560 deletions(-) delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala delete mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala delete mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/OptimizeHiveMetadataOnlyQuerySuite.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index f6113447a1484..516193bdeaaa8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -581,15 +581,6 @@ object SQLConf { .checkValues(HiveCaseSensitiveInferenceMode.values.map(_.toString)) .createWithDefault(HiveCaseSensitiveInferenceMode.INFER_AND_SAVE.toString) - val OPTIMIZER_METADATA_ONLY = buildConf("spark.sql.optimizer.metadataOnly") - .doc("When true, enable the metadata-only query optimization that use the table's metadata " + - "to produce the partition columns instead of table scans. It applies when all the columns " + - "scanned are partition columns and the query has an aggregate operator that satisfies " + - "distinct semantics. 
By default the optimization is disabled, since it may return " + - "incorrect results with empty tables.") - .booleanConf - .createWithDefault(false) - val COLUMN_NAME_OF_CORRUPT_RECORD = buildConf("spark.sql.columnNameOfCorruptRecord") .doc("The name of internal column for storing raw/un-parsed JSON and CSV records that fail " + "to parse.") @@ -1784,8 +1775,6 @@ class SQLConf extends Serializable with Logging { def gatherFastStats: Boolean = getConf(GATHER_FASTSTAT) - def optimizerMetadataOnly: Boolean = getConf(OPTIMIZER_METADATA_ONLY) - def wholeStageEnabled: Boolean = getConf(WHOLESTAGE_CODEGEN_ENABLED) def wholeStageUseIdInClassName: Boolean = getConf(WHOLESTAGE_CODEGEN_USE_ID_IN_CLASS_NAME) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala deleted file mode 100644 index 3ca03ab2939aa..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala +++ /dev/null @@ -1,173 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.execution - -import java.util.Locale - -import org.apache.spark.sql.AnalysisException -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.catalog.{HiveTableRelation, SessionCatalog} -import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.expressions.aggregate._ -import org.apache.spark.sql.catalyst.planning.PhysicalOperation -import org.apache.spark.sql.catalyst.plans.logical._ -import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils} -import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} -import org.apache.spark.sql.internal.SQLConf - -/** - * This rule optimizes the execution of queries that can be answered by looking only at - * partition-level metadata. This applies when all the columns scanned are partition columns, and - * the query has an aggregate operator that satisfies the following conditions: - * 1. aggregate expression is partition columns. - * e.g. SELECT col FROM tbl GROUP BY col. - * 2. aggregate function on partition columns with DISTINCT. - * e.g. SELECT col1, count(DISTINCT col2) FROM tbl GROUP BY col1. - * 3. aggregate function on partition columns which have same result w or w/o DISTINCT keyword. - * e.g. SELECT col1, Max(col2) FROM tbl GROUP BY col1. 
- */ -case class OptimizeMetadataOnlyQuery(catalog: SessionCatalog) extends Rule[LogicalPlan] { - - def apply(plan: LogicalPlan): LogicalPlan = { - if (!SQLConf.get.optimizerMetadataOnly) { - return plan - } - - plan.transform { - case a @ Aggregate(_, aggExprs, child @ PhysicalOperation( - projectList, filters, PartitionedRelation(partAttrs, rel))) => - // We only apply this optimization when only partitioned attributes are scanned. - if (AttributeSet((projectList ++ filters).flatMap(_.references)).subsetOf(partAttrs)) { - // The project list and filters all only refer to partition attributes, which means the - // the Aggregator operator can also only refer to partition attributes, and filters are - // all partition filters. This is a metadata only query we can optimize. - val aggFunctions = aggExprs.flatMap(_.collect { - case agg: AggregateExpression => agg - }) - val isAllDistinctAgg = aggFunctions.forall { agg => - agg.isDistinct || (agg.aggregateFunction match { - // `Max`, `Min`, `First` and `Last` are always distinct aggregate functions no matter - // they have DISTINCT keyword or not, as the result will be same. - case _: Max => true - case _: Min => true - case _: First => true - case _: Last => true - case _ => false - }) - } - if (isAllDistinctAgg) { - a.withNewChildren(Seq(replaceTableScanWithPartitionMetadata(child, rel, filters))) - } else { - a - } - } else { - a - } - } - } - - /** - * Returns the partition attributes of the table relation plan. - */ - private def getPartitionAttrs( - partitionColumnNames: Seq[String], - relation: LogicalPlan): Seq[Attribute] = { - val attrMap = relation.output.map(a => a.name.toLowerCase(Locale.ROOT) -> a).toMap - partitionColumnNames.map { colName => - attrMap.getOrElse(colName.toLowerCase(Locale.ROOT), - throw new AnalysisException(s"Unable to find the column `$colName` " + - s"given [${relation.output.map(_.name).mkString(", ")}]") - ) - } - } - - /** - * Transform the given plan, find its table scan nodes that matches the given relation, and then - * replace the table scan node with its corresponding partition values. - */ - private def replaceTableScanWithPartitionMetadata( - child: LogicalPlan, - relation: LogicalPlan, - partFilters: Seq[Expression]): LogicalPlan = { - // this logic comes from PruneFileSourcePartitions. it ensures that the filter names match the - // relation's schema. 
PartitionedRelation ensures that the filters only reference partition cols - val normalizedFilters = partFilters.map { e => - e transform { - case a: AttributeReference => - a.withName(relation.output.find(_.semanticEquals(a)).get.name) - } - } - - child transform { - case plan if plan eq relation => - relation match { - case l @ LogicalRelation(fsRelation: HadoopFsRelation, _, _, isStreaming) => - val partAttrs = getPartitionAttrs(fsRelation.partitionSchema.map(_.name), l) - val partitionData = fsRelation.location.listFiles(normalizedFilters, Nil) - LocalRelation(partAttrs, partitionData.map(_.values), isStreaming) - - case relation: HiveTableRelation => - val partAttrs = getPartitionAttrs(relation.tableMeta.partitionColumnNames, relation) - val caseInsensitiveProperties = - CaseInsensitiveMap(relation.tableMeta.storage.properties) - val timeZoneId = caseInsensitiveProperties.get(DateTimeUtils.TIMEZONE_OPTION) - .getOrElse(SQLConf.get.sessionLocalTimeZone) - val partitions = if (partFilters.nonEmpty) { - catalog.listPartitionsByFilter(relation.tableMeta.identifier, normalizedFilters) - } else { - catalog.listPartitions(relation.tableMeta.identifier) - } - - val partitionData = partitions.map { p => - InternalRow.fromSeq(partAttrs.map { attr => - Cast(Literal(p.spec(attr.name)), attr.dataType, Option(timeZoneId)).eval() - }) - } - LocalRelation(partAttrs, partitionData) - - case _ => - throw new IllegalStateException(s"unrecognized table scan node: $relation, " + - s"please turn off ${SQLConf.OPTIMIZER_METADATA_ONLY.key} and try again.") - } - } - } - - /** - * A pattern that finds the partitioned table relation node inside the given plan, and returns a - * pair of the partition attributes and the table relation node. - */ - object PartitionedRelation extends PredicateHelper { - - def unapply(plan: LogicalPlan): Option[(AttributeSet, LogicalPlan)] = { - plan match { - case l @ LogicalRelation(fsRelation: HadoopFsRelation, _, _, _) - if fsRelation.partitionSchema.nonEmpty => - val partAttrs = AttributeSet(getPartitionAttrs(fsRelation.partitionSchema.map(_.name), l)) - Some((partAttrs, l)) - - case relation: HiveTableRelation if relation.tableMeta.partitionColumnNames.nonEmpty => - val partAttrs = AttributeSet( - getPartitionAttrs(relation.tableMeta.partitionColumnNames, relation)) - Some((partAttrs, relation)) - - case _ => None - } - } - } -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala index 6c6d344240cea..5e5255ec91e8c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala @@ -30,7 +30,6 @@ class SparkOptimizer( extends Optimizer(catalog) { override def defaultBatches: Seq[Batch] = (preOptimizationBatches ++ super.defaultBatches :+ - Batch("Optimize Metadata Only Query", Once, OptimizeMetadataOnlyQuery(catalog)) :+ Batch("Extract Python UDFs", Once, Seq(ExtractPythonUDFFromAggregate, ExtractPythonUDFs): _*) :+ Batch("Prune File Source Table Partitions", Once, PruneFileSourcePartitions) :+ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 68cd3a0f3eef3..2a33101c23dbe 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -2422,42 +2422,6 @@ class SQLQuerySuite 
extends QueryTest with SharedSQLContext { Row(s"$expected") :: Nil) } - ignore("SPARK-15752 optimize metadata only query for datasource table") { - withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { - withTable("srcpart_15752") { - val data = (1 to 10).map(i => (i, s"data-$i", i % 2, if ((i % 2) == 0) "a" else "b")) - .toDF("col1", "col2", "partcol1", "partcol2") - data.write.partitionBy("partcol1", "partcol2").mode("append").saveAsTable("srcpart_15752") - checkAnswer( - sql("select partcol1 from srcpart_15752 group by partcol1"), - Row(0) :: Row(1) :: Nil) - checkAnswer( - sql("select partcol1 from srcpart_15752 where partcol1 = 1 group by partcol1"), - Row(1)) - checkAnswer( - sql("select partcol1, count(distinct partcol2) from srcpart_15752 group by partcol1"), - Row(0, 1) :: Row(1, 1) :: Nil) - checkAnswer( - sql("select partcol1, count(distinct partcol2) from srcpart_15752 where partcol1 = 1 " + - "group by partcol1"), - Row(1, 1) :: Nil) - checkAnswer(sql("select distinct partcol1 from srcpart_15752"), Row(0) :: Row(1) :: Nil) - checkAnswer(sql("select distinct partcol1 from srcpart_15752 where partcol1 = 1"), Row(1)) - checkAnswer( - sql("select distinct col from (select partcol1 + 1 as col from srcpart_15752 " + - "where partcol1 = 1) t"), - Row(2)) - checkAnswer(sql("select max(partcol1) from srcpart_15752"), Row(1)) - checkAnswer(sql("select max(partcol1) from srcpart_15752 where partcol1 = 1"), Row(1)) - checkAnswer(sql("select max(partcol1) from (select partcol1 from srcpart_15752) t"), Row(1)) - checkAnswer( - sql("select max(col) from (select partcol1 + 1 as col from srcpart_15752 " + - "where partcol1 = 1) t"), - Row(2)) - } - } - } - test("SPARK-16975: Column-partition path starting '_' should be handled correctly") { withTempDir { dir => val dataDir = new File(dir, "data").getCanonicalPath @@ -2966,17 +2930,6 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } } } - - test("SPARK-26709: OptimizeMetadataOnlyQuery does not handle empty records correctly") { - withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "false") { - withTable("t") { - sql("CREATE TABLE t (col1 INT, p1 INT) USING PARQUET PARTITIONED BY (p1)") - sql("INSERT INTO TABLE t PARTITION (p1 = 5) SELECT ID FROM range(1, 1)") - checkAnswer(sql("SELECT MAX(p1) FROM t"), Row(null)) - checkAnswer(sql("SELECT MAX(col1) FROM t"), Row(null)) - } - } - } } case class Foo(bar: Option[String]) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala deleted file mode 100644 index f37802309be97..0000000000000 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.execution - -import java.io.File - -import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.plans.logical.LocalRelation -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.internal.SQLConf.OPTIMIZER_METADATA_ONLY -import org.apache.spark.sql.test.SharedSQLContext - -class OptimizeMetadataOnlyQuerySuite extends QueryTest with SharedSQLContext { - import testImplicits._ - - override def beforeAll(): Unit = { - super.beforeAll() - val data = (1 to 10).map(i => (i, s"data-$i", i % 2, if ((i % 2) == 0) "even" else "odd")) - .toDF("col1", "col2", "partcol1", "partcol2") - data.write.partitionBy("partcol1", "partcol2").mode("append").saveAsTable("srcpart") - } - - override protected def afterAll(): Unit = { - try { - sql("DROP TABLE IF EXISTS srcpart") - } finally { - super.afterAll() - } - } - - private def assertMetadataOnlyQuery(df: DataFrame): Unit = { - val localRelations = df.queryExecution.optimizedPlan.collect { - case l @ LocalRelation(_, _, _) => l - } - assert(localRelations.size == 1) - } - - private def assertNotMetadataOnlyQuery(df: DataFrame): Unit = { - val localRelations = df.queryExecution.optimizedPlan.collect { - case l @ LocalRelation(_, _, _) => l - } - assert(localRelations.size == 0) - } - - private def testMetadataOnly(name: String, sqls: String*): Unit = { - ignore(name) { - withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { - sqls.foreach { case q => assertMetadataOnlyQuery(sql(q)) } - } - withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "false") { - sqls.foreach { case q => assertNotMetadataOnlyQuery(sql(q)) } - } - } - } - - private def testNotMetadataOnly(name: String, sqls: String*): Unit = { - ignore(name) { - withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { - sqls.foreach { case q => assertNotMetadataOnlyQuery(sql(q)) } - } - withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "false") { - sqls.foreach { case q => assertNotMetadataOnlyQuery(sql(q)) } - } - } - } - - testMetadataOnly( - "Aggregate expression is partition columns", - "select partcol1 from srcpart group by partcol1", - "select partcol2 from srcpart where partcol1 = 0 group by partcol2") - - testMetadataOnly( - "Distinct aggregate function on partition columns", - "SELECT partcol1, count(distinct partcol2) FROM srcpart group by partcol1", - "SELECT partcol1, count(distinct partcol2) FROM srcpart where partcol1 = 0 group by partcol1") - - testMetadataOnly( - "Distinct on partition columns", - "select distinct partcol1, partcol2 from srcpart", - "select distinct c1 from (select partcol1 + 1 as c1 from srcpart where partcol1 = 0) t") - - testMetadataOnly( - "Aggregate function on partition columns which have same result w or w/o DISTINCT keyword", - "select max(partcol1) from srcpart", - "select min(partcol1) from srcpart where partcol1 = 0", - "select first(partcol1) from srcpart", - "select last(partcol1) from srcpart where partcol1 = 0", - "select partcol2, min(partcol1) from srcpart where partcol1 = 0 group by partcol2", - "select max(c1) from (select partcol1 + 1 as c1 from srcpart where partcol1 = 0) t") - - testNotMetadataOnly( - "Don't optimize metadata only query for non-partition columns", - "select col1 from srcpart group by col1", - "select partcol1, max(col1) from srcpart group by partcol1", - "select partcol1, count(distinct col1) from srcpart group by partcol1", - "select distinct partcol1, 
col1 from srcpart") - - testNotMetadataOnly( - "Don't optimize metadata only query for non-distinct aggregate function on partition columns", - "select partcol1, sum(partcol2) from srcpart group by partcol1", - "select partcol1, count(partcol2) from srcpart group by partcol1") - - testNotMetadataOnly( - "Don't optimize metadata only query for GroupingSet/Union operator", - "select partcol1, max(partcol2) from srcpart where partcol1 = 0 group by rollup (partcol1)", - "select partcol2 from (select partcol2 from srcpart where partcol1 = 0 union all " + - "select partcol2 from srcpart where partcol1 = 1) t group by partcol2") - - test("SPARK-21884 Fix StackOverflowError on MetadataOnlyQuery") { - withTable("t_1000") { - sql("CREATE TABLE t_1000 (a INT, p INT) USING PARQUET PARTITIONED BY (p)") - (1 to 1000).foreach(p => sql(s"ALTER TABLE t_1000 ADD PARTITION (p=$p)")) - sql("SELECT COUNT(DISTINCT p) FROM t_1000").collect() - } - } - - test("Incorrect result caused by the rule OptimizeMetadataOnlyQuery") { - withSQLConf(OPTIMIZER_METADATA_ONLY.key -> "true") { - withTempPath { path => - val tablePath = new File(s"${path.getCanonicalPath}/cOl3=c/cOl1=a/cOl5=e") - Seq(("a", "b", "c", "d", "e")).toDF("cOl1", "cOl2", "cOl3", "cOl4", "cOl5") - .write.json(tablePath.getCanonicalPath) - - val df = spark.read.json(path.getCanonicalPath).select("CoL1", "CoL5", "CoL3").distinct() - checkAnswer(df, Row("a", "e", "c")) - - val localRelation = df.queryExecution.optimizedPlan.collectFirst { - case l: LocalRelation => l - } - assert(localRelation.nonEmpty, "expect to see a LocalRelation") - assert(localRelation.get.output.map(_.name) == Seq("cOl3", "cOl1", "cOl5")) - } - } - } -} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/OptimizeHiveMetadataOnlyQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/OptimizeHiveMetadataOnlyQuerySuite.scala deleted file mode 100644 index 1e525c46a9cfb..0000000000000 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/OptimizeHiveMetadataOnlyQuerySuite.scala +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.sql.hive - -import org.scalatest.BeforeAndAfter - -import org.apache.spark.metrics.source.HiveCatalogMetrics -import org.apache.spark.sql.QueryTest -import org.apache.spark.sql.catalyst.expressions.NamedExpression -import org.apache.spark.sql.catalyst.plans.logical.{Distinct, Filter, Project, SubqueryAlias} -import org.apache.spark.sql.hive.test.TestHiveSingleton -import org.apache.spark.sql.internal.SQLConf.OPTIMIZER_METADATA_ONLY -import org.apache.spark.sql.test.SQLTestUtils -import org.apache.spark.sql.types.{IntegerType, StructField, StructType} - -class OptimizeHiveMetadataOnlyQuerySuite extends QueryTest with TestHiveSingleton - with BeforeAndAfter with SQLTestUtils { - - import spark.implicits._ - - override def beforeAll(): Unit = { - super.beforeAll() - sql("CREATE TABLE metadata_only (id bigint, data string) PARTITIONED BY (part int)") - (0 to 10).foreach(p => sql(s"ALTER TABLE metadata_only ADD PARTITION (part=$p)")) - } - - override protected def afterAll(): Unit = { - try { - sql("DROP TABLE IF EXISTS metadata_only") - } finally { - super.afterAll() - } - } - - test("SPARK-23877: validate metadata-only query pushes filters to metastore") { - withSQLConf(OPTIMIZER_METADATA_ONLY.key -> "true") { - val startCount = HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount - - // verify the number of matching partitions - assert(sql("SELECT DISTINCT part FROM metadata_only WHERE part < 5").collect().length === 5) - - // verify that the partition predicate was pushed down to the metastore - assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount - startCount === 5) - } - } - - test("SPARK-23877: filter on projected expression") { - withSQLConf(OPTIMIZER_METADATA_ONLY.key -> "true") { - val startCount = HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount - - // verify the matching partitions - val partitions = spark.internalCreateDataFrame(Distinct(Filter(($"x" < 5).expr, - Project(Seq(($"part" + 1).as("x").expr.asInstanceOf[NamedExpression]), - spark.table("metadata_only").logicalPlan.asInstanceOf[SubqueryAlias].child))) - .queryExecution.toRdd, StructType(Seq(StructField("x", IntegerType)))) - - checkAnswer(partitions, Seq(1, 2, 3, 4).toDF("x")) - - // verify that the partition predicate was not pushed down to the metastore - assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount - startCount == 11) - } - } -} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index df4d536a9c9c2..01c40d1541734 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -86,17 +86,6 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { assert(message.contains("Table or view not found")) } - test("SPARK-26709: OptimizeMetadataOnlyQuery does not handle empty records correctly") { - withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "false") { - withTable("t") { - sql("CREATE TABLE t (col1 INT, p1 INT) USING PARQUET PARTITIONED BY (p1)") - sql("INSERT INTO TABLE t PARTITION (p1 = 5) SELECT ID FROM range(1, 1)") - checkAnswer(sql("SELECT MAX(p1) FROM t"), Row(null)) - checkAnswer(sql("SELECT MAX(col1) FROM t"), Row(null)) - } - } - } - test("script") { assume(TestUtils.testCommandAvailable("/bin/bash")) assume(TestUtils.testCommandAvailable("echo | sed")) @@ -1781,95 +1770,6 @@ class 
SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } } - ignore("SPARK-15752 optimize metadata only query for hive table") { - withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { - withTable("data_15752", "srcpart_15752", "srctext_15752") { - val df = Seq((1, "2"), (3, "4")).toDF("key", "value") - df.createOrReplaceTempView("data_15752") - sql( - """ - |CREATE TABLE srcpart_15752 (col1 INT, col2 STRING) - |PARTITIONED BY (partcol1 INT, partcol2 STRING) STORED AS parquet - """.stripMargin) - for (partcol1 <- Seq(0, 1); partcol2 <- Seq("a", "b")) { - sql( - s""" - |INSERT OVERWRITE TABLE srcpart_15752 - |PARTITION (partcol1='$partcol1', partcol2='$partcol2') - |select key, value from data_15752 - """.stripMargin) - } - checkAnswer( - sql("select partcol1 from srcpart_15752 group by partcol1"), - Row(0) :: Row(1) :: Nil) - checkAnswer( - sql("select partcol1 from srcpart_15752 where partcol1 = 1 group by partcol1"), - Row(1)) - checkAnswer( - sql("select partcol1, count(distinct partcol2) from srcpart_15752 group by partcol1"), - Row(0, 2) :: Row(1, 2) :: Nil) - checkAnswer( - sql("select partcol1, count(distinct partcol2) from srcpart_15752 where partcol1 = 1 " + - "group by partcol1"), - Row(1, 2) :: Nil) - checkAnswer(sql("select distinct partcol1 from srcpart_15752"), Row(0) :: Row(1) :: Nil) - checkAnswer(sql("select distinct partcol1 from srcpart_15752 where partcol1 = 1"), Row(1)) - checkAnswer( - sql("select distinct col from (select partcol1 + 1 as col from srcpart_15752 " + - "where partcol1 = 1) t"), - Row(2)) - checkAnswer(sql("select distinct partcol1 from srcpart_15752 where partcol1 = 1"), Row(1)) - checkAnswer(sql("select max(partcol1) from srcpart_15752"), Row(1)) - checkAnswer(sql("select max(partcol1) from srcpart_15752 where partcol1 = 1"), Row(1)) - checkAnswer(sql("select max(partcol1) from (select partcol1 from srcpart_15752) t"), Row(1)) - checkAnswer( - sql("select max(col) from (select partcol1 + 1 as col from srcpart_15752 " + - "where partcol1 = 1) t"), - Row(2)) - - sql( - """ - |CREATE TABLE srctext_15752 (col1 INT, col2 STRING) - |PARTITIONED BY (partcol1 INT, partcol2 STRING) STORED AS textfile - """.stripMargin) - for (partcol1 <- Seq(0, 1); partcol2 <- Seq("a", "b")) { - sql( - s""" - |INSERT OVERWRITE TABLE srctext_15752 - |PARTITION (partcol1='$partcol1', partcol2='$partcol2') - |select key, value from data_15752 - """.stripMargin) - } - checkAnswer( - sql("select partcol1 from srctext_15752 group by partcol1"), - Row(0) :: Row(1) :: Nil) - checkAnswer( - sql("select partcol1 from srctext_15752 where partcol1 = 1 group by partcol1"), - Row(1)) - checkAnswer( - sql("select partcol1, count(distinct partcol2) from srctext_15752 group by partcol1"), - Row(0, 2) :: Row(1, 2) :: Nil) - checkAnswer( - sql("select partcol1, count(distinct partcol2) from srctext_15752 where partcol1 = 1 " + - "group by partcol1"), - Row(1, 2) :: Nil) - checkAnswer(sql("select distinct partcol1 from srctext_15752"), Row(0) :: Row(1) :: Nil) - checkAnswer(sql("select distinct partcol1 from srctext_15752 where partcol1 = 1"), Row(1)) - checkAnswer( - sql("select distinct col from (select partcol1 + 1 as col from srctext_15752 " + - "where partcol1 = 1) t"), - Row(2)) - checkAnswer(sql("select max(partcol1) from srctext_15752"), Row(1)) - checkAnswer(sql("select max(partcol1) from srctext_15752 where partcol1 = 1"), Row(1)) - checkAnswer(sql("select max(partcol1) from (select partcol1 from srctext_15752) t"), Row(1)) - checkAnswer( - sql("select max(col) 
from (select partcol1 + 1 as col from srctext_15752 " + - "where partcol1 = 1) t"), - Row(2)) - } - } - } - test("SPARK-17354: Partitioning by dates/timestamps works with Parquet vectorized reader") { withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true") { sql( From 15cc8726a6454e45e5330bebd7e96d31500c2d6b Mon Sep 17 00:00:00 2001 From: Gengliang Wang Date: Fri, 25 Jan 2019 01:34:21 +0800 Subject: [PATCH 3/7] Revert "revert the whole rule" This reverts commit 1c81586d7c7b3a7a9785719c486a6d857265eb89. --- .../apache/spark/sql/internal/SQLConf.scala | 11 ++ .../execution/OptimizeMetadataOnlyQuery.scala | 173 ++++++++++++++++++ .../spark/sql/execution/SparkOptimizer.scala | 1 + .../org/apache/spark/sql/SQLQuerySuite.scala | 47 +++++ .../OptimizeMetadataOnlyQuerySuite.scala | 150 +++++++++++++++ .../OptimizeHiveMetadataOnlyQuerySuite.scala | 78 ++++++++ .../sql/hive/execution/SQLQuerySuite.scala | 100 ++++++++++ 7 files changed, 560 insertions(+) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/OptimizeHiveMetadataOnlyQuerySuite.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 516193bdeaaa8..f6113447a1484 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -581,6 +581,15 @@ object SQLConf { .checkValues(HiveCaseSensitiveInferenceMode.values.map(_.toString)) .createWithDefault(HiveCaseSensitiveInferenceMode.INFER_AND_SAVE.toString) + val OPTIMIZER_METADATA_ONLY = buildConf("spark.sql.optimizer.metadataOnly") + .doc("When true, enable the metadata-only query optimization that use the table's metadata " + + "to produce the partition columns instead of table scans. It applies when all the columns " + + "scanned are partition columns and the query has an aggregate operator that satisfies " + + "distinct semantics. By default the optimization is disabled, since it may return " + + "incorrect results with empty tables.") + .booleanConf + .createWithDefault(false) + val COLUMN_NAME_OF_CORRUPT_RECORD = buildConf("spark.sql.columnNameOfCorruptRecord") .doc("The name of internal column for storing raw/un-parsed JSON and CSV records that fail " + "to parse.") @@ -1775,6 +1784,8 @@ class SQLConf extends Serializable with Logging { def gatherFastStats: Boolean = getConf(GATHER_FASTSTAT) + def optimizerMetadataOnly: Boolean = getConf(OPTIMIZER_METADATA_ONLY) + def wholeStageEnabled: Boolean = getConf(WHOLESTAGE_CODEGEN_ENABLED) def wholeStageUseIdInClassName: Boolean = getConf(WHOLESTAGE_CODEGEN_USE_ID_IN_CLASS_NAME) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala new file mode 100644 index 0000000000000..3ca03ab2939aa --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala @@ -0,0 +1,173 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution + +import java.util.Locale + +import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.catalog.{HiveTableRelation, SessionCatalog} +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.aggregate._ +import org.apache.spark.sql.catalyst.planning.PhysicalOperation +import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils} +import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} +import org.apache.spark.sql.internal.SQLConf + +/** + * This rule optimizes the execution of queries that can be answered by looking only at + * partition-level metadata. This applies when all the columns scanned are partition columns, and + * the query has an aggregate operator that satisfies the following conditions: + * 1. aggregate expression is partition columns. + * e.g. SELECT col FROM tbl GROUP BY col. + * 2. aggregate function on partition columns with DISTINCT. + * e.g. SELECT col1, count(DISTINCT col2) FROM tbl GROUP BY col1. + * 3. aggregate function on partition columns which have same result w or w/o DISTINCT keyword. + * e.g. SELECT col1, Max(col2) FROM tbl GROUP BY col1. + */ +case class OptimizeMetadataOnlyQuery(catalog: SessionCatalog) extends Rule[LogicalPlan] { + + def apply(plan: LogicalPlan): LogicalPlan = { + if (!SQLConf.get.optimizerMetadataOnly) { + return plan + } + + plan.transform { + case a @ Aggregate(_, aggExprs, child @ PhysicalOperation( + projectList, filters, PartitionedRelation(partAttrs, rel))) => + // We only apply this optimization when only partitioned attributes are scanned. + if (AttributeSet((projectList ++ filters).flatMap(_.references)).subsetOf(partAttrs)) { + // The project list and filters all only refer to partition attributes, which means the + // the Aggregator operator can also only refer to partition attributes, and filters are + // all partition filters. This is a metadata only query we can optimize. + val aggFunctions = aggExprs.flatMap(_.collect { + case agg: AggregateExpression => agg + }) + val isAllDistinctAgg = aggFunctions.forall { agg => + agg.isDistinct || (agg.aggregateFunction match { + // `Max`, `Min`, `First` and `Last` are always distinct aggregate functions no matter + // they have DISTINCT keyword or not, as the result will be same. + case _: Max => true + case _: Min => true + case _: First => true + case _: Last => true + case _ => false + }) + } + if (isAllDistinctAgg) { + a.withNewChildren(Seq(replaceTableScanWithPartitionMetadata(child, rel, filters))) + } else { + a + } + } else { + a + } + } + } + + /** + * Returns the partition attributes of the table relation plan. 
+ */ + private def getPartitionAttrs( + partitionColumnNames: Seq[String], + relation: LogicalPlan): Seq[Attribute] = { + val attrMap = relation.output.map(a => a.name.toLowerCase(Locale.ROOT) -> a).toMap + partitionColumnNames.map { colName => + attrMap.getOrElse(colName.toLowerCase(Locale.ROOT), + throw new AnalysisException(s"Unable to find the column `$colName` " + + s"given [${relation.output.map(_.name).mkString(", ")}]") + ) + } + } + + /** + * Transform the given plan, find its table scan nodes that matches the given relation, and then + * replace the table scan node with its corresponding partition values. + */ + private def replaceTableScanWithPartitionMetadata( + child: LogicalPlan, + relation: LogicalPlan, + partFilters: Seq[Expression]): LogicalPlan = { + // this logic comes from PruneFileSourcePartitions. it ensures that the filter names match the + // relation's schema. PartitionedRelation ensures that the filters only reference partition cols + val normalizedFilters = partFilters.map { e => + e transform { + case a: AttributeReference => + a.withName(relation.output.find(_.semanticEquals(a)).get.name) + } + } + + child transform { + case plan if plan eq relation => + relation match { + case l @ LogicalRelation(fsRelation: HadoopFsRelation, _, _, isStreaming) => + val partAttrs = getPartitionAttrs(fsRelation.partitionSchema.map(_.name), l) + val partitionData = fsRelation.location.listFiles(normalizedFilters, Nil) + LocalRelation(partAttrs, partitionData.map(_.values), isStreaming) + + case relation: HiveTableRelation => + val partAttrs = getPartitionAttrs(relation.tableMeta.partitionColumnNames, relation) + val caseInsensitiveProperties = + CaseInsensitiveMap(relation.tableMeta.storage.properties) + val timeZoneId = caseInsensitiveProperties.get(DateTimeUtils.TIMEZONE_OPTION) + .getOrElse(SQLConf.get.sessionLocalTimeZone) + val partitions = if (partFilters.nonEmpty) { + catalog.listPartitionsByFilter(relation.tableMeta.identifier, normalizedFilters) + } else { + catalog.listPartitions(relation.tableMeta.identifier) + } + + val partitionData = partitions.map { p => + InternalRow.fromSeq(partAttrs.map { attr => + Cast(Literal(p.spec(attr.name)), attr.dataType, Option(timeZoneId)).eval() + }) + } + LocalRelation(partAttrs, partitionData) + + case _ => + throw new IllegalStateException(s"unrecognized table scan node: $relation, " + + s"please turn off ${SQLConf.OPTIMIZER_METADATA_ONLY.key} and try again.") + } + } + } + + /** + * A pattern that finds the partitioned table relation node inside the given plan, and returns a + * pair of the partition attributes and the table relation node. 
+ */ + object PartitionedRelation extends PredicateHelper { + + def unapply(plan: LogicalPlan): Option[(AttributeSet, LogicalPlan)] = { + plan match { + case l @ LogicalRelation(fsRelation: HadoopFsRelation, _, _, _) + if fsRelation.partitionSchema.nonEmpty => + val partAttrs = AttributeSet(getPartitionAttrs(fsRelation.partitionSchema.map(_.name), l)) + Some((partAttrs, l)) + + case relation: HiveTableRelation if relation.tableMeta.partitionColumnNames.nonEmpty => + val partAttrs = AttributeSet( + getPartitionAttrs(relation.tableMeta.partitionColumnNames, relation)) + Some((partAttrs, relation)) + + case _ => None + } + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala index 5e5255ec91e8c..6c6d344240cea 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala @@ -30,6 +30,7 @@ class SparkOptimizer( extends Optimizer(catalog) { override def defaultBatches: Seq[Batch] = (preOptimizationBatches ++ super.defaultBatches :+ + Batch("Optimize Metadata Only Query", Once, OptimizeMetadataOnlyQuery(catalog)) :+ Batch("Extract Python UDFs", Once, Seq(ExtractPythonUDFFromAggregate, ExtractPythonUDFs): _*) :+ Batch("Prune File Source Table Partitions", Once, PruneFileSourcePartitions) :+ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 2a33101c23dbe..68cd3a0f3eef3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -2422,6 +2422,42 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { Row(s"$expected") :: Nil) } + ignore("SPARK-15752 optimize metadata only query for datasource table") { + withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { + withTable("srcpart_15752") { + val data = (1 to 10).map(i => (i, s"data-$i", i % 2, if ((i % 2) == 0) "a" else "b")) + .toDF("col1", "col2", "partcol1", "partcol2") + data.write.partitionBy("partcol1", "partcol2").mode("append").saveAsTable("srcpart_15752") + checkAnswer( + sql("select partcol1 from srcpart_15752 group by partcol1"), + Row(0) :: Row(1) :: Nil) + checkAnswer( + sql("select partcol1 from srcpart_15752 where partcol1 = 1 group by partcol1"), + Row(1)) + checkAnswer( + sql("select partcol1, count(distinct partcol2) from srcpart_15752 group by partcol1"), + Row(0, 1) :: Row(1, 1) :: Nil) + checkAnswer( + sql("select partcol1, count(distinct partcol2) from srcpart_15752 where partcol1 = 1 " + + "group by partcol1"), + Row(1, 1) :: Nil) + checkAnswer(sql("select distinct partcol1 from srcpart_15752"), Row(0) :: Row(1) :: Nil) + checkAnswer(sql("select distinct partcol1 from srcpart_15752 where partcol1 = 1"), Row(1)) + checkAnswer( + sql("select distinct col from (select partcol1 + 1 as col from srcpart_15752 " + + "where partcol1 = 1) t"), + Row(2)) + checkAnswer(sql("select max(partcol1) from srcpart_15752"), Row(1)) + checkAnswer(sql("select max(partcol1) from srcpart_15752 where partcol1 = 1"), Row(1)) + checkAnswer(sql("select max(partcol1) from (select partcol1 from srcpart_15752) t"), Row(1)) + checkAnswer( + sql("select max(col) from (select partcol1 + 1 as col from srcpart_15752 " + + "where partcol1 = 1) t"), + Row(2)) + } + } + } + test("SPARK-16975: Column-partition path starting '_' should 
be handled correctly") { withTempDir { dir => val dataDir = new File(dir, "data").getCanonicalPath @@ -2930,6 +2966,17 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } } } + + test("SPARK-26709: OptimizeMetadataOnlyQuery does not handle empty records correctly") { + withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "false") { + withTable("t") { + sql("CREATE TABLE t (col1 INT, p1 INT) USING PARQUET PARTITIONED BY (p1)") + sql("INSERT INTO TABLE t PARTITION (p1 = 5) SELECT ID FROM range(1, 1)") + checkAnswer(sql("SELECT MAX(p1) FROM t"), Row(null)) + checkAnswer(sql("SELECT MAX(col1) FROM t"), Row(null)) + } + } + } } case class Foo(bar: Option[String]) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala new file mode 100644 index 0000000000000..f37802309be97 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution + +import java.io.File + +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.plans.logical.LocalRelation +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.SQLConf.OPTIMIZER_METADATA_ONLY +import org.apache.spark.sql.test.SharedSQLContext + +class OptimizeMetadataOnlyQuerySuite extends QueryTest with SharedSQLContext { + import testImplicits._ + + override def beforeAll(): Unit = { + super.beforeAll() + val data = (1 to 10).map(i => (i, s"data-$i", i % 2, if ((i % 2) == 0) "even" else "odd")) + .toDF("col1", "col2", "partcol1", "partcol2") + data.write.partitionBy("partcol1", "partcol2").mode("append").saveAsTable("srcpart") + } + + override protected def afterAll(): Unit = { + try { + sql("DROP TABLE IF EXISTS srcpart") + } finally { + super.afterAll() + } + } + + private def assertMetadataOnlyQuery(df: DataFrame): Unit = { + val localRelations = df.queryExecution.optimizedPlan.collect { + case l @ LocalRelation(_, _, _) => l + } + assert(localRelations.size == 1) + } + + private def assertNotMetadataOnlyQuery(df: DataFrame): Unit = { + val localRelations = df.queryExecution.optimizedPlan.collect { + case l @ LocalRelation(_, _, _) => l + } + assert(localRelations.size == 0) + } + + private def testMetadataOnly(name: String, sqls: String*): Unit = { + ignore(name) { + withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { + sqls.foreach { case q => assertMetadataOnlyQuery(sql(q)) } + } + withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "false") { + sqls.foreach { case q => assertNotMetadataOnlyQuery(sql(q)) } + } + } + } + + private def testNotMetadataOnly(name: String, sqls: String*): Unit = { + ignore(name) { + withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { + sqls.foreach { case q => assertNotMetadataOnlyQuery(sql(q)) } + } + withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "false") { + sqls.foreach { case q => assertNotMetadataOnlyQuery(sql(q)) } + } + } + } + + testMetadataOnly( + "Aggregate expression is partition columns", + "select partcol1 from srcpart group by partcol1", + "select partcol2 from srcpart where partcol1 = 0 group by partcol2") + + testMetadataOnly( + "Distinct aggregate function on partition columns", + "SELECT partcol1, count(distinct partcol2) FROM srcpart group by partcol1", + "SELECT partcol1, count(distinct partcol2) FROM srcpart where partcol1 = 0 group by partcol1") + + testMetadataOnly( + "Distinct on partition columns", + "select distinct partcol1, partcol2 from srcpart", + "select distinct c1 from (select partcol1 + 1 as c1 from srcpart where partcol1 = 0) t") + + testMetadataOnly( + "Aggregate function on partition columns which have same result w or w/o DISTINCT keyword", + "select max(partcol1) from srcpart", + "select min(partcol1) from srcpart where partcol1 = 0", + "select first(partcol1) from srcpart", + "select last(partcol1) from srcpart where partcol1 = 0", + "select partcol2, min(partcol1) from srcpart where partcol1 = 0 group by partcol2", + "select max(c1) from (select partcol1 + 1 as c1 from srcpart where partcol1 = 0) t") + + testNotMetadataOnly( + "Don't optimize metadata only query for non-partition columns", + "select col1 from srcpart group by col1", + "select partcol1, max(col1) from srcpart group by partcol1", + "select partcol1, count(distinct col1) from srcpart group by partcol1", + "select distinct partcol1, col1 from srcpart") + + testNotMetadataOnly( + "Don't optimize metadata only query for non-distinct 
aggregate function on partition columns", + "select partcol1, sum(partcol2) from srcpart group by partcol1", + "select partcol1, count(partcol2) from srcpart group by partcol1") + + testNotMetadataOnly( + "Don't optimize metadata only query for GroupingSet/Union operator", + "select partcol1, max(partcol2) from srcpart where partcol1 = 0 group by rollup (partcol1)", + "select partcol2 from (select partcol2 from srcpart where partcol1 = 0 union all " + + "select partcol2 from srcpart where partcol1 = 1) t group by partcol2") + + test("SPARK-21884 Fix StackOverflowError on MetadataOnlyQuery") { + withTable("t_1000") { + sql("CREATE TABLE t_1000 (a INT, p INT) USING PARQUET PARTITIONED BY (p)") + (1 to 1000).foreach(p => sql(s"ALTER TABLE t_1000 ADD PARTITION (p=$p)")) + sql("SELECT COUNT(DISTINCT p) FROM t_1000").collect() + } + } + + test("Incorrect result caused by the rule OptimizeMetadataOnlyQuery") { + withSQLConf(OPTIMIZER_METADATA_ONLY.key -> "true") { + withTempPath { path => + val tablePath = new File(s"${path.getCanonicalPath}/cOl3=c/cOl1=a/cOl5=e") + Seq(("a", "b", "c", "d", "e")).toDF("cOl1", "cOl2", "cOl3", "cOl4", "cOl5") + .write.json(tablePath.getCanonicalPath) + + val df = spark.read.json(path.getCanonicalPath).select("CoL1", "CoL5", "CoL3").distinct() + checkAnswer(df, Row("a", "e", "c")) + + val localRelation = df.queryExecution.optimizedPlan.collectFirst { + case l: LocalRelation => l + } + assert(localRelation.nonEmpty, "expect to see a LocalRelation") + assert(localRelation.get.output.map(_.name) == Seq("cOl3", "cOl1", "cOl5")) + } + } + } +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/OptimizeHiveMetadataOnlyQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/OptimizeHiveMetadataOnlyQuerySuite.scala new file mode 100644 index 0000000000000..1e525c46a9cfb --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/OptimizeHiveMetadataOnlyQuerySuite.scala @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hive + +import org.scalatest.BeforeAndAfter + +import org.apache.spark.metrics.source.HiveCatalogMetrics +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.catalyst.expressions.NamedExpression +import org.apache.spark.sql.catalyst.plans.logical.{Distinct, Filter, Project, SubqueryAlias} +import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.sql.internal.SQLConf.OPTIMIZER_METADATA_ONLY +import org.apache.spark.sql.test.SQLTestUtils +import org.apache.spark.sql.types.{IntegerType, StructField, StructType} + +class OptimizeHiveMetadataOnlyQuerySuite extends QueryTest with TestHiveSingleton + with BeforeAndAfter with SQLTestUtils { + + import spark.implicits._ + + override def beforeAll(): Unit = { + super.beforeAll() + sql("CREATE TABLE metadata_only (id bigint, data string) PARTITIONED BY (part int)") + (0 to 10).foreach(p => sql(s"ALTER TABLE metadata_only ADD PARTITION (part=$p)")) + } + + override protected def afterAll(): Unit = { + try { + sql("DROP TABLE IF EXISTS metadata_only") + } finally { + super.afterAll() + } + } + + test("SPARK-23877: validate metadata-only query pushes filters to metastore") { + withSQLConf(OPTIMIZER_METADATA_ONLY.key -> "true") { + val startCount = HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount + + // verify the number of matching partitions + assert(sql("SELECT DISTINCT part FROM metadata_only WHERE part < 5").collect().length === 5) + + // verify that the partition predicate was pushed down to the metastore + assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount - startCount === 5) + } + } + + test("SPARK-23877: filter on projected expression") { + withSQLConf(OPTIMIZER_METADATA_ONLY.key -> "true") { + val startCount = HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount + + // verify the matching partitions + val partitions = spark.internalCreateDataFrame(Distinct(Filter(($"x" < 5).expr, + Project(Seq(($"part" + 1).as("x").expr.asInstanceOf[NamedExpression]), + spark.table("metadata_only").logicalPlan.asInstanceOf[SubqueryAlias].child))) + .queryExecution.toRdd, StructType(Seq(StructField("x", IntegerType)))) + + checkAnswer(partitions, Seq(1, 2, 3, 4).toDF("x")) + + // verify that the partition predicate was not pushed down to the metastore + assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount - startCount == 11) + } + } +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 01c40d1541734..df4d536a9c9c2 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -86,6 +86,17 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { assert(message.contains("Table or view not found")) } + test("SPARK-26709: OptimizeMetadataOnlyQuery does not handle empty records correctly") { + withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "false") { + withTable("t") { + sql("CREATE TABLE t (col1 INT, p1 INT) USING PARQUET PARTITIONED BY (p1)") + sql("INSERT INTO TABLE t PARTITION (p1 = 5) SELECT ID FROM range(1, 1)") + checkAnswer(sql("SELECT MAX(p1) FROM t"), Row(null)) + checkAnswer(sql("SELECT MAX(col1) FROM t"), Row(null)) + } + } + } + test("script") { assume(TestUtils.testCommandAvailable("/bin/bash")) assume(TestUtils.testCommandAvailable("echo | sed")) @@ -1770,6 +1781,95 @@ class 
SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } } + ignore("SPARK-15752 optimize metadata only query for hive table") { + withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { + withTable("data_15752", "srcpart_15752", "srctext_15752") { + val df = Seq((1, "2"), (3, "4")).toDF("key", "value") + df.createOrReplaceTempView("data_15752") + sql( + """ + |CREATE TABLE srcpart_15752 (col1 INT, col2 STRING) + |PARTITIONED BY (partcol1 INT, partcol2 STRING) STORED AS parquet + """.stripMargin) + for (partcol1 <- Seq(0, 1); partcol2 <- Seq("a", "b")) { + sql( + s""" + |INSERT OVERWRITE TABLE srcpart_15752 + |PARTITION (partcol1='$partcol1', partcol2='$partcol2') + |select key, value from data_15752 + """.stripMargin) + } + checkAnswer( + sql("select partcol1 from srcpart_15752 group by partcol1"), + Row(0) :: Row(1) :: Nil) + checkAnswer( + sql("select partcol1 from srcpart_15752 where partcol1 = 1 group by partcol1"), + Row(1)) + checkAnswer( + sql("select partcol1, count(distinct partcol2) from srcpart_15752 group by partcol1"), + Row(0, 2) :: Row(1, 2) :: Nil) + checkAnswer( + sql("select partcol1, count(distinct partcol2) from srcpart_15752 where partcol1 = 1 " + + "group by partcol1"), + Row(1, 2) :: Nil) + checkAnswer(sql("select distinct partcol1 from srcpart_15752"), Row(0) :: Row(1) :: Nil) + checkAnswer(sql("select distinct partcol1 from srcpart_15752 where partcol1 = 1"), Row(1)) + checkAnswer( + sql("select distinct col from (select partcol1 + 1 as col from srcpart_15752 " + + "where partcol1 = 1) t"), + Row(2)) + checkAnswer(sql("select distinct partcol1 from srcpart_15752 where partcol1 = 1"), Row(1)) + checkAnswer(sql("select max(partcol1) from srcpart_15752"), Row(1)) + checkAnswer(sql("select max(partcol1) from srcpart_15752 where partcol1 = 1"), Row(1)) + checkAnswer(sql("select max(partcol1) from (select partcol1 from srcpart_15752) t"), Row(1)) + checkAnswer( + sql("select max(col) from (select partcol1 + 1 as col from srcpart_15752 " + + "where partcol1 = 1) t"), + Row(2)) + + sql( + """ + |CREATE TABLE srctext_15752 (col1 INT, col2 STRING) + |PARTITIONED BY (partcol1 INT, partcol2 STRING) STORED AS textfile + """.stripMargin) + for (partcol1 <- Seq(0, 1); partcol2 <- Seq("a", "b")) { + sql( + s""" + |INSERT OVERWRITE TABLE srctext_15752 + |PARTITION (partcol1='$partcol1', partcol2='$partcol2') + |select key, value from data_15752 + """.stripMargin) + } + checkAnswer( + sql("select partcol1 from srctext_15752 group by partcol1"), + Row(0) :: Row(1) :: Nil) + checkAnswer( + sql("select partcol1 from srctext_15752 where partcol1 = 1 group by partcol1"), + Row(1)) + checkAnswer( + sql("select partcol1, count(distinct partcol2) from srctext_15752 group by partcol1"), + Row(0, 2) :: Row(1, 2) :: Nil) + checkAnswer( + sql("select partcol1, count(distinct partcol2) from srctext_15752 where partcol1 = 1 " + + "group by partcol1"), + Row(1, 2) :: Nil) + checkAnswer(sql("select distinct partcol1 from srctext_15752"), Row(0) :: Row(1) :: Nil) + checkAnswer(sql("select distinct partcol1 from srctext_15752 where partcol1 = 1"), Row(1)) + checkAnswer( + sql("select distinct col from (select partcol1 + 1 as col from srctext_15752 " + + "where partcol1 = 1) t"), + Row(2)) + checkAnswer(sql("select max(partcol1) from srctext_15752"), Row(1)) + checkAnswer(sql("select max(partcol1) from srctext_15752 where partcol1 = 1"), Row(1)) + checkAnswer(sql("select max(partcol1) from (select partcol1 from srctext_15752) t"), Row(1)) + checkAnswer( + sql("select max(col) 
from (select partcol1 + 1 as col from srctext_15752 " + + "where partcol1 = 1) t"), + Row(2)) + } + } + } + test("SPARK-17354: Partitioning by dates/timestamps works with Parquet vectorized reader") { withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true") { sql( From 8399d773ddcab959daa0d75b8795757f9ca69a88 Mon Sep 17 00:00:00 2001 From: Gengliang Wang Date: Fri, 25 Jan 2019 02:14:11 +0800 Subject: [PATCH 4/7] address comments --- .../apache/spark/sql/internal/SQLConf.scala | 1 + .../execution/OptimizeMetadataOnlyQuery.scala | 4 ++ .../org/apache/spark/sql/SQLQuerySuite.scala | 40 +++++++++++++++---- .../OptimizeMetadataOnlyQuerySuite.scala | 4 +- .../sql/hive/execution/SQLQuerySuite.scala | 31 ++++++++------ 5 files changed, 59 insertions(+), 21 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index f6113447a1484..4e3dfdc714c80 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -582,6 +582,7 @@ object SQLConf { .createWithDefault(HiveCaseSensitiveInferenceMode.INFER_AND_SAVE.toString) val OPTIMIZER_METADATA_ONLY = buildConf("spark.sql.optimizer.metadataOnly") + .internal() .doc("When true, enable the metadata-only query optimization that use the table's metadata " + "to produce the partition columns instead of table scans. It applies when all the columns " + "scanned are partition columns and the query has an aggregate operator that satisfies " + diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala index 3ca03ab2939aa..9bda36124966c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala @@ -72,6 +72,10 @@ case class OptimizeMetadataOnlyQuery(catalog: SessionCatalog) extends Rule[Logic }) } if (isAllDistinctAgg) { + logWarning("Since configuration `spark.sql.optimizer.metadataOnly` is enabled, " + + "Spark will scan partition-level metadata without scanning data files. " + + "This could result in wrong results when with empty partition data." 
+ ) a.withNewChildren(Seq(replaceTableScanWithPartitionMetadata(child, rel, filters))) } else { a diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 68cd3a0f3eef3..b8c4d73f1b2b4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -2422,7 +2422,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { Row(s"$expected") :: Nil) } - ignore("SPARK-15752 optimize metadata only query for datasource table") { + test("SPARK-15752 optimize metadata only query for datasource table") { withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { withTable("srcpart_15752") { val data = (1 to 10).map(i => (i, s"data-$i", i % 2, if ((i % 2) == 0) "a" else "b")) @@ -2968,12 +2968,38 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } test("SPARK-26709: OptimizeMetadataOnlyQuery does not handle empty records correctly") { - withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "false") { - withTable("t") { - sql("CREATE TABLE t (col1 INT, p1 INT) USING PARQUET PARTITIONED BY (p1)") - sql("INSERT INTO TABLE t PARTITION (p1 = 5) SELECT ID FROM range(1, 1)") - checkAnswer(sql("SELECT MAX(p1) FROM t"), Row(null)) - checkAnswer(sql("SELECT MAX(col1) FROM t"), Row(null)) + Seq(true, false).foreach { enableOptimizeMetadataOnlyQuery => + withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> enableOptimizeMetadataOnlyQuery.toString) { + withTable("t") { + sql("CREATE TABLE t (col1 INT, p1 INT) USING PARQUET PARTITIONED BY (p1)") + sql("INSERT INTO TABLE t PARTITION (p1 = 5) SELECT ID FROM range(1, 1)") + if (enableOptimizeMetadataOnlyQuery) { + // The result is wrong if we enable the configuration. + checkAnswer(sql("SELECT MAX(p1) FROM t"), Row(5)) + } else { + checkAnswer(sql("SELECT MAX(p1) FROM t"), Row(null)) + } + checkAnswer(sql("SELECT MAX(col1) FROM t"), Row(null)) + } + + withTempPath { path => + val tabLocation = path.getCanonicalPath + val partLocation1 = tabLocation + "/p=3" + val partLocation2 = tabLocation + "/p=1" + // SPARK-23271 empty RDD when saved should write a metadata only file + val df = spark.emptyDataFrame.select(lit(1).as("col")) + df.write.parquet(partLocation1) + val df2 = spark.range(10).toDF("col") + df2.write.parquet(partLocation2) + val readDF = spark.read.parquet(tabLocation) + if (enableOptimizeMetadataOnlyQuery) { + // The result is wrong if we enable the configuration. 
+ checkAnswer(readDF.selectExpr("max(p)"), Row(3)) + } else { + checkAnswer(readDF.selectExpr("max(p)"), Row(1)) + } + checkAnswer(readDF.selectExpr("max(col)"), Row(9)) + } } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala index f37802309be97..a543eb8351656 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala @@ -58,7 +58,7 @@ class OptimizeMetadataOnlyQuerySuite extends QueryTest with SharedSQLContext { } private def testMetadataOnly(name: String, sqls: String*): Unit = { - ignore(name) { + test(name) { withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { sqls.foreach { case q => assertMetadataOnlyQuery(sql(q)) } } @@ -69,7 +69,7 @@ class OptimizeMetadataOnlyQuerySuite extends QueryTest with SharedSQLContext { } private def testNotMetadataOnly(name: String, sqls: String*): Unit = { - ignore(name) { + test(name) { withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { sqls.foreach { case q => assertNotMetadataOnlyQuery(sql(q)) } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index df4d536a9c9c2..d506edc0cf088 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -86,17 +86,6 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { assert(message.contains("Table or view not found")) } - test("SPARK-26709: OptimizeMetadataOnlyQuery does not handle empty records correctly") { - withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "false") { - withTable("t") { - sql("CREATE TABLE t (col1 INT, p1 INT) USING PARQUET PARTITIONED BY (p1)") - sql("INSERT INTO TABLE t PARTITION (p1 = 5) SELECT ID FROM range(1, 1)") - checkAnswer(sql("SELECT MAX(p1) FROM t"), Row(null)) - checkAnswer(sql("SELECT MAX(col1) FROM t"), Row(null)) - } - } - } - test("script") { assume(TestUtils.testCommandAvailable("/bin/bash")) assume(TestUtils.testCommandAvailable("echo | sed")) @@ -1781,7 +1770,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } } - ignore("SPARK-15752 optimize metadata only query for hive table") { + test("SPARK-15752 optimize metadata only query for hive table") { withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { withTable("data_15752", "srcpart_15752", "srctext_15752") { val df = Seq((1, "2"), (3, "4")).toDF("key", "value") @@ -2341,4 +2330,22 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } } + test("SPARK-26709: OptimizeMetadataOnlyQuery does not handle empty records correctly") { + Seq(true, false).foreach { enableOptimizeMetadataOnlyQuery => + withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> enableOptimizeMetadataOnlyQuery.toString) { + withTable("t") { + sql("CREATE TABLE t (col1 INT, p1 INT) USING PARQUET PARTITIONED BY (p1)") + sql("INSERT INTO TABLE t PARTITION (p1 = 5) SELECT ID FROM range(1, 1)") + if (enableOptimizeMetadataOnlyQuery) { + // The result is wrong if we enable the configuration. 
+ checkAnswer(sql("SELECT MAX(p1) FROM t"), Row(5)) + } else { + checkAnswer(sql("SELECT MAX(p1) FROM t"), Row(null)) + } + checkAnswer(sql("SELECT MAX(col1) FROM t"), Row(null)) + } + } + } + } + } From 1524205866e745de21d9e574f9b8f3883ee51c4a Mon Sep 17 00:00:00 2001 From: Xiao Li Date: Thu, 24 Jan 2019 15:18:47 -0800 Subject: [PATCH 5/7] Update SQLConf.scala --- .../src/main/scala/org/apache/spark/sql/internal/SQLConf.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 4e3dfdc714c80..0f013cea87b14 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -587,7 +587,7 @@ object SQLConf { "to produce the partition columns instead of table scans. It applies when all the columns " + "scanned are partition columns and the query has an aggregate operator that satisfies " + "distinct semantics. By default the optimization is disabled, since it may return " + - "incorrect results with empty tables.") + "incorrect results when the files are empty..") .booleanConf .createWithDefault(false) From f7dac39da786e6a787426233db6c5b56897efd07 Mon Sep 17 00:00:00 2001 From: Xiao Li Date: Thu, 24 Jan 2019 15:19:14 -0800 Subject: [PATCH 6/7] Update SQLConf.scala --- .../src/main/scala/org/apache/spark/sql/internal/SQLConf.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 0f013cea87b14..da595e7a352db 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -587,7 +587,7 @@ object SQLConf { "to produce the partition columns instead of table scans. It applies when all the columns " + "scanned are partition columns and the query has an aggregate operator that satisfies " + "distinct semantics. By default the optimization is disabled, since it may return " + - "incorrect results when the files are empty..") + "incorrect results when the files are empty.") .booleanConf .createWithDefault(false) From 9e99d4bfc1ec15d74b29bb13f9260faf53b38451 Mon Sep 17 00:00:00 2001 From: Xiao Li Date: Thu, 24 Jan 2019 15:20:10 -0800 Subject: [PATCH 7/7] Update OptimizeMetadataOnlyQuery.scala --- .../apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala index 9bda36124966c..45e5f415e8da1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala @@ -74,7 +74,8 @@ case class OptimizeMetadataOnlyQuery(catalog: SessionCatalog) extends Rule[Logic if (isAllDistinctAgg) { logWarning("Since configuration `spark.sql.optimizer.metadataOnly` is enabled, " + "Spark will scan partition-level metadata without scanning data files. " + - "This could result in wrong results when with empty partition data." 
+        "This could result in wrong results when the partition metadata exists but the " +
+        "corresponding data files are empty."
         )
         a.withNewChildren(Seq(replaceTableScanWithPartitionMetadata(child, rel, filters)))
       } else {