[SPARK-26181][SQL] the hasMinMaxStats method of ColumnStatsMap is not correct

adrian-wang · cloud-fan · commit f716a4788dbb · 2018-12-03T23:55:04.000+08:00
## What changes were proposed in this pull request? For now the `hasMinMaxStats` will return the same as `hasCountStats`, which is obviously not as expected. ## How was this patch tested? Existing tests. Closes #23152 from adrian-wang/minmaxstats. Authored-by: Daoyuan Wang <me@daoyuan.wang> Signed-off-by: Wenchen Fan <wenchen@databricks.com> (cherry picked from commit 8534d75) Signed-off-by: Wenchen Fan <wenchen@databricks.com>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala
@@ -670,6 +670,14 @@ case class FilterEstimation(plan: Filter) extends Logging {
         logDebug("[CBO] No range comparison statistics for String/Binary type " + attrLeft)
         return None
       case _ =>
+        if (!colStatsMap.hasMinMaxStats(attrLeft)) {
+          logDebug("[CBO] No min/max statistics for " + attrLeft)
+          return None
+        }
+        if (!colStatsMap.hasMinMaxStats(attrRight)) {
+          logDebug("[CBO] No min/max statistics for " + attrRight)
+          return None
+        }
     }
 
     val colStatLeft = colStatsMap(attrLeft)
@@ -879,13 +887,13 @@ case class ColumnStatsMap(originalMap: AttributeMap[ColumnStat]) {
   }
 
   def hasCountStats(a: Attribute): Boolean =
-    get(a).map(_.hasCountStats).getOrElse(false)
+    get(a).exists(_.hasCountStats)
 
   def hasDistinctCount(a: Attribute): Boolean =
-    get(a).map(_.distinctCount.isDefined).getOrElse(false)
+    get(a).exists(_.distinctCount.isDefined)
 
   def hasMinMaxStats(a: Attribute): Boolean =
-    get(a).map(_.hasCountStats).getOrElse(false)
+    get(a).exists(_.hasMinMaxStats)
 
   /**
    * Gets column stat for the given attribute. Prefer the column stat in updatedMap than that in
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala
@@ -23,6 +23,7 @@ import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.Literal.{FalseLiteral, TrueLiteral}
 import org.apache.spark.sql.catalyst.plans.LeftOuter
 import org.apache.spark.sql.catalyst.plans.logical._
+import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.{ColumnStatsMap, FilterEstimation}
 import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.EstimationUtils._
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.types._
@@ -821,6 +822,32 @@ class FilterEstimationSuite extends StatsEstimationTestBase {
       expectedRowCount = 3)
   }
 
+  test("ColumnStatsMap tests") {
+    val attrNoDistinct = AttributeReference("att_without_distinct", IntegerType)()
+    val attrNoCount = AttributeReference("att_without_count", BooleanType)()
+    val attrNoMinMax = AttributeReference("att_without_min_max", DateType)()
+    val colStatNoDistinct = ColumnStat(distinctCount = None, min = Some(1), max = Some(10),
+      nullCount = Some(0), avgLen = Some(4), maxLen = Some(4))
+    val colStatNoCount = ColumnStat(distinctCount = Some(2), min = Some(false), max = Some(true),
+      nullCount = None, avgLen = Some(1), maxLen = Some(1))
+    val colStatNoMinMax = ColumnStat(distinctCount = Some(1), min = None, max = None,
+      nullCount = Some(1), avgLen = None, maxLen = None)
+    val columnStatsMap = ColumnStatsMap(AttributeMap(Seq(
+      attrNoDistinct -> colStatNoDistinct,
+      attrNoCount -> colStatNoCount,
+      attrNoMinMax -> colStatNoMinMax
+    )))
+    assert(!columnStatsMap.hasDistinctCount(attrNoDistinct))
+    assert(columnStatsMap.hasDistinctCount(attrNoCount))
+    assert(columnStatsMap.hasDistinctCount(attrNoMinMax))
+    assert(!columnStatsMap.hasCountStats(attrNoDistinct))
+    assert(!columnStatsMap.hasCountStats(attrNoCount))
+    assert(columnStatsMap.hasCountStats(attrNoMinMax))
+    assert(columnStatsMap.hasMinMaxStats(attrNoDistinct))
+    assert(columnStatsMap.hasMinMaxStats(attrNoCount))
+    assert(!columnStatsMap.hasMinMaxStats(attrNoMinMax))
+  }
+
   private def childStatsTestPlan(outList: Seq[Attribute], tableRowCount: BigInt): StatsTestPlan = {
     StatsTestPlan(
       outputList = outList,
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@@ -2276,4 +2276,18 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
     }
   }
 
+
+  test("SPARK-26181 hasMinMaxStats method of ColumnStatsMap is not correct") {
+    withSQLConf(SQLConf.CBO_ENABLED.key -> "true") {
+      withTable("all_null") {
+        sql("create table all_null (attr1 int, attr2 int)")
+        sql("insert into all_null values (null, null)")
+        sql("analyze table all_null compute statistics for columns attr1, attr2")
+        // check if the stats can be calculated without Cast exception.
+        sql("select * from all_null where attr1 < 1").queryExecution.stringWithStats
+        sql("select * from all_null where attr1 < attr2").queryExecution.stringWithStats
+      }
+    }
+  }
+
 }

Original file line number	Diff line number	Diff line change
`@@ -2276,4 +2276,18 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {`
`2276`	`2276`	`}`
`2277`	`2277`	`}`
`2278`	`2278`
	`2279`	`+`
	`2280`	`+ test("SPARK-26181 hasMinMaxStats method of ColumnStatsMap is not correct") {`
	`2281`	`+ withSQLConf(SQLConf.CBO_ENABLED.key -> "true") {`
	`2282`	`+ withTable("all_null") {`
	`2283`	`+ sql("create table all_null (attr1 int, attr2 int)")`
	`2284`	`+ sql("insert into all_null values (null, null)")`
	`2285`	`+ sql("analyze table all_null compute statistics for columns attr1, attr2")`
	`2286`	`+ // check if the stats can be calculated without Cast exception.`
	`2287`	`+ sql("select * from all_null where attr1 < 1").queryExecution.stringWithStats`
	`2288`	`+ sql("select * from all_null where attr1 < attr2").queryExecution.stringWithStats`
	`2289`	`+ }`
	`2290`	`+ }`
	`2291`	`+ }`
	`2292`	`+`
`2279`	`2293`	`}`