Skip to content

Commit f716a47

Browse files
adrian-wang authored and cloud-fan committed
[SPARK-26181][SQL] the hasMinMaxStats method of ColumnStatsMap is not correct
## What changes were proposed in this pull request?

Currently `ColumnStatsMap.hasMinMaxStats` returns the same result as `hasCountStats`, which is incorrect: it should report whether min/max statistics are available for the column.

## How was this patch tested?

Existing tests.

Closes #23152 from adrian-wang/minmaxstats.

Authored-by: Daoyuan Wang <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
(cherry picked from commit 8534d75)
Signed-off-by: Wenchen Fan <[email protected]>
1 parent 91b86b7 commit f716a47

File tree

3 files changed

+52
-3
lines changed

3 files changed

+52
-3
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala

Lines changed: 11 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -670,6 +670,14 @@ case class FilterEstimation(plan: Filter) extends Logging {
670670
logDebug("[CBO] No range comparison statistics for String/Binary type " + attrLeft)
671671
return None
672672
case _ =>
673+
if (!colStatsMap.hasMinMaxStats(attrLeft)) {
674+
logDebug("[CBO] No min/max statistics for " + attrLeft)
675+
return None
676+
}
677+
if (!colStatsMap.hasMinMaxStats(attrRight)) {
678+
logDebug("[CBO] No min/max statistics for " + attrRight)
679+
return None
680+
}
673681
}
674682

675683
val colStatLeft = colStatsMap(attrLeft)
@@ -879,13 +887,13 @@ case class ColumnStatsMap(originalMap: AttributeMap[ColumnStat]) {
879887
}
880888

881889
def hasCountStats(a: Attribute): Boolean =
882-
get(a).map(_.hasCountStats).getOrElse(false)
890+
get(a).exists(_.hasCountStats)
883891

884892
def hasDistinctCount(a: Attribute): Boolean =
885-
get(a).map(_.distinctCount.isDefined).getOrElse(false)
893+
get(a).exists(_.distinctCount.isDefined)
886894

887895
def hasMinMaxStats(a: Attribute): Boolean =
888-
get(a).map(_.hasCountStats).getOrElse(false)
896+
get(a).exists(_.hasMinMaxStats)
889897

890898
/**
891899
* Gets column stat for the given attribute. Prefer the column stat in updatedMap than that in

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala

Lines changed: 27 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,7 @@ import org.apache.spark.sql.catalyst.expressions._
2323
import org.apache.spark.sql.catalyst.expressions.Literal.{FalseLiteral, TrueLiteral}
2424
import org.apache.spark.sql.catalyst.plans.LeftOuter
2525
import org.apache.spark.sql.catalyst.plans.logical._
26+
import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.{ColumnStatsMap, FilterEstimation}
2627
import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.EstimationUtils._
2728
import org.apache.spark.sql.catalyst.util.DateTimeUtils
2829
import org.apache.spark.sql.types._
@@ -821,6 +822,32 @@ class FilterEstimationSuite extends StatsEstimationTestBase {
821822
expectedRowCount = 3)
822823
}
823824

825+
test("ColumnStatsMap tests") {
826+
val attrNoDistinct = AttributeReference("att_without_distinct", IntegerType)()
827+
val attrNoCount = AttributeReference("att_without_count", BooleanType)()
828+
val attrNoMinMax = AttributeReference("att_without_min_max", DateType)()
829+
val colStatNoDistinct = ColumnStat(distinctCount = None, min = Some(1), max = Some(10),
830+
nullCount = Some(0), avgLen = Some(4), maxLen = Some(4))
831+
val colStatNoCount = ColumnStat(distinctCount = Some(2), min = Some(false), max = Some(true),
832+
nullCount = None, avgLen = Some(1), maxLen = Some(1))
833+
val colStatNoMinMax = ColumnStat(distinctCount = Some(1), min = None, max = None,
834+
nullCount = Some(1), avgLen = None, maxLen = None)
835+
val columnStatsMap = ColumnStatsMap(AttributeMap(Seq(
836+
attrNoDistinct -> colStatNoDistinct,
837+
attrNoCount -> colStatNoCount,
838+
attrNoMinMax -> colStatNoMinMax
839+
)))
840+
assert(!columnStatsMap.hasDistinctCount(attrNoDistinct))
841+
assert(columnStatsMap.hasDistinctCount(attrNoCount))
842+
assert(columnStatsMap.hasDistinctCount(attrNoMinMax))
843+
assert(!columnStatsMap.hasCountStats(attrNoDistinct))
844+
assert(!columnStatsMap.hasCountStats(attrNoCount))
845+
assert(columnStatsMap.hasCountStats(attrNoMinMax))
846+
assert(columnStatsMap.hasMinMaxStats(attrNoDistinct))
847+
assert(columnStatsMap.hasMinMaxStats(attrNoCount))
848+
assert(!columnStatsMap.hasMinMaxStats(attrNoMinMax))
849+
}
850+
824851
private def childStatsTestPlan(outList: Seq[Attribute], tableRowCount: BigInt): StatsTestPlan = {
825852
StatsTestPlan(
826853
outputList = outList,

sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2276,4 +2276,18 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
22762276
}
22772277
}
22782278

2279+
2280+
test("SPARK-26181 hasMinMaxStats method of ColumnStatsMap is not correct") {
2281+
withSQLConf(SQLConf.CBO_ENABLED.key -> "true") {
2282+
withTable("all_null") {
2283+
sql("create table all_null (attr1 int, attr2 int)")
2284+
sql("insert into all_null values (null, null)")
2285+
sql("analyze table all_null compute statistics for columns attr1, attr2")
2286+
// check if the stats can be calculated without Cast exception.
2287+
sql("select * from all_null where attr1 < 1").queryExecution.stringWithStats
2288+
sql("select * from all_null where attr1 < attr2").queryExecution.stringWithStats
2289+
}
2290+
}
2291+
}
2292+
22792293
}

0 commit comments

Comments
 (0)