From 3e91bbe30cfbe1f21d83a69554708ffb12b16116 Mon Sep 17 00:00:00 2001 From: mingbo_pb Date: Thu, 4 Apr 2019 00:25:50 +0800 Subject: [PATCH 1/3] SPARK-27351 Wrong outputRows estimation after AggregateEstimation with only null value column --- .../statsEstimation/AggregateEstimation.scala | 2 +- .../statsEstimation/AggregateEstimationSuite.scala | 12 +++++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala index 0606d0d516bbd..aa400c26b9eff 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala @@ -40,7 +40,7 @@ object AggregateEstimation { // the data contains all combinations of distinct values of group-by columns. var outputRows: BigInt = agg.groupingExpressions.foldLeft(BigInt(1))( (res, expr) => res * - childStats.attributeStats(expr.asInstanceOf[Attribute]).distinctCount.get) + childStats.attributeStats(expr.asInstanceOf[Attribute]).distinctCount.get.max(BigInt(1))) outputRows = if (agg.groupingExpressions.isEmpty) { // If there's no group-by columns, the output is a single row containing values of aggregate diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala index dfa6e467166a5..c247050438129 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala @@ -38,7 +38,9 @@ class AggregateEstimationSuite extends StatsEstimationTestBase with PlanTest { attr("key22") -> ColumnStat(distinctCount = Some(2), min = Some(10), max = Some(20), nullCount = Some(0), avgLen = Some(4), maxLen = Some(4)), attr("key31") -> ColumnStat(distinctCount = Some(0), min = None, max = None, - nullCount = Some(0), avgLen = Some(4), maxLen = Some(4)) + nullCount = Some(0), avgLen = Some(4), maxLen = Some(4)), + attr("key32") -> ColumnStat(distinctCount = Some(0), min = None, max = None, + nullCount = Some(4), avgLen = Some(4), maxLen = Some(4)) )) private val nameToAttr: Map[String, Attribute] = columnInfo.map(kv => kv._1.name -> kv._1) @@ -116,6 +118,14 @@ class AggregateEstimationSuite extends StatsEstimationTestBase with PlanTest { expectedOutputRowCount = 0) } + test("group-by column with only null value") { + checkAggStats( + tableColumns = Seq("key22", "key32"), + tableRowCount = 6, + groupByColumns = Seq("key22", "key32"), + expectedOutputRowCount = nameToColInfo("key22")._2.distinctCount.get) + } + test("non-cbo estimation") { val attributes = Seq("key12").map(nameToAttr) val child = StatsTestPlan( From aa9859a6a288fbd08a77ea65cc77a602000dcfd6 Mon Sep 17 00:00:00 2001 From: pengbo Date: Thu, 11 Apr 2019 22:31:37 +0800 Subject: [PATCH 2/3] check null value count for only null value column --- .../statsEstimation/AggregateEstimation.scala | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala index aa400c26b9eff..ae717c5f39653 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala @@ -39,8 +39,16 @@ object AggregateEstimation { // Multiply distinct counts of group-by columns. This is an upper bound, which assumes // the data contains all combinations of distinct values of group-by columns. var outputRows: BigInt = agg.groupingExpressions.foldLeft(BigInt(1))( - (res, expr) => res * - childStats.attributeStats(expr.asInstanceOf[Attribute]).distinctCount.get.max(BigInt(1))) + (res, expr) => { + val columnStat = childStats.attributeStats(expr.asInstanceOf[Attribute]) + val distinctValue: BigInt = if (columnStat.distinctCount.get == 0 && + columnStat.nullCount.get > 0) { + 1 + } else { + columnStat.distinctCount.get + } + res * distinctValue + }) outputRows = if (agg.groupingExpressions.isEmpty) { // If there's no group-by columns, the output is a single row containing values of aggregate From 6a9d35f0760ebed290655ebe8e26871456524bf5 Mon Sep 17 00:00:00 2001 From: pengbo Date: Sun, 14 Apr 2019 09:37:54 +0800 Subject: [PATCH 3/3] refine code to make columnStat.distinctCount.get called only once --- .../plans/logical/statsEstimation/AggregateEstimation.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala index ae717c5f39653..1198d3fc53cbd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala @@ -41,11 +41,11 @@ object AggregateEstimation { var outputRows: BigInt = agg.groupingExpressions.foldLeft(BigInt(1))( (res, expr) => { val columnStat = childStats.attributeStats(expr.asInstanceOf[Attribute]) - val distinctValue: BigInt = if (columnStat.distinctCount.get == 0 && - columnStat.nullCount.get > 0) { + val distinctCount = columnStat.distinctCount.get + val distinctValue: BigInt = if (distinctCount == 0 && columnStat.nullCount.get > 0) { 1 } else { - columnStat.distinctCount.get + distinctCount } res * distinctValue })