[SPARK-22626][SQL] deals with wrong Hive's statistics (zero rowCount)

wangyum · cloud-fan · commit dff440f1ecdb · 2017-12-03T23:05:39.000+08:00
This pr to ensure that the Hive's statistics `totalSize` (or `rawDataSize`) > 0, `rowCount` also must be > 0. Otherwise may cause OOM when CBO is enabled. unit tests Author: Yuming Wang <wgyumg@gmail.com> Closes #19831 from wangyum/SPARK-22626.
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
@@ -418,7 +418,7 @@ private[hive] class HiveClientImpl(
       // Note that this statistics could be overridden by Spark's statistics if that's available.
       val totalSize = properties.get(StatsSetupConst.TOTAL_SIZE).map(BigInt(_))
       val rawDataSize = properties.get(StatsSetupConst.RAW_DATA_SIZE).map(BigInt(_))
-      val rowCount = properties.get(StatsSetupConst.ROW_COUNT).map(BigInt(_)).filter(_ >= 0)
+      val rowCount = properties.get(StatsSetupConst.ROW_COUNT).map(BigInt(_))
       // TODO: check if this estimate is valid for tables after partition pruning.
       // NOTE: getting `totalSize` directly from params is kind of hacky, but this should be
       // relatively cheap if parameters for the table are populated into the metastore.
@@ -430,9 +430,9 @@ private[hive] class HiveClientImpl(
         // so when `totalSize` is zero, use `rawDataSize` instead. When `rawDataSize` is also zero,
         // return None. Later, we will use the other ways to estimate the statistics.
         if (totalSize.isDefined && totalSize.get > 0L) {
-          Some(CatalogStatistics(sizeInBytes = totalSize.get, rowCount = rowCount))
+          Some(CatalogStatistics(sizeInBytes = totalSize.get, rowCount = rowCount.filter(_ > 0)))
         } else if (rawDataSize.isDefined && rawDataSize.get > 0) {
-          Some(CatalogStatistics(sizeInBytes = rawDataSize.get, rowCount = rowCount))
+          Some(CatalogStatistics(sizeInBytes = rawDataSize.get, rowCount = rowCount.filter(_ > 0)))
         } else {
           // TODO: still fill the rowCount even if sizeInBytes is empty. Might break anything?
           None
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
@@ -1360,4 +1360,23 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto
     }
 
   }
+
+  test("Deals with wrong Hive's statistics (zero rowCount)") {
+    withTable("maybe_big") {
+      sql("CREATE TABLE maybe_big (c1 bigint)" +
+        "TBLPROPERTIES ('numRows'='0', 'rawDataSize'='60000000000', 'totalSize'='8000000000000')")
+
+      val relation = spark.table("maybe_big").queryExecution.analyzed.children.head
+        .asInstanceOf[HiveTableRelation]
+
+      val properties = relation.tableMeta.ignoredProperties
+      assert(properties("totalSize").toLong > 0)
+      assert(properties("rawDataSize").toLong > 0)
+      assert(properties("numRows").toLong == 0)
+
+      assert(relation.stats.sizeInBytes > 0)
+      // May be cause OOM if rowCount == 0 when enables CBO, see SPARK-22626 for details.
+      assert(relation.stats.rowCount.isEmpty)
+    }
+  }
 }