Skip to content

Commit dff440f

Browse files
wangyumcloud-fan
authored andcommitted
[SPARK-22626][SQL] deals with wrong Hive's statistics (zero rowCount)
This pr to ensure that the Hive's statistics `totalSize` (or `rawDataSize`) > 0, `rowCount` also must be > 0. Otherwise may cause OOM when CBO is enabled. unit tests Author: Yuming Wang <[email protected]> Closes #19831 from wangyum/SPARK-22626.
1 parent 2c16267 commit dff440f

File tree

2 files changed

+22
-3
lines changed

2 files changed

+22
-3
lines changed

sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -418,7 +418,7 @@ private[hive] class HiveClientImpl(
418418
// Note that this statistics could be overridden by Spark's statistics if that's available.
419419
val totalSize = properties.get(StatsSetupConst.TOTAL_SIZE).map(BigInt(_))
420420
val rawDataSize = properties.get(StatsSetupConst.RAW_DATA_SIZE).map(BigInt(_))
421-
val rowCount = properties.get(StatsSetupConst.ROW_COUNT).map(BigInt(_)).filter(_ >= 0)
421+
val rowCount = properties.get(StatsSetupConst.ROW_COUNT).map(BigInt(_))
422422
// TODO: check if this estimate is valid for tables after partition pruning.
423423
// NOTE: getting `totalSize` directly from params is kind of hacky, but this should be
424424
// relatively cheap if parameters for the table are populated into the metastore.
@@ -430,9 +430,9 @@ private[hive] class HiveClientImpl(
430430
// so when `totalSize` is zero, use `rawDataSize` instead. When `rawDataSize` is also zero,
431431
// return None. Later, we will use the other ways to estimate the statistics.
432432
if (totalSize.isDefined && totalSize.get > 0L) {
433-
Some(CatalogStatistics(sizeInBytes = totalSize.get, rowCount = rowCount))
433+
Some(CatalogStatistics(sizeInBytes = totalSize.get, rowCount = rowCount.filter(_ > 0)))
434434
} else if (rawDataSize.isDefined && rawDataSize.get > 0) {
435-
Some(CatalogStatistics(sizeInBytes = rawDataSize.get, rowCount = rowCount))
435+
Some(CatalogStatistics(sizeInBytes = rawDataSize.get, rowCount = rowCount.filter(_ > 0)))
436436
} else {
437437
// TODO: still fill the rowCount even if sizeInBytes is empty. Might break anything?
438438
None

sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1360,4 +1360,23 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto
13601360
}
13611361

13621362
}
1363+
1364+
test("Deals with wrong Hive's statistics (zero rowCount)") {
1365+
withTable("maybe_big") {
1366+
sql("CREATE TABLE maybe_big (c1 bigint)" +
1367+
"TBLPROPERTIES ('numRows'='0', 'rawDataSize'='60000000000', 'totalSize'='8000000000000')")
1368+
1369+
val relation = spark.table("maybe_big").queryExecution.analyzed.children.head
1370+
.asInstanceOf[HiveTableRelation]
1371+
1372+
val properties = relation.tableMeta.ignoredProperties
1373+
assert(properties("totalSize").toLong > 0)
1374+
assert(properties("rawDataSize").toLong > 0)
1375+
assert(properties("numRows").toLong == 0)
1376+
1377+
assert(relation.stats.sizeInBytes > 0)
1378+
// May be cause OOM if rowCount == 0 when enables CBO, see SPARK-22626 for details.
1379+
assert(relation.stats.rowCount.isEmpty)
1380+
}
1381+
}
13631382
}

0 commit comments

Comments
 (0)