@@ -30,6 +30,7 @@ import org.apache.spark.sql.test.SharedSQLContext
3030import org .apache .spark .sql .test .SQLTestData ._
3131import org .apache .spark .sql .types ._
3232import org .apache .spark .storage .StorageLevel ._
33+ import org .apache .spark .util .Utils
3334
3435class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext {
3536 import testImplicits ._
@@ -480,4 +481,32 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext {
480481 }
481482 }
482483 }
484+
485+ test(" SPARK-22673: InMemoryRelation should utilize existing stats whenever possible" ) {
486+ withSQLConf(" spark.sql.cbo.enabled" -> " true" ) {
487+ // scalastyle:off
488+ val workDir = s " ${Utils .createTempDir()}/table1 "
489+ val data = Seq (100 , 200 , 300 , 400 ).toDF(" count" )
490+ data.write.parquet(workDir)
491+ val dfFromFile = spark.read.parquet(workDir).cache()
492+ val inMemoryRelation = dfFromFile.queryExecution.optimizedPlan.collect {
493+ case plan : InMemoryRelation => plan
494+ }.head
495+ // InMemoryRelation's stats is Long.MaxValue before the underlying RDD is materialized
496+ assert(inMemoryRelation.computeStats().sizeInBytes === Long .MaxValue )
497+ // InMemoryRelation's stats is updated after materializing RDD
498+ dfFromFile.collect()
499+ assert(inMemoryRelation.computeStats().sizeInBytes === 16 )
500+ // test of catalog table
501+ val dfFromTable = spark.catalog.createTable(" table1" , workDir).cache()
502+ val inMemoryRelation2 = dfFromTable.queryExecution.optimizedPlan.
503+ collect { case plan : InMemoryRelation => plan }.head
504+ // Even CBO enabled, InMemoryRelation's stats keeps as the default one before table's stats
505+ // is calculated
506+ assert(inMemoryRelation2.computeStats().sizeInBytes === Long .MaxValue )
507+ // InMemoryRelation's stats should be updated after calculating stats of the table
508+ spark.sql(" ANALYZE TABLE table1 COMPUTE STATISTICS" )
509+ assert(inMemoryRelation2.computeStats().sizeInBytes === 16 )
510+ }
511+ }
483512}
0 commit comments