
Commit 6e594b8

Get size info from metastore for MetastoreRelation.
Additionally, remove the size estimate from ParquetRelation, since the Hadoop FileSystem API calls it relies on can be expensive (e.g. S3FileSystem makes a lot of RPCs).
1 parent 01b7a3e commit 6e594b8
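
The gist of the change: instead of walking the table's files through Hadoop's FileSystem API, the size estimate now comes from the table's totalSize parameter in the Hive metastore. A minimal sketch of the two approaches for context (illustrative only, not code from this commit; the helper names sizeFromMetastore and sizeFromFileSystem are made up):

    import org.apache.hadoop.conf.Configuration
    import org.apache.hadoop.fs.Path
    import org.apache.hadoop.hive.ql.metadata.Table

    // Cheap: a single metastore lookup, provided the table's parameters are populated.
    def sizeFromMetastore(hiveQlTable: Table): Option[Long] =
      Option(hiveQlTable.getParameters.get("totalSize"))
        .flatMap(s => scala.util.Try(s.toLong).toOption)

    // Potentially expensive: getContentSummary walks the directory tree, which on
    // S3FileSystem translates into many RPCs.
    def sizeFromFileSystem(path: Path, conf: Configuration): Long =
      path.getFileSystem(conf).getContentSummary(path).getLength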


3 files changed: +11 -62 lines changed


sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala

Lines changed: 0 additions & 11 deletions
@@ -22,11 +22,9 @@ import java.io.IOException
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.fs.permission.FsAction
-import org.apache.hadoop.mapreduce.Job

 import parquet.hadoop.ParquetOutputFormat
 import parquet.hadoop.metadata.CompressionCodecName
-import parquet.hadoop.util.ContextUtil
 import parquet.schema.MessageType

 import org.apache.spark.sql.SQLContext
@@ -53,15 +51,6 @@ private[sql] case class ParquetRelation(

   self: Product =>

-  @transient override lazy val statistics = Statistics(
-    // TODO: investigate getting encoded column statistics in the parquet file?
-    sizeInBytes = {
-      val hdfsPath = new Path(path)
-      val fs = hdfsPath.getFileSystem(conf.getOrElse(ContextUtil.getConfiguration(new Job())))
-      math.max(fs.getContentSummary(hdfsPath).getLength, 1L) // TODO: in bytes or system-dependent?
-    }
-  )
-
   /** Schema derived from ParquetFile */
   def parquetSchema: MessageType =
     ParquetTypesConverter

sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala

Lines changed: 10 additions & 10 deletions
@@ -269,18 +269,18 @@ private[hive] case class MetastoreRelation
       new Partition(hiveQlTable, p)
     }

-  // TODO: are there any stats in hiveQlTable.getSkewedInfo that we can use?
-  @transient override lazy val statistics = new Statistics {
+  @transient override lazy val statistics = Statistics(
     // TODO: check if this estimate is valid for tables after partition pruning.
-    // Size getters adapted from SizeBasedBigTableSelectorForAutoSMJ.java in Hive (version 0.13).
-    override val sizeInBytes: Long =
-      math.max(maybeGetSize(hiveConf, hiveQlTable.getProperty("totalSize"), path), 1L)
-
-    private[this] def maybeGetSize(conf: HiveConf, size: String, path: Path): Long = {
-      val res = try { Some(size.toLong) } catch { case _: Exception => None }
-      res.getOrElse { path.getFileSystem(conf).getContentSummary(path).getLength }
+    sizeInBytes = {
+      // NOTE: kind of hacky, but this should be relatively cheap if parameters for the table are
+      // populated into the metastore. An alternative would be going through Hadoop's FileSystem
+      // API, which can be expensive if a lot of RPCs are involved. Besides `totalSize`, there are
+      // also `numFiles`, `numRows`, `rawDataSize` keys we can look at in the future.
+      val sizeMaybeFromMetastore =
+        Option(hiveQlTable.getParameters.get("totalSize")).map(_.toLong).getOrElse(-1L)
+      math.max(sizeMaybeFromMetastore, 1L)
     }
-  }
+  )

   val tableDesc = new TableDesc(
     Class.forName(hiveQlTable.getSerializationLib).asInstanceOf[Class[Deserializer]],
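
Worth noting (not part of this diff): the new estimate is only meaningful when the metastore actually carries a totalSize parameter for the table; otherwise it falls back to the default of 1 via the math.max above. As a hedged usage sketch, assuming the underlying Hive version supports it, the parameter can be populated without scanning rows:

    // Hypothetical usage, assuming ANALYZE TABLE is available in the Hive version in use;
    // NOSCAN gathers file-level stats such as numFiles and totalSize without reading rows.
    hql("ANALYZE TABLE src COMPUTE STATISTICS NOSCAN")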

sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala

Lines changed: 1 addition & 41 deletions
@@ -23,32 +23,16 @@ import org.apache.spark.sql.QueryTest
 import org.apache.spark.sql.execution.{BroadcastHashJoin, ShuffledHashJoin}
 import org.apache.spark.sql.hive.test.TestHive
 import org.apache.spark.sql.hive.test.TestHive._
-import org.apache.spark.sql.parquet.{ParquetRelation, ParquetTestData}
-import org.apache.spark.util.Utils

 class StatisticsSuite extends QueryTest {

-  test("estimates the size of a test ParquetRelation") {
-    ParquetTestData.writeFile()
-    val testRDD = parquetFile(ParquetTestData.testDir.toString)
-
-    val sizes = testRDD.logicalPlan.collect { case j: ParquetRelation =>
-      (j.statistics.sizeInBytes, j.newInstance.statistics.sizeInBytes)
-    }
-    assert(sizes.size === 1)
-    assert(sizes(0)._1 == sizes(0)._2, "after .newInstance, estimates are different from before")
-    assert(sizes(0)._1 > 1, "1 is the default, indicating the absence of a meaningful estimate")
-
-    Utils.deleteRecursively(ParquetTestData.testDir)
-  }
-
   test("estimates the size of a test MetastoreRelation") {
     val rdd = hql("""SELECT * FROM src""")
     val sizes = rdd.queryExecution.analyzed.collect { case mr: MetastoreRelation =>
       mr.statistics.sizeInBytes
     }
     assert(sizes.size === 1)
-    assert(sizes(0) > 1, "1 is the default, indicating the absence of a meaningful estimate")
+    assert(sizes(0) == 5812, s"expected exact size 5812 for test table 'src', got ${sizes(0)}")
   }

   test("auto converts to broadcast hash join, by size estimate of a relation") {
@@ -95,30 +79,6 @@ class StatisticsSuite extends QueryTest {
     after()
   }

-  /** Tests for ParquetRelation */
-  val parquetQuery =
-    """SELECT a.mystring, b.myint
-      |FROM psrc a
-      |JOIN psrc b
-      |ON a.mylong = 0 AND a.mylong = b.mylong""".stripMargin
-  val parquetAnswer = Seq(("abc", 5))
-  def parquetBefore(): Unit = {
-    ParquetTestData.writeFile()
-    val testRDD = parquetFile(ParquetTestData.testDir.toString)
-    testRDD.registerAsTable("psrc")
-  }
-  def parquetAfter() = {
-    Utils.deleteRecursively(ParquetTestData.testDir)
-    reset()
-  }
-  mkTest(
-    parquetBefore,
-    parquetAfter,
-    parquetQuery,
-    parquetAnswer,
-    implicitly[ClassTag[ParquetRelation]]
-  )
-
   /** Tests for MetastoreRelation */
   val metastoreQuery = """SELECT * FROM src a JOIN src b ON a.key = 238 AND a.key = b.key"""
   val metastoreAnswer = Seq.fill(4)((238, "val_238", 238, "val_238"))
