Skip to content

Commit 0adc99b

Browse files
committed
Add hooks for selecting the set of files for a table scan
1 parent af1e01f commit 0adc99b

File tree

4 files changed

+42
-4
lines changed

4 files changed

+42
-4
lines changed

sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,9 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
150150
@transient
151151
protected[sql] lazy val substitutor = new VariableSubstitution()
152152

153+
@transient
// Optional pluggable hook for selecting the set of files scanned for a (non-partitioned)
// table; None (the default) means the standard path-filter behavior is used.
// Marked @transient so it is not serialized with the context.
protected[sql] var hadoopFileSelector: Option[HadoopFileSelector] = None
155+
153156
/**
154157
* The copy of the hive client that is used for execution. Currently this must always be
155158
* Hive 13 as this is the version of Hive that is packaged with Spark SQL. This copy of the
@@ -514,6 +517,15 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
514517
case _ => super.simpleString
515518
}
516519
}
520+
521+
/**
 * Registers a function that is applied to a table name before it is looked up in the
 * Hive metastore. Delegates to the underlying catalog.
 *
 * @param tableNamePreprocessor function mapping a raw table name to the name to look up
 */
def setTableNamePreprocessor(tableNamePreprocessor: (String) => String): Unit =
  catalog.setTableNamePreprocessor(tableNamePreprocessor)
524+
525+
/**
 * Installs (or clears) the custom file-selection hook used when scanning a table.
 *
 * @param hadoopFileSelector the selector to use, or [[None]] to restore default behavior
 */
def setHadoopFileSelector(hadoopFileSelector: Option[HadoopFileSelector]): Unit =
  this.hadoopFileSelector = hadoopFileSelector
528+
517529
}
518530

519531

sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -217,14 +217,21 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive
217217
client.getTableOption(databaseName, tblName).isDefined
218218
}
219219

220+
// Function applied to a raw table name before the metastore lookup; defaults to the
// identity function (no preprocessing).
private[this] var tableNamePreprocessor: (String) => String = identity

/**
 * Replaces the table-name preprocessing function used by `lookupRelation`.
 *
 * @param newTableNamePreprocessor function mapping a raw table name to the name to look up
 */
def setTableNamePreprocessor(newTableNamePreprocessor: (String) => String): Unit =
  tableNamePreprocessor = newTableNamePreprocessor
225+
220226
def lookupRelation(
221227
tableIdentifier: Seq[String],
222228
alias: Option[String]): LogicalPlan = {
223229
val tableIdent = processTableIdentifier(tableIdentifier)
224230
val databaseName = tableIdent.lift(tableIdent.size - 2).getOrElse(
225231
client.currentDatabase)
226-
val tblName = tableIdent.last
227-
val table = client.getTable(databaseName, tblName)
232+
val rawTableName = tableIdent.last
233+
val tblName = tableNamePreprocessor(rawTableName)
234+
val table = client.getTable(databaseName, tblName).withTableName(rawTableName)
228235

229236
if (table.properties.get("spark.sql.sources.provider").isDefined) {
230237
val dataSourceTable =

sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
package org.apache.spark.sql.hive
1919

2020
import org.apache.hadoop.conf.Configuration
21-
import org.apache.hadoop.fs.{Path, PathFilter}
21+
import org.apache.hadoop.fs.{FileSystem, Path, PathFilter}
2222
import org.apache.hadoop.hive.conf.HiveConf
2323
import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants._
2424
import org.apache.hadoop.hive.ql.exec.Utilities
@@ -106,7 +106,11 @@ class HadoopTableReader(
106106
val broadcastedHiveConf = _broadcastedHiveConf
107107

108108
val tablePath = hiveTable.getPath
109-
val inputPathStr = applyFilterIfNeeded(tablePath, filterOpt)
109+
val fs = tablePath.getFileSystem(sc.hiveconf)
110+
val inputPathStr =
111+
sc.hadoopFileSelector.flatMap(
112+
_.selectFiles(relation.tableName, fs, tablePath)).map(_.mkString(",")).getOrElse(
113+
applyFilterIfNeeded(tablePath, filterOpt))
110114

111115
// logDebug("Table input: %s".format(tablePath))
112116
val ifc = hiveTable.getInputFormatClass
@@ -396,3 +400,16 @@ private[hive] object HadoopTableReader extends HiveInspectors with Logging {
396400
}
397401
}
398402
}
403+
404+
/**
 * Extension point allowing clients to plug in a custom algorithm for choosing which
 * files constitute a table during a scan.
 */
abstract class HadoopFileSelector {
  /**
   * Select files constituting a table from the given base path according to the client's custom
   * algorithm. This is only applied to non-partitioned tables.
   *
   * @param tableName table name to select files for
   * @param fs the filesystem containing the table
   * @param basePath base path of the table in the filesystem
   * @return a set of files, or [[None]] if the custom file selection algorithm does not apply
   *         to this table.
   */
  def selectFiles(tableName: String, fs: FileSystem, basePath: Path): Option[Seq[Path]]
}

sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientInterface.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,8 @@ private[hive] case class HiveTable(
6767
this
6868
}
6969

70+
/** Returns a copy of this table renamed to `newName`, re-attaching the current client. */
def withTableName(newName: String): HiveTable = copy(name = newName).withClient(client)
71+
7072
def database: String = specifiedDatabase.getOrElse(sys.error("database not resolved"))
7173

7274
def isPartitioned: Boolean = partitionColumns.nonEmpty

0 commit comments

Comments
 (0)