Commit dcbe683

Add docstrings to hooks used for selecting a custom set of files
1 parent 3958ff9 commit dcbe683

2 files changed: +24 -4 lines changed

sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala

Lines changed: 18 additions & 0 deletions
@@ -518,10 +518,28 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
     }
   }
 
+  /**
+   * Allows the user to pre-process table names before the Hive metastore is looked up. This can
+   * be used to encode additional information into the table name, such as a version number
+   * (e.g. `mytable_v1`, `mytable_v2`, etc.).
+   * @param tableNamePreprocessor a function to be applied to the Hive table name before we look up
+   *                              the table in the Hive metastore.
+   */
   def setTableNamePreprocessor(tableNamePreprocessor: (String) => String): Unit = {
     catalog.setTableNamePreprocessor(tableNamePreprocessor)
   }
 
+  /**
+   * Allows registering a custom way of selecting the files/directories to be included in a table
+   * scan, based on the table name. This can be used together with [[setTableNamePreprocessor]] to
+   * customize table scan results based on the specified table name, e.g. `mytable_v1` could have a
+   * different set of files than `mytable_v2`, and both of these "virtual tables" would be backed
+   * by a real Hive table `mytable`. Note that the table name passed to the user-provided file
+   * selection method is the name specified in the query, not the table name in the Hive metastore
+   * that is generated by applying the user-specified "table name preprocessor" function.
+   * @param hadoopFileSelector the file selector to use, or `None` to clear a previously set one.
+   * @see [[setTableNamePreprocessor]]
+   */
   def setHadoopFileSelector(hadoopFileSelector: Option[HadoopFileSelector]): Unit = {
     this.hadoopFileSelector = hadoopFileSelector
   }
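
To make the intent of these two hooks concrete, here is a minimal usage sketch. It assumes an existing HiveContext instance named `hiveContext` and a hypothetical HadoopFileSelector implementation called `VersionSuffixFileSelector` (a matching sketch follows the TableReader.scala diff below); the setter signatures and the `mytable_v1`/`mytable_v2` naming scheme come from the docstrings above.

// Usage sketch only: `hiveContext` is an assumed, pre-existing HiveContext.

// Strip a trailing version suffix (e.g. "mytable_v2" -> "mytable") before the
// metastore lookup, so every "virtual" version resolves to the same Hive table.
hiveContext.setTableNamePreprocessor { tableName =>
  tableName.replaceAll("_v\\d+$", "")
}

// Register a selector that decides which files back each virtual table. It receives
// the table name exactly as written in the query (e.g. "mytable_v2"), not the
// preprocessed metastore name.
hiveContext.setHadoopFileSelector(Some(new VersionSuffixFileSelector))  // hypothetical class, sketched below

// Passing None clears the hook and restores the default file selection.
hiveContext.setHadoopFileSelector(None)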

sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala

Lines changed: 6 additions & 4 deletions
@@ -107,9 +107,9 @@ class HadoopTableReader(
 
     val tablePath = hiveTable.getPath
     val fs = tablePath.getFileSystem(sc.hiveconf)
-    val inputPaths: Seq[Path] =
+    val inputPaths: Seq[String] =
       sc.hadoopFileSelector.flatMap(
-        _.selectFiles(hiveTable.getTableName, fs, tablePath).
+        _.selectFiles(hiveTable.getTableName, fs, tablePath)
       ).map(_.map(_.toString)).getOrElse(applyFilterIfNeeded(tablePath, filterOpt))
 
     // logDebug("Table input: %s".format(tablePath))
@@ -263,7 +263,7 @@ class HadoopTableReader(
       case Some(filter) =>
         val fs = path.getFileSystem(sc.hiveconf)
         fs.listStatus(path, filter).map(_.getPath.toString)
-      case None => Seq(path)
+      case None => Seq(path.toString)
     }
   }
 
@@ -404,7 +404,9 @@ abstract class HadoopFileSelector {
   /**
    * Select files constituting a table from the given base path according to the client's custom
    * algorithm. This is only applied to non-partitioned tables.
-   * @param tableName table name to select files for
+   * @param tableName table name to select files for. This is the exact table name specified
+   *                  in the query, not a "preprocessed" table name returned by the user-defined
+   *                  function registered via [[HiveContext.setTableNamePreprocessor]].
    * @param fs the filesystem containing the table
    * @param basePath base path of the table in the filesystem
    * @return a set of files, or [[None]] if the custom file selection algorithm does not apply
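
For illustration, here is a sketch of what a concrete HadoopFileSelector could look like, continuing the hypothetical version-suffix scheme from above. The selectFiles signature (returning Option[Seq[Path]]) is inferred from the call site in HadoopTableReader and the parameter docs, and the directory layout (one vN subdirectory per version under the table's base path) is purely an assumption made for the example.

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.sql.hive.HadoopFileSelector

// Hypothetical selector: a table queried as "mytable_v2" is backed by the files under
// <basePath>/v2 of the real Hive table "mytable". Returning None falls back to the
// default selection (applyFilterIfNeeded in HadoopTableReader).
class VersionSuffixFileSelector extends HadoopFileSelector {

  private val Versioned = """.*_v(\d+)""".r

  override def selectFiles(tableName: String, fs: FileSystem, basePath: Path): Option[Seq[Path]] = {
    tableName match {
      case Versioned(version) =>
        val versionDir = new Path(basePath, s"v$version")
        if (fs.exists(versionDir)) {
          // Only plain files are returned; subdirectories are skipped.
          Some(fs.listStatus(versionDir).filter(_.isFile).map(_.getPath).toSeq)
        } else {
          None
        }
      case _ =>
        None  // not a versioned name: let the default file selection apply
    }
  }
}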
