Commit dcbe683

Add docstrings to hooks used for selecting a custom set of files
1 parent 3958ff9 commit dcbe683

2 files changed: +24 -4 lines changed

sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala

Lines changed: 18 additions & 0 deletions
@@ -518,10 +518,28 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
     }
   }
 
+  /**
+   * Allows the user to pre-process table names before the Hive metastore is looked up. This can
+   * be used to encode additional information into the table name, such as a version number
+   * (e.g. `mytable_v1`, `mytable_v2`, etc.).
+   * @param tableNamePreprocessor a function to be applied to the Hive table name before we look up
+   *                              the table in the Hive metastore.
+   */
   def setTableNamePreprocessor(tableNamePreprocessor: (String) => String): Unit = {
     catalog.setTableNamePreprocessor(tableNamePreprocessor)
   }
 
+  /**
+   * Allows registering a custom way of selecting the files/directories to be included in a table
+   * scan, based on the table name. This can be used together with [[setTableNamePreprocessor]] to
+   * customize table scan results based on the specified table name, e.g. `mytable_v1` could have a
+   * different set of files than `mytable_v2`, and both of these "virtual tables" would be backed
+   * by a real Hive table `mytable`. Note that the table name passed to the user-provided file
+   * selection method is the name specified in the query, not the table name in the Hive metastore
+   * that is generated by applying the user-specified "table name preprocessor" function.
+   * @param hadoopFileSelector the file selector to use, or `None` to clear a previously set one.
+   * @see [[setTableNamePreprocessor]]
+   */
   def setHadoopFileSelector(hadoopFileSelector: Option[HadoopFileSelector]): Unit = {
     this.hadoopFileSelector = hadoopFileSelector
   }
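
To make the intent of these two hooks concrete, here is a minimal usage sketch. It assumes an existing HiveContext instance named `hiveContext` and a hypothetical HadoopFileSelector implementation called `VersionSuffixFileSelector` (a matching sketch follows the TableReader.scala diff below); the setter signatures and the `mytable_v1`/`mytable_v2` naming scheme come from the docstrings above.

// Usage sketch only: `hiveContext` is an assumed, pre-existing HiveContext.

// Strip a trailing version suffix (e.g. "mytable_v2" -> "mytable") before the
// metastore lookup, so every "virtual" version resolves to the same Hive table.
hiveContext.setTableNamePreprocessor { tableName =>
  tableName.replaceAll("_v\\d+$", "")
}

// Register a selector that decides which files back each virtual table. It receives
// the table name exactly as written in the query (e.g. "mytable_v2"), not the
// preprocessed metastore name.
hiveContext.setHadoopFileSelector(Some(new VersionSuffixFileSelector))  // hypothetical class, sketched below

// Passing None clears the hook and restores the default file selection.
hiveContext.setHadoopFileSelector(None)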

sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala

Lines changed: 6 additions & 4 deletions
@@ -107,9 +107,9 @@ class HadoopTableReader(
 
     val tablePath = hiveTable.getPath
     val fs = tablePath.getFileSystem(sc.hiveconf)
-    val inputPaths: Seq[Path] =
+    val inputPaths: Seq[String] =
       sc.hadoopFileSelector.flatMap(
-        _.selectFiles(hiveTable.getTableName, fs, tablePath).
+        _.selectFiles(hiveTable.getTableName, fs, tablePath)
       ).map(_.map(_.toString)).getOrElse(applyFilterIfNeeded(tablePath, filterOpt))
 
     // logDebug("Table input: %s".format(tablePath))
@@ -263,7 +263,7 @@ class HadoopTableReader(
       case Some(filter) =>
         val fs = path.getFileSystem(sc.hiveconf)
         fs.listStatus(path, filter).map(_.getPath.toString)
-      case None => Seq(path)
+      case None => Seq(path.toString)
     }
   }
 
@@ -404,7 +404,9 @@ abstract class HadoopFileSelector {
   /**
    * Select files constituting a table from the given base path according to the client's custom
    * algorithm. This is only applied to non-partitioned tables.
-   * @param tableName table name to select files for
+   * @param tableName table name to select files for. This is the exact table name specified
+   *                  in the query, not a "preprocessed" table name returned by the user-defined
+   *                  function registered via [[HiveContext.setTableNamePreprocessor]].
    * @param fs the filesystem containing the table
    * @param basePath base path of the table in the filesystem
    * @return a set of files, or [[None]] if the custom file selection algorithm does not apply
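
For illustration, here is a sketch of what a concrete HadoopFileSelector could look like, continuing the hypothetical version-suffix scheme from above. The selectFiles signature (returning Option[Seq[Path]]) is inferred from the call site in HadoopTableReader and the parameter docs, and the directory layout (one vN subdirectory per version under the table's base path) is purely an assumption made for the example.

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.sql.hive.HadoopFileSelector

// Hypothetical selector: a table queried as "mytable_v2" is backed by the files under
// <basePath>/v2 of the real Hive table "mytable". Returning None falls back to the
// default selection (applyFilterIfNeeded in HadoopTableReader).
class VersionSuffixFileSelector extends HadoopFileSelector {

  private val Versioned = """.*_v(\d+)""".r

  override def selectFiles(tableName: String, fs: FileSystem, basePath: Path): Option[Seq[Path]] = {
    tableName match {
      case Versioned(version) =>
        val versionDir = new Path(basePath, s"v$version")
        if (fs.exists(versionDir)) {
          // Only plain files are returned; subdirectories are skipped.
          Some(fs.listStatus(versionDir).filter(_.isFile).map(_.getPath).toSeq)
        } else {
          None
        }
      case _ =>
        None  // not a versioned name: let the default file selection apply
    }
  }
}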
