Skip to content

Commit 0b3d298

Browse files
ianlcsd and csd-jenkins
authored and committed
SPY-1737: avoid excessive/redundant calls to DistributedFileSystem.isDirectory (#227)
* SPY-1737: avoid excessive/redundant calls to DistributedFileSystem.isDirectory
* debug level logging
1 parent a42b084 commit 0b3d298

File tree

2 files changed

+12
-15
lines changed

2 files changed

+12
-15
lines changed

sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala

Lines changed: 12 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -227,23 +227,28 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
227227
result.copy(output = newOutput)
228228
}
229229

230+
/**
231+
* Selecting directories at driver side by interacting with hdfs won't scale.
232+
* TODO: migrate to parquet partitioning
233+
*/
230234
private[hive] def selectParquetLocationDirectories(
231235
tableName: String,
232236
locationOpt: Option[Path]): Seq[Path] = {
237+
val start = System.currentTimeMillis
233238
val hadoopConf = sparkSession.sparkContext.hadoopConfiguration
234239
val paths: Option[Seq[Path]] = for {
235240
selector <- sparkSession.sharedState.externalCatalog.findHadoopFileSelector
236241
location <- locationOpt
237242
fs = location.getFileSystem(hadoopConf)
238-
selectedPaths <- selector.selectFiles(tableName, fs, location)
239-
selectedDir = for {
240-
selectedPath <- selectedPaths
241-
if selectedPath
242-
.getFileSystem(hadoopConf)
243-
.isDirectory(selectedPath)
244-
} yield selectedPath
243+
// Csd's HadoopFileSelector is expected to return directories only, so the per-path isDirectory check is skipped here.
244+
selectedDir <- selector.selectFiles(tableName, fs, location)
245245
if selectedDir.nonEmpty
246246
} yield selectedDir
247+
logDebug(
248+
s"process duration of HiveMetastoreCatalog.selectParquetLocationDirectories(" +
249+
s"$tableName, $locationOpt): ${System.currentTimeMillis - start}, selected directories: " +
250+
s"${paths.map(_.size).getOrElse(0)}")
251+
247252
paths.getOrElse(Seq(locationOpt.orNull))
248253
}
249254

sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala

Lines changed: 0 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -223,14 +223,6 @@ class ParquetLocationSelectionSuite extends QueryTest with SQLTestUtils with Tes
223223
hmc.selectParquetLocationDirectories("sometable", Option(new Path("somewhere")))
224224
}
225225

226-
// ensure file existence for somewhere/sometable
227-
somewhereSometable.delete()
228-
somewhereSometable.createNewFile()
229-
// somewhere/sometable is a file => will not be selected
230-
assertResult(Seq(new Path("somewhere"))) {
231-
hmc.selectParquetLocationDirectories("otherplace", Option(new Path("somewhere")))
232-
}
233-
234226
// no location specified, none selected
235227
assertResult(Seq(null)) {
236228
hmc.selectParquetLocationDirectories("sometable", Option(null))

0 commit comments

Comments
 (0)