@@ -227,23 +227,28 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
     result.copy(output = newOutput)
   }
 
+  /**
+   * Selecting directories on the driver side by interacting with HDFS won't scale.
+   * TODO: migrate to Parquet partitioning.
+   */
   private[hive] def selectParquetLocationDirectories(
       tableName: String,
       locationOpt: Option[Path]): Seq[Path] = {
+    val start = System.currentTimeMillis
     val hadoopConf = sparkSession.sparkContext.hadoopConfiguration
     val paths: Option[Seq[Path]] = for {
       selector <- sparkSession.sharedState.externalCatalog.findHadoopFileSelector
       location <- locationOpt
       fs = location.getFileSystem(hadoopConf)
-      selectedPaths <- selector.selectFiles(tableName, fs, location)
-      selectedDir = for {
-        selectedPath <- selectedPaths
-        if selectedPath
-          .getFileSystem(hadoopConf)
-          .isDirectory(selectedPath)
-      } yield selectedPath
+      // Csd's HadoopFileSelector should guarantee to return directories only.
+      selectedDir <- selector.selectFiles(tableName, fs, location)
       if selectedDir.nonEmpty
     } yield selectedDir
+    logDebug(
+      s"process duration of HiveMetastoreCatalog.selectParquetLocationDirectories(" +
+        s"$tableName, $locationOpt): ${System.currentTimeMillis - start} ms, selected directories: " +
+        s"${paths.map(_.size).getOrElse(0)}")
+
     paths.getOrElse(Seq(locationOpt.orNull))
   }
 
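For context, a minimal sketch of the contract the added comment relies on: a HadoopFileSelector whose selectFiles returns directories only, so the caller can drop the per-path isDirectory checks removed above. The trait signature (selectFiles returning Option[Seq[Path]]) is inferred from the for-comprehension at the call site, and DirectoriesOnlySelector is a hypothetical illustration, not the fork's actual implementation.

import org.apache.hadoop.fs.{FileSystem, Path}

// Hypothetical trait, inferred from the call site above; the real fork's
// definition may differ.
trait HadoopFileSelector {
  def selectFiles(tableName: String, fs: FileSystem, basePath: Path): Option[Seq[Path]]
}

// Illustrative implementation of the directories-only guarantee: a single
// listStatus call instead of one isDirectory round trip per candidate path.
class DirectoriesOnlySelector extends HadoopFileSelector {
  override def selectFiles(
      tableName: String,
      fs: FileSystem,
      basePath: Path): Option[Seq[Path]] = {
    val dirs = fs.listStatus(basePath)
      .filter(_.isDirectory)  // FileStatus already carries the flag, no extra RPC
      .map(_.getPath)
      .toSeq
    if (dirs.nonEmpty) Some(dirs) else None
  }
}

Pushing the directory filter into the selector keeps the driver to one NameNode round trip per location, whereas the removed for-comprehension issued a getFileSystem/isDirectory call for every selected path, which is what the new doc comment flags as unscalable.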