diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index fa65f88992f84..e47f64a74bd1a 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -227,23 +227,28 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
     result.copy(output = newOutput)
   }
 
+  /**
+   * Selecting directories at driver side by interacting with hdfs won't scale.
+   * TODO: migrate to parquet partitioning
+   */
   private[hive] def selectParquetLocationDirectories(
       tableName: String,
       locationOpt: Option[Path]): Seq[Path] = {
+    val start = System.currentTimeMillis
     val hadoopConf = sparkSession.sparkContext.hadoopConfiguration
     val paths: Option[Seq[Path]] = for {
       selector <- sparkSession.sharedState.externalCatalog.findHadoopFileSelector
       location <- locationOpt
       fs = location.getFileSystem(hadoopConf)
-      selectedPaths <- selector.selectFiles(tableName, fs, location)
-      selectedDir = for {
-        selectedPath <- selectedPaths
-        if selectedPath
-          .getFileSystem(hadoopConf)
-          .isDirectory(selectedPath)
-      } yield selectedPath
+      // Csd's HadoopFileSelector should guarantee to return directories only.
+      selectedDir <- selector.selectFiles(tableName, fs, location) if selectedDir.nonEmpty
     } yield selectedDir
 
+    logDebug(
+      s"process duration of HiveMetastoreCatalog.selectParquetLocationDirectories(" +
+      s"$tableName, $locationOpt): ${System.currentTimeMillis - start}, selected directories: " +
+      s"${paths.map(_.size).getOrElse(0)}")
+
     paths.getOrElse(Seq(locationOpt.orNull))
   }
 
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala
index f6c41b6ee0fbd..4e19d79b738d3 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala
@@ -223,14 +223,6 @@ class ParquetLocationSelectionSuite extends QueryTest with SQLTestUtils with Tes
       hmc.selectParquetLocationDirectories("sometable", Option(new Path("somewhere")))
     }
 
-    // ensure file existence for somewhere/sometable
-    somewhereSometable.delete()
-    somewhereSometable.createNewFile()
-    // somewhere/sometable is a file => will not be selected
-    assertResult(Seq(new Path("somewhere"))) {
-      hmc.selectParquetLocationDirectories("otherplace", Option(new Path("somewhere")))
-    }
-
     // no location specified, none selected
     assertResult(Seq(null)) {
       hmc.selectParquetLocationDirectories("sometable", Option(null))