SPY-1737: avoid excessive/redundant calls to DistributedFileSystem.isDirectory (#227)

ianlcsd · csd-jenkins · commit 0b3d2983feb4 · 2018-04-19T18:12:45.000-07:00
* SPY-1737: avoid excessive/redundant calls to DistributedFileSystem.isDirectory

* log the selection duration and selected directory count at debug level
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -227,23 +227,28 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
     result.copy(output = newOutput)
   }
 
+  /**
+   * Selecting directories on the driver side by querying HDFS will not scale.
+   * TODO: migrate to parquet partitioning
+   */
   private[hive] def selectParquetLocationDirectories(
       tableName: String,
       locationOpt: Option[Path]): Seq[Path] = {
+    val start = System.currentTimeMillis
     val hadoopConf = sparkSession.sparkContext.hadoopConfiguration
     val paths: Option[Seq[Path]] = for {
       selector <- sparkSession.sharedState.externalCatalog.findHadoopFileSelector
       location <- locationOpt
       fs = location.getFileSystem(hadoopConf)
-      selectedPaths <- selector.selectFiles(tableName, fs, location)
-      selectedDir = for {
-        selectedPath <- selectedPaths
-        if selectedPath
-          .getFileSystem(hadoopConf)
-          .isDirectory(selectedPath)
-      } yield selectedPath
+      // Csd's HadoopFileSelector is expected to return directories only.
+      selectedDir <- selector.selectFiles(tableName, fs, location)
       if selectedDir.nonEmpty
     } yield selectedDir
+    logDebug(
+      s"process duration of HiveMetastoreCatalog.selectParquetLocationDirectories(" +
+        s"$tableName, $locationOpt): ${System.currentTimeMillis - start}, selected directories: " +
+        s"${paths.map(_.size).getOrElse(0)}")
+
     paths.getOrElse(Seq(locationOpt.orNull))
   }
 
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala
@@ -223,14 +223,6 @@ class ParquetLocationSelectionSuite extends QueryTest with SQLTestUtils with Tes
       hmc.selectParquetLocationDirectories("sometable", Option(new Path("somewhere")))
     }
 
-    // ensure file existence for somewhere/sometable
-    somewhereSometable.delete()
-    somewhereSometable.createNewFile()
-    // somewhere/sometable is a file => will not be selected
-    assertResult(Seq(new Path("somewhere"))) {
-      hmc.selectParquetLocationDirectories("otherplace", Option(new Path("somewhere")))
-    }
-
     // no location specified, none selected
     assertResult(Seq(null)) {
       hmc.selectParquetLocationDirectories("sometable", Option(null))