Commit adbfabe

gengliangwang authored and Jackey Lee committed
[SPARK-26871][SQL] File Source V2: avoid creating unnecessary FileIndex in the write path

## What changes were proposed in this pull request?

In apache#23383, the file source V2 framework was implemented. In that PR, `FileIndex` is created as a member of `FileTable`, so that we can implement partition pruning like apache@0f9fcab in the future. (As the data source V2 catalog is under development, partition pruning was removed from that PR.)

However, after the write path of file source V2 was implemented, I found that a simple write creates an unnecessary `FileIndex`, which is required by `FileTable`. This is a sort of regression, and we can see a warning message when writing to ORC files:

```
WARN InMemoryFileIndex: The directory file:/tmp/foo was not found. Was it deleted very recently?
```

This PR makes `FileIndex` a lazy value in `FileTable`, so that we avoid creating an unnecessary `FileIndex` in the write path.

## How was this patch tested?

Existing unit tests.

Closes apache#23774 from gengliangwang/moveFileIndexInV2.

Authored-by: Gengliang Wang <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
1 parent 04feb6b commit adbfabe
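
The fix relies on the standard Scala `lazy val` idiom: a field whose initializer runs only on first access, so code paths that never touch it (here, a plain write) never trigger the expensive, side-effecting file listing. Below is a minimal, self-contained sketch of that idiom; `ExpensiveIndex`, `Table`, and the path are illustrative stand-ins, not Spark APIs.

```scala
// Minimal sketch of the lazy-initialization idiom this commit applies to FileTable.
// ExpensiveIndex and Table are made-up stand-ins, not Spark classes.
class ExpensiveIndex(path: String) {
  // Side effect we want to avoid on a pure write (analogous to listing files
  // under an output directory that may not exist yet).
  println(s"listing files under $path")
  val files: Seq[String] = Seq.empty // stub for the actual listing
}

class Table(path: String) {
  // Built only on first access; a write-only caller never pays for it.
  lazy val index: ExpensiveIndex = new ExpensiveIndex(path)

  def write(rows: Seq[String]): Unit = println(s"writing ${rows.size} rows to $path")
  def read(): Seq[String] = index.files // forces the lazy val on first read
}

object LazyIndexDemo extends App {
  val t = new Table("/tmp/foo")
  t.write(Seq("a", "b")) // prints only the "writing ..." line
  t.read()               // "listing files under /tmp/foo" appears here, not earlier
}
```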

File tree

5 files changed, +21 -29 lines changed


sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FallbackOrcDataSourceV2.scala

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@ class FallbackOrcDataSourceV2(sparkSession: SparkSession) extends Rule[LogicalPl
   override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators {
     case i @ InsertIntoTable(d @ DataSourceV2Relation(table: OrcTable, _, _), _, _, _, _) =>
       val v1FileFormat = new OrcFileFormat
-      val relation = HadoopFsRelation(table.getFileIndex, table.getFileIndex.partitionSchema,
+      val relation = HadoopFsRelation(table.fileIndex, table.fileIndex.partitionSchema,
         table.schema(), None, v1FileFormat, d.options)(sparkSession)
       i.copy(table = LogicalRelation(relation))
   }

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileDataSourceV2.scala

Lines changed: 2 additions & 18 deletions
@@ -16,13 +16,10 @@
  */
 package org.apache.spark.sql.execution.datasources.v2
 
-import scala.collection.JavaConverters._
-
-import org.apache.spark.sql.{AnalysisException, SparkSession}
+import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.execution.datasources._
 import org.apache.spark.sql.sources.DataSourceRegister
-import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2, SupportsBatchRead, TableProvider}
-import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.sources.v2.TableProvider
 
 /**
  * A base interface for data source v2 implementations of the built-in file-based data sources.
@@ -38,17 +35,4 @@ trait FileDataSourceV2 extends TableProvider with DataSourceRegister {
   def fallBackFileFormat: Class[_ <: FileFormat]
 
   lazy val sparkSession = SparkSession.active
-
-  def getFileIndex(
-      options: DataSourceOptions,
-      userSpecifiedSchema: Option[StructType]): PartitioningAwareFileIndex = {
-    val filePaths = options.paths()
-    val hadoopConf =
-      sparkSession.sessionState.newHadoopConfWithOptions(options.asMap().asScala.toMap)
-    val rootPathsSpecified = DataSource.checkAndGlobPathIfNecessary(filePaths, hadoopConf,
-      checkEmptyGlobPath = true, checkFilesExist = options.checkFilesExist())
-    val fileStatusCache = FileStatusCache.getOrCreate(sparkSession)
-    new InMemoryFileIndex(sparkSession, rootPathsSpecified,
-      options.asMap().asScala.toMap, userSpecifiedSchema, fileStatusCache)
-  }
 }

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileTable.scala

Lines changed: 14 additions & 3 deletions
@@ -16,19 +16,30 @@
  */
 package org.apache.spark.sql.execution.datasources.v2
 
+import scala.collection.JavaConverters._
+
 import org.apache.hadoop.fs.FileStatus
 
 import org.apache.spark.sql.{AnalysisException, SparkSession}
 import org.apache.spark.sql.execution.datasources._
-import org.apache.spark.sql.sources.v2.{SupportsBatchRead, SupportsBatchWrite, Table}
+import org.apache.spark.sql.sources.v2.{DataSourceOptions, SupportsBatchRead, SupportsBatchWrite, Table}
 import org.apache.spark.sql.types.StructType
 
 abstract class FileTable(
     sparkSession: SparkSession,
-    fileIndex: PartitioningAwareFileIndex,
+    options: DataSourceOptions,
     userSpecifiedSchema: Option[StructType])
   extends Table with SupportsBatchRead with SupportsBatchWrite {
-  def getFileIndex: PartitioningAwareFileIndex = this.fileIndex
+  lazy val fileIndex: PartitioningAwareFileIndex = {
+    val filePaths = options.paths()
+    val hadoopConf =
+      sparkSession.sessionState.newHadoopConfWithOptions(options.asMap().asScala.toMap)
+    val rootPathsSpecified = DataSource.checkAndGlobPathIfNecessary(filePaths, hadoopConf,
+      checkEmptyGlobPath = true, checkFilesExist = options.checkFilesExist())
+    val fileStatusCache = FileStatusCache.getOrCreate(sparkSession)
+    new InMemoryFileIndex(sparkSession, rootPathsSpecified,
+      options.asMap().asScala.toMap, userSpecifiedSchema, fileStatusCache)
+  }
 
   lazy val dataSchema: StructType = userSpecifiedSchema.orElse {
     inferSchema(fileIndex.allFiles())
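
For context, the user-visible effect of this lazy `fileIndex` can be sketched from a spark-shell session. This assumes the ORC write actually goes through the file source V2 path (which depends on the session's configuration), and `/tmp/foo` is just an example path:

```scala
// Hedged spark-shell sketch; `spark` is the SparkSession the shell provides.

// A pure write no longer needs a listing of the (possibly nonexistent) output
// directory, so the lazy fileIndex is never materialized and the
// "InMemoryFileIndex: The directory file:/tmp/foo was not found" warning is avoided.
spark.range(10).write.mode("overwrite").orc("/tmp/foo")

// A read does need the listing, so fileIndex is built on first access here.
spark.read.orc("/tmp/foo").count()
```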

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcDataSourceV2.scala

Lines changed: 2 additions & 4 deletions
@@ -34,13 +34,11 @@ class OrcDataSourceV2 extends FileDataSourceV2 {
 
   override def getTable(options: DataSourceOptions): Table = {
     val tableName = getTableName(options)
-    val fileIndex = getFileIndex(options, None)
-    OrcTable(tableName, sparkSession, fileIndex, None)
+    OrcTable(tableName, sparkSession, options, None)
   }
 
   override def getTable(options: DataSourceOptions, schema: StructType): Table = {
     val tableName = getTableName(options)
-    val fileIndex = getFileIndex(options, Some(schema))
-    OrcTable(tableName, sparkSession, fileIndex, Some(schema))
+    OrcTable(tableName, sparkSession, options, Some(schema))
   }
 }

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcTable.scala

Lines changed: 2 additions & 3 deletions
@@ -19,7 +19,6 @@ package org.apache.spark.sql.execution.datasources.v2.orc
 import org.apache.hadoop.fs.FileStatus
 
 import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex
 import org.apache.spark.sql.execution.datasources.orc.OrcUtils
 import org.apache.spark.sql.execution.datasources.v2.FileTable
 import org.apache.spark.sql.sources.v2.DataSourceOptions
@@ -29,9 +28,9 @@ import org.apache.spark.sql.types.StructType
 case class OrcTable(
     name: String,
     sparkSession: SparkSession,
-    fileIndex: PartitioningAwareFileIndex,
+    options: DataSourceOptions,
     userSpecifiedSchema: Option[StructType])
-  extends FileTable(sparkSession, fileIndex, userSpecifiedSchema) {
+  extends FileTable(sparkSession, options, userSpecifiedSchema) {
   override def newScanBuilder(options: DataSourceOptions): OrcScanBuilder =
     new OrcScanBuilder(sparkSession, fileIndex, schema, dataSchema, options)
