
Commit 6edb20d

cloud-fan authored and gengliangwang committed
[SPARK-31935][SQL][FOLLOWUP] Hadoop file system config should be effective in data source options
### What changes were proposed in this pull request?

This is a followup of #28760 to fix the remaining issues:
1. should consider data source options when refreshing the cache by path at the end of `InsertIntoHadoopFsRelationCommand`
2. should consider data source options when inferring the schema for a file source
3. should consider data source options when getting the qualified path in file source v2

### Why are the changes needed?

We didn't catch these issues in #28760 because its test case only checks for an error when initializing the file system. If the file system is initialized multiple times during a single read/write action, the test effectively exercises only the first initialization.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Rewrote the test so that it verifies the entire data source read/write action can succeed.

Closes #28948 from cloud-fan/fix.

Authored-by: Wenchen Fan <[email protected]>
Signed-off-by: Gengliang Wang <[email protected]>
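For context, here is a minimal sketch of the user-facing behavior this followup completes; the bucket name and credential values are hypothetical placeholders, not taken from the patch. Hadoop file system settings passed as data source options should apply to the whole read/write action, including schema inference and cache refresh, not only the first file system initialization.

// A minimal sketch, assuming a running SparkSession `spark`; the s3a bucket
// and credential values are hypothetical.
val df = spark.read
  .option("fs.s3a.access.key", "ACCESS_KEY")  // Hadoop FS setting passed as a data source option
  .option("fs.s3a.secret.key", "SECRET_KEY")
  .parquet("s3a://some-bucket/input")

// After this followup, the same options also take effect during schema
// inference and when the output path is re-cached at the end of the write.
df.write
  .option("fs.s3a.access.key", "ACCESS_KEY")
  .option("fs.s3a.secret.key", "SECRET_KEY")
  .parquet("s3a://some-bucket/output")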
1 parent f843a5b commit 6edb20d

File tree: 16 files changed, +59 −45 lines

external/avro/src/main/scala/org/apache/spark/sql/v2/avro/AvroDataSourceV2.scala

Lines changed: 2 additions & 2 deletions

@@ -31,13 +31,13 @@ class AvroDataSourceV2 extends FileDataSourceV2 {
 
   override def getTable(options: CaseInsensitiveStringMap): Table = {
     val paths = getPaths(options)
-    val tableName = getTableName(paths)
+    val tableName = getTableName(options, paths)
     AvroTable(tableName, sparkSession, options, paths, None, fallbackFileFormat)
   }
 
   override def getTable(options: CaseInsensitiveStringMap, schema: StructType): Table = {
     val paths = getPaths(options)
-    val tableName = getTableName(paths)
+    val tableName = getTableName(options, paths)
     AvroTable(tableName, sparkSession, options, paths, Some(schema), fallbackFileFormat)
   }
 }

sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala

Lines changed: 10 additions & 5 deletions

@@ -248,12 +248,17 @@ class CacheManager extends Logging with AdaptiveSparkPlanHelper {
    * `HadoopFsRelation` node(s) as part of its logical plan.
    */
   def recacheByPath(spark: SparkSession, resourcePath: String): Unit = {
-    val (fs, qualifiedPath) = {
-      val path = new Path(resourcePath)
-      val fs = path.getFileSystem(spark.sessionState.newHadoopConf())
-      (fs, fs.makeQualified(path))
-    }
+    val path = new Path(resourcePath)
+    val fs = path.getFileSystem(spark.sessionState.newHadoopConf())
+    recacheByPath(spark, path, fs)
+  }
 
+  /**
+   * Tries to re-cache all the cache entries that contain `resourcePath` in one or more
+   * `HadoopFsRelation` node(s) as part of its logical plan.
+   */
+  def recacheByPath(spark: SparkSession, resourcePath: Path, fs: FileSystem): Unit = {
+    val qualifiedPath = fs.makeQualified(resourcePath)
     recacheByCondition(spark, _.plan.find(lookupAndRefresh(_, fs, qualifiedPath)).isDefined)
   }
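The new overload lets a caller that has already built a FileSystem from an options-aware Hadoop configuration pass it in directly, instead of recacheByPath rebuilding one from the session configuration alone. A hedged sketch of the call-site pattern, usable only from within Spark's sql package since cacheManager is internal; the configuration and output path below are placeholders:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

// Sketch only: at the real call site (InsertIntoHadoopFsRelationCommand) the
// configuration already carries the data source options; a fresh Configuration
// stands in for it here, and the output path is hypothetical.
val hadoopConf = new Configuration()
val outputPath = new Path("/tmp/example/output")
val fs = outputPath.getFileSystem(hadoopConf)
spark.sharedState.cacheManager.recacheByPath(spark, outputPath, fs)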

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala

Lines changed: 1 addition & 1 deletion

@@ -192,7 +192,7 @@ case class InsertIntoHadoopFsRelationCommand(
       // refresh cached files in FileIndex
       fileIndex.foreach(_.refresh())
       // refresh data cache if table is cached
-      sparkSession.catalog.refreshByPath(outputPath.toString)
+      sparkSession.sharedState.cacheManager.recacheByPath(sparkSession, outputPath, fs)
 
       if (catalogTable.nonEmpty) {
         CommandUtils.updateTableStats(sparkSession, catalogTable.get)

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SchemaMergeUtils.scala

Lines changed: 3 additions & 1 deletion

@@ -32,10 +32,12 @@ object SchemaMergeUtils extends Logging {
    */
   def mergeSchemasInParallel(
       sparkSession: SparkSession,
+      parameters: Map[String, String],
       files: Seq[FileStatus],
       schemaReader: (Seq[FileStatus], Configuration, Boolean) => Seq[StructType])
       : Option[StructType] = {
-    val serializedConf = new SerializableConfiguration(sparkSession.sessionState.newHadoopConf())
+    val serializedConf = new SerializableConfiguration(
+      sparkSession.sessionState.newHadoopConfWithOptions(parameters))
 
     // !! HACK ALERT !!
     // Here is a hack for Parquet, but it can be used by Orc as well.

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala

Lines changed: 1 addition & 1 deletion

@@ -109,7 +109,7 @@ object OrcUtils extends Logging {
     val orcOptions = new OrcOptions(options, sparkSession.sessionState.conf)
     if (orcOptions.mergeSchema) {
       SchemaMergeUtils.mergeSchemasInParallel(
-        sparkSession, files, OrcUtils.readOrcSchemasInParallel)
+        sparkSession, options, files, OrcUtils.readOrcSchemasInParallel)
     } else {
       OrcUtils.readSchema(sparkSession, files)
     }

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala

Lines changed: 2 additions & 1 deletion

@@ -475,6 +475,7 @@ object ParquetFileFormat extends Logging {
    *     S3 nodes).
    */
   def mergeSchemasInParallel(
+      parameters: Map[String, String],
       filesToTouch: Seq[FileStatus],
       sparkSession: SparkSession): Option[StructType] = {
     val assumeBinaryIsString = sparkSession.sessionState.conf.isParquetBinaryAsString
@@ -490,7 +491,7 @@ object ParquetFileFormat extends Logging {
         .map(ParquetFileFormat.readSchemaFromFooter(_, converter))
     }
 
-    SchemaMergeUtils.mergeSchemasInParallel(sparkSession, filesToTouch, reader)
+    SchemaMergeUtils.mergeSchemasInParallel(sparkSession, parameters, filesToTouch, reader)
   }
 
   /**

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetUtils.scala

Lines changed: 1 addition & 1 deletion

@@ -104,7 +104,7 @@ object ParquetUtils {
         .orElse(filesByType.data.headOption)
         .toSeq
     }
-    ParquetFileFormat.mergeSchemasInParallel(filesToTouch, sparkSession)
+    ParquetFileFormat.mergeSchemasInParallel(parameters, filesToTouch, sparkSession)
   }
 
   case class FileTypes(

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileDataSourceV2.scala

Lines changed: 9 additions & 4 deletions

@@ -18,7 +18,10 @@ package org.apache.spark.sql.execution.datasources.v2
 
 import java.util
 
+import scala.collection.JavaConverters._
+
 import com.fasterxml.jackson.databind.ObjectMapper
+import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 
 import org.apache.spark.sql.SparkSession
@@ -53,14 +56,16 @@ trait FileDataSourceV2 extends TableProvider with DataSourceRegister {
     paths ++ Option(map.get("path")).toSeq
   }
 
-  protected def getTableName(paths: Seq[String]): String = {
-    val name = shortName() + " " + paths.map(qualifiedPathName).mkString(",")
+  protected def getTableName(map: CaseInsensitiveStringMap, paths: Seq[String]): String = {
+    val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(
+      map.asCaseSensitiveMap().asScala.toMap)
+    val name = shortName() + " " + paths.map(qualifiedPathName(_, hadoopConf)).mkString(",")
     Utils.redact(sparkSession.sessionState.conf.stringRedactionPattern, name)
   }
 
-  private def qualifiedPathName(path: String): String = {
+  private def qualifiedPathName(path: String, hadoopConf: Configuration): String = {
     val hdfsPath = new Path(path)
-    val fs = hdfsPath.getFileSystem(sparkSession.sessionState.newHadoopConf())
+    val fs = hdfsPath.getFileSystem(hadoopConf)
     hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory).toString
   }
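For illustration, a minimal sketch of the `newHadoopConfWithOptions` behavior the diff relies on: per-source options are layered on top of the session's Hadoop configuration, so a setting passed with the read/write wins over the session-level value. Note that `sessionState` is an internal API, so this only compiles inside Spark's sql package; the `fs.defaultFS` value is a hypothetical example.

// Sketch, assuming a SparkSession `spark`; the namenode address is hypothetical.
val hadoopConf = spark.sessionState.newHadoopConfWithOptions(
  Map("fs.defaultFS" -> "hdfs://namenode:8020"))
val fs = new Path("/data").getFileSystem(hadoopConf)  // resolved against the merged conf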

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVDataSourceV2.scala

Lines changed: 2 additions & 2 deletions

@@ -31,13 +31,13 @@ class CSVDataSourceV2 extends FileDataSourceV2 {
 
   override def getTable(options: CaseInsensitiveStringMap): Table = {
     val paths = getPaths(options)
-    val tableName = getTableName(paths)
+    val tableName = getTableName(options, paths)
     CSVTable(tableName, sparkSession, options, paths, None, fallbackFileFormat)
   }
 
   override def getTable(options: CaseInsensitiveStringMap, schema: StructType): Table = {
     val paths = getPaths(options)
-    val tableName = getTableName(paths)
+    val tableName = getTableName(options, paths)
     CSVTable(tableName, sparkSession, options, paths, Some(schema), fallbackFileFormat)
   }
 }

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonDataSourceV2.scala

Lines changed: 2 additions & 2 deletions

@@ -31,13 +31,13 @@ class JsonDataSourceV2 extends FileDataSourceV2 {
 
   override def getTable(options: CaseInsensitiveStringMap): Table = {
     val paths = getPaths(options)
-    val tableName = getTableName(paths)
+    val tableName = getTableName(options, paths)
     JsonTable(tableName, sparkSession, options, paths, None, fallbackFileFormat)
   }
 
   override def getTable(options: CaseInsensitiveStringMap, schema: StructType): Table = {
     val paths = getPaths(options)
-    val tableName = getTableName(paths)
+    val tableName = getTableName(options, paths)
     JsonTable(tableName, sparkSession, options, paths, Some(schema), fallbackFileFormat)
  }
 }
