[SPARK-28030][SQL] convert filePath to URI in binary file data source

mengxr · mengxr · commit 4f4829b4ae26 · 2019-06-12T13:24:02.000-07:00
## What changes were proposed in this pull request?

Convert `PartitionedFile.filePath` to a `URI` first in the binary file data source. `filePath` is a URL-encoded string, so creating a `Path` directly from it, instead of wrapping it in a `URI`, keeps the encoded characters in the path, and Spark throws a `FileNotFoundException` for file names that require URL encoding (for example, names containing a space).

## How was this patch tested?

Unit test.

Closes #24855 from mengxr/SPARK-28030.

Authored-by: Xiangrui Meng <meng@databricks.com>
Signed-off-by: Xiangrui Meng <meng@databricks.com>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala
@@ -38,7 +38,7 @@ import org.apache.spark.util.NextIterator
  * that need to be prepended to each row.
  *
  * @param partitionValues value of partition columns to be prepended to each row.
- * @param filePath path of the file to read
+ * @param filePath URI of the file to read
  * @param start the beginning offset (in bytes) of the block.
  * @param length number of bytes to read.
  * @param locations locality information (list of nodes that have the data).
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.execution.datasources.binaryfile
 
+import java.net.URI
 import java.sql.Timestamp
 
 import com.google.common.io.{ByteStreams, Closeables}
@@ -100,7 +101,7 @@ class BinaryFileFormat extends FileFormat with DataSourceRegister {
     val maxLength = sparkSession.conf.get(SOURCES_BINARY_FILE_MAX_LENGTH)
 
     file: PartitionedFile => {
-      val path = new Path(file.filePath)
+      val path = new Path(new URI(file.filePath))
       val fs = path.getFileSystem(broadcastedHadoopConf.value.value)
       val status = fs.getFileStatus(path)
       if (filterFuncs.forall(_.apply(status))) {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormatSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormatSuite.scala
@@ -368,4 +368,18 @@ class BinaryFileFormatSuite extends QueryTest with SharedSQLContext with SQLTest
       assert(caught.getMessage.contains("exceeds the max length allowed"))
     }
   }
+
+  test("SPARK-28030: support chars in file names that require URL encoding") {
+    withTempDir { dir =>
+      val file = new File(dir, "test space.txt")
+      val content = "123".getBytes
+      Files.write(file.toPath, content, StandardOpenOption.CREATE, StandardOpenOption.WRITE)
+      val df = spark.read.format(BINARY_FILE).load(dir.getPath)
+      df.select(col(PATH), col(CONTENT)).first() match {
+        case Row(p: String, c: Array[Byte]) =>
+          assert(p.endsWith(file.getAbsolutePath), "should support space in file name")
+          assert(c === content, "should read file with space in file name")
+      }
+    }
+  }
 }

Original file line number	Diff line number	Diff line change
`@@ -38,7 +38,7 @@ import org.apache.spark.util.NextIterator`
`38`	`38`	`* that need to be prepended to each row.`
`39`	`39`	`*`
`40`	`40`	`* @param partitionValues value of partition columns to be prepended to each row.`
`41`		`- * @param filePath path of the file to read`
	`41`	`+ * @param filePath URI of the file to read`
`42`	`42`	`* @param start the beginning offset (in bytes) of the block.`
`43`	`43`	`* @param length number of bytes to read.`
`44`	`44`	`* @param locations locality information (list of nodes that have the data).`
Original file line number	Diff line number	Diff line change
`@@ -368,4 +368,18 @@ class BinaryFileFormatSuite extends QueryTest with SharedSQLContext with SQLTest`
`368`	`368`	`assert(caught.getMessage.contains("exceeds the max length allowed"))`
`369`	`369`	`}`
`370`	`370`	`}`
	`371`	`+`
	`372`	`+ test("SPARK-28030: support chars in file names that require URL encoding") {`
	`373`	`+ withTempDir { dir =>`
	`374`	`+ val file = new File(dir, "test space.txt")`
	`375`	`+ val content = "123".getBytes`
	`376`	`+ Files.write(file.toPath, content, StandardOpenOption.CREATE, StandardOpenOption.WRITE)`
	`377`	`+ val df = spark.read.format(BINARY_FILE).load(dir.getPath)`
	`378`	`+ df.select(col(PATH), col(CONTENT)).first() match {`
	`379`	`+ case Row(p: String, c: Array[Byte]) =>`
	`380`	`+ assert(p.endsWith(file.getAbsolutePath), "should support space in file name")`
	`381`	`+ assert(c === content, "should read file with space in file name")`
	`382`	`+ }`
	`383`	`+ }`
	`384`	`+ }`
`371`	`385`	`}`