From 56a35a3aff88ed24e370823368e63ba09832dce1 Mon Sep 17 00:00:00 2001
From: Shixiong Zhu <shixiong@databricks.com>
Date: Tue, 8 Dec 2015 15:27:19 -0800
Subject: [PATCH 1/2] Make Utils.fetchFile support files that contain special
 characters

---
 .../org/apache/spark/HttpFileServer.scala     |  6 ++---
 .../spark/rpc/netty/NettyStreamManager.scala  |  5 ++--
 .../scala/org/apache/spark/util/Utils.scala   | 26 ++++++++++++++++++-
 .../org/apache/spark/rpc/RpcEnvSuite.scala    |  4 +++
 .../org/apache/spark/util/UtilsSuite.scala    | 11 ++++++++
 5 files changed, 46 insertions(+), 6 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/HttpFileServer.scala b/core/src/main/scala/org/apache/spark/HttpFileServer.scala
index 7cf7bc0dc681..ee9bfea8bc57 100644
--- a/core/src/main/scala/org/apache/spark/HttpFileServer.scala
+++ b/core/src/main/scala/org/apache/spark/HttpFileServer.scala
@@ -63,12 +63,12 @@ private[spark] class HttpFileServer(
 
   def addFile(file: File) : String = {
     addFileToDir(file, fileDir)
-    serverUri + "/files/" + file.getName
+    serverUri + "/files/" + Utils.encodeFileNameToURIRawPath(file.getName)
   }
 
   def addJar(file: File) : String = {
     addFileToDir(file, jarDir)
-    serverUri + "/jars/" + file.getName
+    serverUri + "/jars/" + Utils.encodeFileNameToURIRawPath(file.getName)
   }
 
   def addFileToDir(file: File, dir: File) : String = {
@@ -80,7 +80,7 @@ private[spark] class HttpFileServer(
       throw new IllegalArgumentException(s"$file cannot be a directory.")
     }
     Files.copy(file, new File(dir, file.getName))
-    dir + "/" + file.getName
+    dir + "/" + Utils.encodeFileNameToURIRawPath(file.getName)
   }
 
 }
diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/NettyStreamManager.scala b/core/src/main/scala/org/apache/spark/rpc/netty/NettyStreamManager.scala
index a2768b4252dc..534348288ae8 100644
--- a/core/src/main/scala/org/apache/spark/rpc/netty/NettyStreamManager.scala
+++ b/core/src/main/scala/org/apache/spark/rpc/netty/NettyStreamManager.scala
@@ -22,6 +22,7 @@ import java.util.concurrent.ConcurrentHashMap
 import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer}
 import org.apache.spark.network.server.StreamManager
 import org.apache.spark.rpc.RpcEnvFileServer
+import org.apache.spark.util.Utils
 
 /**
  * StreamManager implementation for serving files from a NettyRpcEnv.
@@ -51,13 +52,13 @@ private[netty] class NettyStreamManager(rpcEnv: NettyRpcEnv)
   override def addFile(file: File): String = {
     require(files.putIfAbsent(file.getName(), file) == null,
       s"File ${file.getName()} already registered.")
-    s"${rpcEnv.address.toSparkURL}/files/${file.getName()}"
+    s"${rpcEnv.address.toSparkURL}/files/${Utils.encodeFileNameToURIRawPath(file.getName())}"
   }
 
   override def addJar(file: File): String = {
     require(jars.putIfAbsent(file.getName(), file) == null,
       s"JAR ${file.getName()} already registered.")
-    s"${rpcEnv.address.toSparkURL}/jars/${file.getName()}"
+    s"${rpcEnv.address.toSparkURL}/jars/${Utils.encodeFileNameToURIRawPath(file.getName())}"
   }
 
 }
diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala
index 9dbe66e7eefb..f01de80d4c48 100644
--- a/core/src/main/scala/org/apache/spark/util/Utils.scala
+++ b/core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -331,6 +331,30 @@ private[spark] object Utils extends Logging {
   }
 
   /**
+   * A file name may contain some invalid url characters, such as " ". This method will convert the
+   * file name to a raw path accepted by `java.net.URI(String)`.
+   *
+   * Note: the file name must not contain "/" or "\"
+   */
+  def encodeFileNameToURIRawPath(fileName: String): String = {
+    require(!fileName.contains("/") && !fileName.contains("\\"))
+    // `file` and `localhost` are not used. Just to prevent URI from parsing `fileName` as
+    // scheme or host. The prefix "/" is required because URI doesn't accept a relative path.
+    // We should remove it after we get the raw path.
+    new URI("file", null, "localhost", -1, "/" + fileName, null, null).getRawPath.substring(1)
+  }
+
+  /**
+   * Get the file name from uri's raw path and decode it. The raw path of uri must not end with "/".
+   */
+  def decodeFileNameInURI(uri: URI): String = {
+    val rawPath = uri.getRawPath
+    assert(!rawPath.endsWith("/"))
+    val rawFileName = rawPath.split("/").last
+    new URI("file:///" + rawFileName).getPath.substring(1)
+  }
+
+    /**
    * Download a file or directory to target directory. Supports fetching the file in a variety of
    * ways, including HTTP, Hadoop-compatible filesystems, and files on a standard filesystem, based
    * on the URL parameter. Fetching directories is only supported from Hadoop-compatible
@@ -351,7 +375,7 @@ private[spark] object Utils extends Logging {
       hadoopConf: Configuration,
       timestamp: Long,
       useCache: Boolean) {
-    val fileName = url.split("/").last
+    val fileName = decodeFileNameInURI(new URI(url))
     val targetFile = new File(targetDir, fileName)
     val fetchCacheEnabled = conf.getBoolean("spark.files.useFetchCache", defaultValue = true)
     if (useCache && fetchCacheEnabled) {
diff --git a/core/src/test/scala/org/apache/spark/rpc/RpcEnvSuite.scala b/core/src/test/scala/org/apache/spark/rpc/RpcEnvSuite.scala
index 6cc958a5f6bc..4e4911345bed 100644
--- a/core/src/test/scala/org/apache/spark/rpc/RpcEnvSuite.scala
+++ b/core/src/test/scala/org/apache/spark/rpc/RpcEnvSuite.scala
@@ -729,12 +729,15 @@ abstract class RpcEnvSuite extends SparkFunSuite with BeforeAndAfterAll {
     val tempDir = Utils.createTempDir()
     val file = new File(tempDir, "file")
     Files.write(UUID.randomUUID().toString(), file, UTF_8)
+    val fileWithSpecialChars = new File(tempDir, "file name")
+    Files.write(UUID.randomUUID().toString(), fileWithSpecialChars, UTF_8)
     val empty = new File(tempDir, "empty")
     Files.write("", empty, UTF_8);
     val jar = new File(tempDir, "jar")
     Files.write(UUID.randomUUID().toString(), jar, UTF_8)
 
     val fileUri = env.fileServer.addFile(file)
+    val fileWithSpecialCharsUri = env.fileServer.addFile(fileWithSpecialChars)
     val emptyUri = env.fileServer.addFile(empty)
     val jarUri = env.fileServer.addJar(jar)
 
@@ -744,6 +747,7 @@ abstract class RpcEnvSuite extends SparkFunSuite with BeforeAndAfterAll {
 
     val files = Seq(
       (file, fileUri),
+      (fileWithSpecialChars, fileWithSpecialCharsUri),
       (empty, emptyUri),
       (jar, jarUri))
     files.foreach { case (f, uri) =>
diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
index 68b0da76bc13..fdb51d440eff 100644
--- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
@@ -734,4 +734,15 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging {
       conf.set("spark.executor.instances", "0")) === true)
   }
 
+  test("encodeFileNameToURIRawPath") {
+    assert(Utils.encodeFileNameToURIRawPath("abc") === "abc")
+    assert(Utils.encodeFileNameToURIRawPath("abc xyz") === "abc%20xyz")
+    assert(Utils.encodeFileNameToURIRawPath("abc:xyz") === "abc:xyz")
+  }
+
+  test("decodeFileNameInURI") {
+    assert(Utils.decodeFileNameInURI(new URI("files:///abc/xyz")) === "xyz")
+    assert(Utils.decodeFileNameInURI(new URI("files:///abc")) === "abc")
+    assert(Utils.decodeFileNameInURI(new URI("files:///abc%20xyz")) === "abc xyz")
+  }
 }

From 2c3164386040b5051e0332652cff9d2052b90cdb Mon Sep 17 00:00:00 2001
From: Shixiong Zhu <shixiong@databricks.com>
Date: Wed, 9 Dec 2015 10:08:39 -0800
Subject: [PATCH 2/2] Remove assert to support fetching directory for
 Hadoop-compatible filesystem

---
 core/src/main/scala/org/apache/spark/util/Utils.scala | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala
index f01de80d4c48..fce89dfccfe2 100644
--- a/core/src/main/scala/org/apache/spark/util/Utils.scala
+++ b/core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -331,7 +331,7 @@ private[spark] object Utils extends Logging {
   }
 
   /**
-   * A file name may contain some invalid url characters, such as " ". This method will convert the
+   * A file name may contain some invalid URI characters, such as " ". This method will convert the
    * file name to a raw path accepted by `java.net.URI(String)`.
    *
    * Note: the file name must not contain "/" or "\"
@@ -345,11 +345,11 @@ private[spark] object Utils extends Logging {
   }
 
   /**
-   * Get the file name from uri's raw path and decode it. The raw path of uri must not end with "/".
+   * Get the file name from uri's raw path and decode it. If the raw path of uri ends with "/",
+   * return the name before the last "/".
    */
   def decodeFileNameInURI(uri: URI): String = {
     val rawPath = uri.getRawPath
-    assert(!rawPath.endsWith("/"))
     val rawFileName = rawPath.split("/").last
     new URI("file:///" + rawFileName).getPath.substring(1)
   }