Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions core/src/main/scala/org/apache/spark/HttpFileServer.scala
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,12 @@ private[spark] class HttpFileServer(

def addFile(file: File) : String = {
addFileToDir(file, fileDir)
serverUri + "/files/" + file.getName
serverUri + "/files/" + Utils.encodeFileNameToURIRawPath(file.getName)
}

def addJar(file: File) : String = {
addFileToDir(file, jarDir)
serverUri + "/jars/" + file.getName
serverUri + "/jars/" + Utils.encodeFileNameToURIRawPath(file.getName)
}

def addFileToDir(file: File, dir: File) : String = {
Expand All @@ -80,7 +80,7 @@ private[spark] class HttpFileServer(
throw new IllegalArgumentException(s"$file cannot be a directory.")
}
Files.copy(file, new File(dir, file.getName))
dir + "/" + file.getName
dir + "/" + Utils.encodeFileNameToURIRawPath(file.getName)
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import java.util.concurrent.ConcurrentHashMap
import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer}
import org.apache.spark.network.server.StreamManager
import org.apache.spark.rpc.RpcEnvFileServer
import org.apache.spark.util.Utils

/**
* StreamManager implementation for serving files from a NettyRpcEnv.
Expand Down Expand Up @@ -51,13 +52,13 @@ private[netty] class NettyStreamManager(rpcEnv: NettyRpcEnv)
override def addFile(file: File): String = {
require(files.putIfAbsent(file.getName(), file) == null,
s"File ${file.getName()} already registered.")
s"${rpcEnv.address.toSparkURL}/files/${file.getName()}"
s"${rpcEnv.address.toSparkURL}/files/${Utils.encodeFileNameToURIRawPath(file.getName())}"
}

override def addJar(file: File): String = {
require(jars.putIfAbsent(file.getName(), file) == null,
s"JAR ${file.getName()} already registered.")
s"${rpcEnv.address.toSparkURL}/jars/${file.getName()}"
s"${rpcEnv.address.toSparkURL}/jars/${Utils.encodeFileNameToURIRawPath(file.getName())}"
}

}
26 changes: 25 additions & 1 deletion core/src/main/scala/org/apache/spark/util/Utils.scala
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,30 @@ private[spark] object Utils extends Logging {
}

/**
* A file name may contain some invalid URI characters, such as " ". This method will convert the
* file name to a raw path accepted by `java.net.URI(String)`.
*
* Note: the file name must not contain "/" or "\"
*/
def encodeFileNameToURIRawPath(fileName: String): String = {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think Utils.resolveURI already covers this functionality.
(It's "URI" and "URL" not "uri" and "url".)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Utils.resolveURI cannot handle some file names, such as abc:xyz.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think that file name is a problem in a URL though. : is not reserved in that part. You'd have to send this to java.net.URI "manually" like you've done, or else use file:/abc:xyz. Then it's doing the same thing; your method wouldn't escape the colon either.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't need to encode colon. The purpose here is: assume we have a uri prefix: spark://localhost:1234/ and a file name: abc xyz, and want to concat the prefix and the file name to spark://localhost:1234/abc%20xyz so that we can pass it to java.net.URI(String).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's right, but the existing method already does just that.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I pass abc:xyz to URI.resolveURI, it doesn't work:

scala> Utils.resolveURI("abc:xyz").getRawPath
res0: String = null

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it must be file:/abc:xyz since you need to tell it it needs to be treated as a file name. Meh, at this point why not just new URI("file:/" + fileName).getRawPath.substring(1) and keep it simple?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Meh, at this point why not just new URI("file:/" + fileName).getRawPath.substring(1) and keep it simple?

The parameter of URI(String) must be an encoded path. But here fileName is what we want to encode.

require(!fileName.contains("/") && !fileName.contains("\\"))
// `file` and `localhost` are not used. Just to prevent URI from parsing `fileName` as
// scheme or host. The prefix "/" is required because URI doesn't accept a relative path.
// We should remove it after we get the raw path.
new URI("file", null, "localhost", -1, "/" + fileName, null, null).getRawPath.substring(1)
}

/**
* Get the file name from uri's raw path and decode it. If the raw path of uri ends with "/",
* return the name before the last "/".
*/
def decodeFileNameInURI(uri: URI): String = {
val rawPath = uri.getRawPath
val rawFileName = rawPath.split("/").last
new URI("file:///" + rawFileName).getPath.substring(1)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't get this function -- the URI would already have methods to return the 'non-raw' path right?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here I want to split the raw path before decoding it, so that it can handle uris that contain %2F(/). I think some http links may contain %2F.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see -- URI handles all that just fine as far as I can tell but there's no way to tell that the final element in a path is a string containing a slash, instead of two path elements separated by a slash, since the class has no way to access path elements.

I think you don't need to go to a URI, back to string, back to URI here. This may not even need a method for the one-liner: new URI("file:///" + rawFileName).getPath.substring(1) used one place.

As an aside, some app servers will be picky about serving URIs with %2F in some places, since this has been used in the past for some security exploits, to disguise a cheeky request for some local file URL that would otherwise be caught by (faulty) logic in the app that's not thinking about escaped sequences and trying to handle raw paths. I think Tomcat won't for example. It may be overkill but might even be worth considering conservatively rejecting such a URI anyway.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I created a method here so that I can write unit tests for this one to confirm the desired behavior. For the security issue of %2F, I think the server should take care of it, since the attacker can also use such special URI without Spark.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, but isn't that an argument that you don't need this method (or at least you don't need more than new URI("file:/" + rawFileName).getPath.substring(1))?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, but isn't that an argument that you don't need this method (or at least you don't need more than new URI("file:/" + rawFileName).getPath.substring(1))?

the uri here may contains more than a file name, such as file:/abc/xyz. But the method should return only xyz. So new URI("file:/" + rawFileName).getPath.substring(1)) doesn't work.

}

/**
* Download a file or directory to target directory. Supports fetching the file in a variety of
* ways, including HTTP, Hadoop-compatible filesystems, and files on a standard filesystem, based
* on the URL parameter. Fetching directories is only supported from Hadoop-compatible
Expand All @@ -351,7 +375,7 @@ private[spark] object Utils extends Logging {
hadoopConf: Configuration,
timestamp: Long,
useCache: Boolean) {
val fileName = url.split("/").last
val fileName = decodeFileNameInURI(new URI(url))
val targetFile = new File(targetDir, fileName)
val fetchCacheEnabled = conf.getBoolean("spark.files.useFetchCache", defaultValue = true)
if (useCache && fetchCacheEnabled) {
Expand Down
4 changes: 4 additions & 0 deletions core/src/test/scala/org/apache/spark/rpc/RpcEnvSuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -729,12 +729,15 @@ abstract class RpcEnvSuite extends SparkFunSuite with BeforeAndAfterAll {
val tempDir = Utils.createTempDir()
val file = new File(tempDir, "file")
Files.write(UUID.randomUUID().toString(), file, UTF_8)
val fileWithSpecialChars = new File(tempDir, "file name")
Files.write(UUID.randomUUID().toString(), fileWithSpecialChars, UTF_8)
val empty = new File(tempDir, "empty")
Files.write("", empty, UTF_8);
val jar = new File(tempDir, "jar")
Files.write(UUID.randomUUID().toString(), jar, UTF_8)

val fileUri = env.fileServer.addFile(file)
val fileWithSpecialCharsUri = env.fileServer.addFile(fileWithSpecialChars)
val emptyUri = env.fileServer.addFile(empty)
val jarUri = env.fileServer.addJar(jar)

Expand All @@ -744,6 +747,7 @@ abstract class RpcEnvSuite extends SparkFunSuite with BeforeAndAfterAll {

val files = Seq(
(file, fileUri),
(fileWithSpecialChars, fileWithSpecialCharsUri),
(empty, emptyUri),
(jar, jarUri))
files.foreach { case (f, uri) =>
Expand Down
11 changes: 11 additions & 0 deletions core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -734,4 +734,15 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging {
conf.set("spark.executor.instances", "0")) === true)
}

test("encodeFileNameToURIRawPath") {
assert(Utils.encodeFileNameToURIRawPath("abc") === "abc")
assert(Utils.encodeFileNameToURIRawPath("abc xyz") === "abc%20xyz")
assert(Utils.encodeFileNameToURIRawPath("abc:xyz") === "abc:xyz")
}

test("decodeFileNameInURI") {
assert(Utils.decodeFileNameInURI(new URI("files:///abc/xyz")) === "xyz")
assert(Utils.decodeFileNameInURI(new URI("files:///abc")) === "abc")
assert(Utils.decodeFileNameInURI(new URI("files:///abc%20xyz")) === "abc xyz")
}
}