SparkR: support adding files to a Spark job and retrieving them on executors.

yanboliang · yanboliang · commit 5c49428738d8 · 2016-09-17T08:53:54.000-07:00
diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R
@@ -225,6 +225,37 @@ setCheckpointDir <- function(sc, dirName) {
   invisible(callJMethod(sc, "setCheckpointDir", suppressWarnings(normalizePath(dirName))))
 }
 
+#' Add a file or directory to be downloaded with this Spark job on every node.
+#'
+#' The path passed can be either a local file, a file in HDFS (or other Hadoop-supported
+#' filesystems), or an HTTP, HTTPS or FTP URI. To access the file in Spark jobs,
+#' use sparkFiles.get(fileName) to find its download location.
+#'
+#' A directory can be given if the recursive option is set to TRUE.
+#' Currently directories are only supported for Hadoop-supported filesystems.
+#'
+#' @param sc The Spark context the file is added to
+#' @param path The path of the file or directory to be added
+#' @param recursive Whether to add the path recursively when it is a directory.
+#'                  Must be a single TRUE or FALSE. Default is FALSE.
+#' @noRd
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' addFile(sc, "myfile")
+#' addFile(sc, "hdfs:///path/to/mydir", recursive = TRUE)
+#'}
+addFile <- function(sc, path, recursive = FALSE) {
+  stopifnot(is.logical(recursive), length(recursive) == 1L, !is.na(recursive))
+  # normalizePath warns for paths that do not exist locally (e.g. remote URIs); the
+  # warning is suppressed so such paths are still handed on to the JVM.
+  normalizedPath <- suppressWarnings(normalizePath(path))
+  if (recursive) {
+    # NOTE(review): relies on the context object exposing an addFile(path, recursive)
+    # overload on the JVM side -- confirm for the context type passed as sc.
+    invisible(callJMethod(sc, "addFile", normalizedPath, TRUE))
+  } else {
+    # Default case: same single-argument JVM call as before the recursive option existed.
+    invisible(callJMethod(sc, "addFile", normalizedPath))
+  }
+}
+
+#' Get the root directory that contains files added through addFile.
+#'
+#' Takes no arguments and returns the result of the static JVM call
+#' org.apache.spark.SparkFiles.getRootDirectory unchanged.
+#'
+#' @return The value returned by SparkFiles.getRootDirectory (presumably a directory path
+#'         as a character string -- confirm against the JVM API).
+#' @examples
+#'\dontrun{
+#' rootDir <- sparkFiles.getRootDirectory()
+#'}
+sparkFiles.getRootDirectory <- function() {
+  callJStatic("org.apache.spark.SparkFiles", "getRootDirectory")
+}
+
+#' Get the absolute path of a file added through addFile.
+#'
+#' @param fileName The name of the file added through addFile; it is coerced with
+#'                 as.character before being handed to the JVM.
+#' @return The value returned by the static JVM call org.apache.spark.SparkFiles.get
+#'         (presumably the path as a character string -- confirm against the JVM API).
+#' @examples
+#'\dontrun{
+#' addFile(sc, "myfile")
+#' localPath <- sparkFiles.get("myfile")
+#'}
+sparkFiles.get <- function(fileName) {
+  callJStatic("org.apache.spark.SparkFiles", "get", as.character(fileName))
+}
+
 #' Run a function over a list of elements, distributing the computations with Spark
 #'
 #' Run a function over a list of elements, distributing the computations with Spark. Applies a
diff --git a/R/pkg/inst/tests/testthat/test_rdd.R b/R/pkg/inst/tests/testthat/test_rdd.R
@@ -801,4 +801,15 @@ test_that("Test correct concurrency of RRDD.compute()", {
   expect_equal(count, 1000)
 })
 
+test_that("add and get file to be downloaded with Spark job on every node", {
+  # Write a one-line file to a temporary location and register it with the job.
+  expectedLines <- "Hello World!"
+  localFile <- tempfile(pattern = "hello", fileext = ".txt")
+  writeLines(expectedLines, localFile)
+  addFile(sc, localFile)
+
+  # The added file is looked up by its base name rather than by the original path.
+  fetchedFile <- sparkFiles.get(basename(localFile))
+  expect_equal(readLines(fetchedFile), expectedLines)
+
+  unlink(localFile)
+})
+
 sparkR.session.stop()