Commit cb6873e
Merge pull request apache#126 from sun-rui/SPARKR-147
[SPARKR-147] Support multiple directories as input to textFile.
2 parents: 4d4fc30 + f04c6e0
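The mechanics are simple: both textFile() and objectFile() now collapse a character vector of paths into one comma-separated string before handing it to the JVM, relying on the underlying Spark/Hadoop input APIs accepting comma-separated path lists. A minimal sketch of that collapsing step in plain R (the paths here are hypothetical):

  paths <- c("hdfs:///logs/day1", "hdfs:///logs/day2")
  paste(paths, collapse = ",")
  # [1] "hdfs:///logs/day1,hdfs:///logs/day2"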

File tree: 5 files changed (+49, −15 lines)

pkg/R/context.R (17 additions, 13 deletions)

@@ -1,13 +1,22 @@
 # context.R: SparkContext driven functions
 
+getMinSplits <- function(sc, minSplits) {
+  if (is.null(minSplits)) {
+    ssc <- .jcall(sc, "Lorg/apache/spark/SparkContext;", "sc")
+    defaultParallelism <- .jcall(ssc, "I", "defaultParallelism")
+    minSplits <- min(defaultParallelism, 2)
+  }
+  as.integer(minSplits)
+}
+
 #' Create an RDD from a text file.
 #'
 #' This function reads a text file from HDFS, a local file system (available on all
 #' nodes), or any Hadoop-supported file system URI, and creates an
 #' RDD of strings from it.
 #'
 #' @param sc SparkContext to use
-#' @param path Path of file to read
+#' @param path Path of file to read. A vector of multiple paths is allowed.
 #' @param minSplits Minimum number of splits to be created. If NULL, the default
 #'    value is chosen based on available parallelism.
 #' @return RDD where each item is of type \code{character}
@@ -17,17 +26,10 @@
 #' sc <- sparkR.init()
 #' lines <- textFile(sc, "myfile.txt")
 #'}
-
-getMinSplits <- function(sc, minSplits) {
-  if (is.null(minSplits)) {
-    ssc <- .jcall(sc, "Lorg/apache/spark/SparkContext;", "sc")
-    defaultParallelism <- .jcall(ssc, "I", "defaultParallelism")
-    minSplits <- min(defaultParallelism, 2)
-  }
-  as.integer(minSplits)
-}
-
 textFile <- function(sc, path, minSplits = NULL) {
+  #' Convert a string vector of paths to a string containing comma separated paths
+  path <- paste(path, collapse=",")
+
   jrdd <- .jcall(sc, "Lorg/apache/spark/api/java/JavaRDD;", "textFile", path,
                  getMinSplits(sc, minSplits))
   RDD(jrdd, FALSE)
@@ -39,7 +41,7 @@ textFile <- function(sc, path, minSplits = NULL) {
 #' saveAsObjectFile() of the RDD class.
 #'
 #' @param sc SparkContext to use
-#' @param path Path of file to read
+#' @param path Path of file to read. A vector of multiple paths is allowed.
 #' @param minSplits Minimum number of splits to be created. If NULL, the default
 #'    value is chosen based on available parallelism.
 #' @return RDD containing serialized R objects.
@@ -50,8 +52,10 @@ textFile <- function(sc, path, minSplits = NULL) {
 #' sc <- sparkR.init()
 #' rdd <- objectFile(sc, "myfile")
 #'}
-
 objectFile <- function(sc, path, minSplits = NULL) {
+  #' Convert a string vector of paths to a string containing comma separated paths
+  path <- paste(path, collapse=",")
+
   jrdd <- .jcall(sc, "Lorg/apache/spark/api/java/JavaRDD;", "objectFile", path,
                  getMinSplits(sc, minSplits))
   # Assume the RDD contains serialized R objects.
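With the patch applied, callers can pass either a single path or a character vector of paths to both functions. A usage sketch, assuming an initialized SparkR context and hypothetical directory names:

  sc <- sparkR.init()
  # One RDD of lines spanning two input directories
  lines <- textFile(sc, c("/data/logs-a", "/data/logs-b"))
  # The same vector form works for RDDs of serialized R objects
  objs <- objectFile(sc, c("/data/objs-a", "/data/objs-b"), minSplits = 4)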

pkg/inst/tests/test_binaryFile.R (16 additions, 0 deletions)

@@ -55,3 +55,19 @@ test_that("saveAsObjectFile()/objectFile() following RDD transformations works",
   unlink(fileName2, recursive = TRUE)
 })
 
+test_that("saveAsObjectFile()/objectFile() works with multiple paths", {
+  fileName1 <- tempfile(pattern="spark-test", fileext=".tmp")
+  fileName2 <- tempfile(pattern="spark-test", fileext=".tmp")
+
+  rdd1 <- parallelize(sc, "Spark is pretty.")
+  saveAsObjectFile(rdd1, fileName1)
+  rdd2 <- parallelize(sc, "Spark is awesome.")
+  saveAsObjectFile(rdd2, fileName2)
+
+  rdd <- objectFile(sc, c(fileName1, fileName2))
+  expect_true(count(rdd) == 2)
+
+  unlink(fileName1, recursive = TRUE)
+  unlink(fileName2, recursive = TRUE)
+})
+

pkg/inst/tests/test_textFile.R (14 additions, 0 deletions)

@@ -108,3 +108,17 @@ test_that("textFile() and saveAsTextFile() word count works as expected", {
   unlink(fileName1)
   unlink(fileName2)
 })
+
+test_that("textFile() on multiple paths", {
+  fileName1 <- tempfile(pattern="spark-test", fileext=".tmp")
+  fileName2 <- tempfile(pattern="spark-test", fileext=".tmp")
+  writeLines("Spark is pretty.", fileName1)
+  writeLines("Spark is awesome.", fileName2)
+
+  rdd <- textFile(sc, c(fileName1, fileName2))
+  expect_true(count(rdd) == 2)
+
+  unlink(fileName1)
+  unlink(fileName2)
+})
+

pkg/man/objectFile.Rd (1 addition, 1 deletion)

@@ -8,7 +8,7 @@ objectFile(sc, path, minSplits = NULL)
 \arguments{
   \item{sc}{SparkContext to use}
 
-  \item{path}{Path of file to read}
+  \item{path}{Path of file to read. A vector of multiple paths is allowed.}
 
   \item{minSplits}{Minimum number of splits to be created. If NULL, the default
     value is chosen based on available parallelism.}

pkg/man/textFile.Rd (1 addition, 1 deletion)

@@ -7,7 +7,7 @@ textFile(sc, path, minSplits = NULL)
 \arguments{
   \item{sc}{SparkContext to use}
 
-  \item{path}{Path of file to read}
+  \item{path}{Path of file to read. A vector of multiple paths is allowed.}
 
   \item{minSplits}{Minimum number of splits to be created.
     If NULL, the default value is chosen based on available
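Both help pages defer the minSplits default to getMinSplits() in context.R: when minSplits is NULL, it reads defaultParallelism from the JVM SparkContext and uses min(defaultParallelism, 2). A standalone sketch of that rule, with the rJava call replaced by an assumed defaultParallelism value:

  # Mirrors getMinSplits() without a live SparkContext; defaultParallelism is assumed
  getMinSplitsLocal <- function(minSplits, defaultParallelism = 8) {
    if (is.null(minSplits)) {
      minSplits <- min(defaultParallelism, 2)
    }
    as.integer(minSplits)
  }
  getMinSplitsLocal(NULL)  # 2
  getMinSplitsLocal(16)    # 16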
