# context.R: SparkContext driven functions
# Resolve the number of splits to request for an RDD.
#
# When the caller supplies minSplits, it is used as-is; when NULL, fall
# back to min(defaultParallelism, 2), mirroring Spark's defaultMinSplits.
#
# @param sc Java reference to the JavaSparkContext
# @param minSplits Requested minimum split count, or NULL for the default
# @return The split count coerced to an integer
getMinSplits <- function(sc, minSplits) {
  if (!is.null(minSplits)) {
    return(as.integer(minSplits))
  }
  # Reach through the Java wrapper to the Scala SparkContext for its
  # configured default parallelism.
  scalaContext <- .jcall(sc, "Lorg/apache/spark/SparkContext;", "sc")
  parallelism <- .jcall(scalaContext, "I", "defaultParallelism")
  as.integer(min(parallelism, 2))
}
11+
#' Create an RDD from a text file.
#'
#' This function reads a text file from HDFS, a local file system (available on all
#' nodes), or any Hadoop-supported file system URI, and creates an
#' RDD of strings from it.
#'
#' @param sc SparkContext to use
#' @param path Path of file to read. A vector of multiple paths is allowed.
#' @param minSplits Minimum number of splits to be created. If NULL, the default
#'  value is chosen based on available parallelism.
#' @return RDD where each item is of type \code{character}
#' @examples
#'\dontrun{
#' sc <- sparkR.init()
#' lines <- textFile(sc, "myfile.txt")
#'}
20-
21- getMinSplits <- function (sc , minSplits ) {
22- if (is.null(minSplits )) {
23- ssc <- .jcall(sc , " Lorg/apache/spark/SparkContext;" , " sc" )
24- defaultParallelism <- .jcall(ssc , " I" , " defaultParallelism" )
25- minSplits <- min(defaultParallelism , 2 )
26- }
27- as.integer(minSplits )
28- }
29-
# Build a string RDD by delegating to JavaSparkContext.textFile.
textFile <- function(sc, path, minSplits = NULL) {
  # The Java side accepts one comma-separated path string, so collapse a
  # vector of paths into that form.
  combinedPath <- paste(path, collapse = ",")
  javaRdd <- .jcall(sc, "Lorg/apache/spark/api/java/JavaRDD;", "textFile",
                    combinedPath, getMinSplits(sc, minSplits))
  # FALSE: the elements are plain text lines, not serialized R objects.
  RDD(javaRdd, FALSE)
}
#' saveAsObjectFile() of the RDD class.
#'
#' @param sc SparkContext to use
#' @param path Path of file to read. A vector of multiple paths is allowed.
#' @param minSplits Minimum number of splits to be created. If NULL, the default
#'  value is chosen based on available parallelism.
#' @return RDD containing serialized R objects.
#' @examples
#'\dontrun{
#' sc <- sparkR.init()
#' rdd <- objectFile(sc, "myfile")
#'}
53-
5455objectFile <- function (sc , path , minSplits = NULL ) {
56+ # ' Convert a string vector of paths to a string containing comma separated paths
57+ path <- paste(path , collapse = " ," )
58+
5559 jrdd <- .jcall(sc , " Lorg/apache/spark/api/java/JavaRDD;" , " objectFile" , path ,
5660 getMinSplits(sc , minSplits ))
5761 # Assume the RDD contains serialized R objects.
0 commit comments