Skip to content

Commit ae78312

Browse files
author
Davies Liu
committed
Merge pull request #237 from sun-rui/SPARKR-154_3
[SPARKR-154] Phase 2: implement cartesian().
1 parent 1bdcb63 commit ae78312

File tree

6 files changed

+197
-62
lines changed

6 files changed

+197
-62
lines changed

R/pkg/NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ exportMethods(
55
"aggregateByKey",
66
"aggregateRDD",
77
"cache",
8+
"cartesian",
89
"checkpoint",
910
"coalesce",
1011
"cogroup",

R/pkg/R/RDD.R

Lines changed: 36 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -1496,69 +1496,43 @@ setMethod("zipRDD",
14961496
stop("Can only zip RDDs which have the same number of partitions.")
14971497
}
14981498

1499-
if (getSerializedMode(x) != getSerializedMode(other) ||
1500-
getSerializedMode(x) == "byte") {
1501-
# Append the number of elements in each partition to that partition so that we can later
1502-
# check if corresponding partitions of both RDDs have the same number of elements.
1503-
#
1504-
# Note that this appending also serves the purpose of reserialization, because even if
1505-
# any RDD is serialized, we need to reserialize it to make sure its partitions are encoded
1506-
# as a single byte array. For example, partitions of an RDD generated from partitionBy()
1507-
# may be encoded as multiple byte arrays.
1508-
appendLength <- function(part) {
1509-
part[[length(part) + 1]] <- length(part) + 1
1510-
part
1511-
}
1512-
x <- lapplyPartition(x, appendLength)
1513-
other <- lapplyPartition(other, appendLength)
1514-
}
1515-
1516-
zippedJRDD <- callJMethod(getJRDD(x), "zip", getJRDD(other))
1517-
# The zippedRDD's elements are of scala Tuple2 type. The serialized
1518-
# flag Here is used for the elements inside the tuples.
1519-
serializerMode <- getSerializedMode(x)
1520-
zippedRDD <- RDD(zippedJRDD, serializerMode)
1521-
1522-
partitionFunc <- function(split, part) {
1523-
len <- length(part)
1524-
if (len > 0) {
1525-
if (serializerMode == "byte") {
1526-
lengthOfValues <- part[[len]]
1527-
lengthOfKeys <- part[[len - lengthOfValues]]
1528-
stopifnot(len == lengthOfKeys + lengthOfValues)
1529-
1530-
# check if corresponding partitions of both RDDs have the same number of elements.
1531-
if (lengthOfKeys != lengthOfValues) {
1532-
stop("Can only zip RDDs with same number of elements in each pair of corresponding partitions.")
1533-
}
1534-
1535-
if (lengthOfKeys > 1) {
1536-
keys <- part[1 : (lengthOfKeys - 1)]
1537-
values <- part[(lengthOfKeys + 1) : (len - 1)]
1538-
} else {
1539-
keys <- list()
1540-
values <- list()
1541-
}
1542-
} else {
1543-
# Keys, values must have same length here, because this has
1544-
# been validated inside the JavaRDD.zip() function.
1545-
keys <- part[c(TRUE, FALSE)]
1546-
values <- part[c(FALSE, TRUE)]
1547-
}
1548-
mapply(
1549-
function(k, v) {
1550-
list(k, v)
1551-
},
1552-
keys,
1553-
values,
1554-
SIMPLIFY = FALSE,
1555-
USE.NAMES = FALSE)
1556-
} else {
1557-
part
1558-
}
1559-
}
1499+
rdds <- appendPartitionLengths(x, other)
1500+
jrdd <- callJMethod(getJRDD(rdds[[1]]), "zip", getJRDD(rdds[[2]]))
1501+
# The jrdd's elements are of scala Tuple2 type. The serialized
1502+
# flag here is used for the elements inside the tuples.
1503+
rdd <- RDD(jrdd, getSerializedMode(rdds[[1]]))
1504+
1505+
mergePartitions(rdd, TRUE)
1506+
})
15601507

1561-
PipelinedRDD(zippedRDD, partitionFunc)
1508+
#' Cartesian product of this RDD and another one.
#'
#' Builds the RDD holding every pair of elements (a, b) such that
#' a comes from this RDD and b comes from the other RDD.
#'
#' @param x An RDD; supplies the first element of each pair.
#' @param other An RDD; supplies the second element of each pair.
#' @return A new RDD which is the Cartesian product of these two RDDs.
#' @examples
#'\dontrun{
#' sc <- sparkR.init()
#' rdd <- parallelize(sc, 1:2)
#' sortByKey(cartesian(rdd, rdd))
#' # list(list(1, 1), list(1, 2), list(2, 1), list(2, 2))
#'}
#' @rdname cartesian
#' @aliases cartesian,RDD,RDD-method
setMethod("cartesian",
          signature(x = "RDD", other = "RDD"),
          function(x, other) {
            # Both inputs go through appendPartitionLengths() first; it returns
            # the (possibly rewritten) RDDs as a two-element list.
            prepared <- appendPartitionLengths(x, other)
            left <- prepared[[1]]
            right <- prepared[[2]]

            pairedJRDD <- callJMethod(getJRDD(left), "cartesian", getJRDD(right))

            # The elements of pairedJRDD are of scala Tuple2 type. The
            # serialized mode passed to RDD() describes the elements held
            # inside those tuples.
            paired <- RDD(pairedJRDD, getSerializedMode(left))

            # FALSE: this is not a zip, so partition sizes are not compared.
            mergePartitions(paired, FALSE)
          })
15631537

15641538
#' Subtract an RDD with another RDD.

R/pkg/R/generics.R

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,10 @@ setGeneric("countByKey", function(x) { standardGeneric("countByKey") })
237237
#' @export
238238
setGeneric("flatMapValues", function(X, FUN) { standardGeneric("flatMapValues") })
239239

240+
# S4 generic for cartesian(); dispatches on both `x` and `other`.
#' @rdname cartesian
#' @export
setGeneric(
  "cartesian",
  function(x, other) {
    standardGeneric("cartesian")
  }
)
243+
240244
#' @rdname intersection
241245
#' @export
242246
setGeneric("intersection", function(x, other, numPartitions = 1L) {

R/pkg/R/utils.R

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -465,3 +465,83 @@ cleanClosure <- function(func, checkedFuncs = new.env()) {
465465
}
466466
func
467467
}
468+
469+
# Append the partition length to every partition of two input RDDs when needed.
# param
#   x An RDD.
#   other An RDD.
# return value
#   A list of two RDDs, in the order (x, other). The inputs are returned
#   unchanged when both share the same serialized mode and that mode is
#   not "byte".
appendPartitionLengths <- function(x, other) {
  modeOfX <- getSerializedMode(x)
  modeOfOther <- getSerializedMode(other)

  if (modeOfX == "byte" || modeOfX != modeOfOther) {
    # Each partition gets one extra trailing element holding its new length
    # (original element count + 1), so that the boundary between the elements
    # of x and the elements of other can be located later.
    #
    # Note that this appending also serves the purpose of reserialization,
    # because even if an RDD is already serialized, it has to be reserialized
    # to make sure its partitions are encoded as a single byte array. For
    # example, partitions of an RDD generated from partitionBy() may be
    # encoded as multiple byte arrays.
    withLength <- function(part) {
      n <- length(part)
      part[[n + 1]] <- n + 1
      part
    }
    x <- lapplyPartition(x, withLength)
    other <- lapplyPartition(other, withLength)
  }

  list(x, other)
}
495+
496+
# Perform zip or cartesian between the elements coming from two RDDs inside
# each partition.
# param
#   rdd An RDD.
#   zip A boolean flag: TRUE when the call is for a zip operation,
#       FALSE otherwise.
# return value
#   A result RDD.
mergePartitions <- function(rdd, zip) {
  serializerMode <- getSerializedMode(rdd)

  partitionFunc <- function(split, part) {
    len <- length(part)
    # An empty partition is passed through untouched.
    if (len == 0) {
      return(part)
    }

    if (serializerMode == "byte") {
      # Layout read here: the last element is the stored length of the second
      # ("values") segment, and the element found that many positions earlier
      # is the stored length of the first ("keys") segment. Each stored length
      # counts the length marker itself, hence the "> 1" tests below.
      lengthOfValues <- part[[len]]
      lengthOfKeys <- part[[len - lengthOfValues]]
      stopifnot(len == lengthOfKeys + lengthOfValues)

      # For zip operation, check if corresponding partitions of both RDDs
      # have the same number of elements.
      if (zip && lengthOfKeys != lengthOfValues) {
        stop("Can only zip RDDs with same number of elements in each pair of corresponding partitions.")
      }

      keys <- if (lengthOfKeys > 1) {
        part[seq_len(lengthOfKeys - 1)]
      } else {
        list()
      }
      values <- if (lengthOfValues > 1) {
        part[seq(lengthOfKeys + 1, len - 1)]
      } else {
        list()
      }

      # Anything that is not a zip is delegated to mergeCompactLists().
      if (!zip) {
        return(mergeCompactLists(keys, values))
      }
    } else {
      # Elements alternate: odd positions are keys, even positions are values.
      isKey <- c(TRUE, FALSE)
      keys <- part[isKey]
      values <- part[!isKey]
    }

    # Pair the i-th key with the i-th value.
    mapply(
      function(key, value) {
        list(key, value)
      },
      keys,
      values,
      SIMPLIFY = FALSE,
      USE.NAMES = FALSE)
  }

  PipelinedRDD(rdd, partitionFunc)
}

R/pkg/inst/tests/test_rdd.R

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -468,6 +468,49 @@ test_that("zipRDD() on RDDs", {
468468
unlink(fileName)
469469
})
470470

471+
test_that("cartesian() on RDDs", {
  # An RDD of numbers crossed with itself.
  numRdd <- parallelize(sc, 1:3)
  pairs <- collect(cartesian(numRdd, numRdd))
  expect_equal(
    sortKeyValueList(pairs),
    list(
      list(1, 1), list(1, 2), list(1, 3),
      list(2, 1), list(2, 2), list(2, 3),
      list(3, 1), list(3, 2), list(3, 3)))

  # test case where one RDD is empty
  noElemsRdd <- parallelize(sc, list())
  pairs <- collect(cartesian(numRdd, noElemsRdd))
  expect_equal(pairs, list())

  # An RDD read from a text file, crossed with itself.
  fileLines <- c("Spark is pretty.", "Spark is awesome.")
  fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
  writeLines(fileLines, fileName)

  textRdd <- textFile(sc, fileName)
  pairs <- collect(cartesian(textRdd, textRdd))
  expectedTextPairs <- list(
    list("Spark is awesome.", "Spark is pretty."),
    list("Spark is awesome.", "Spark is awesome."),
    list("Spark is pretty.", "Spark is pretty."),
    list("Spark is pretty.", "Spark is awesome."))
  expect_equal(sortKeyValueList(pairs), expectedTextPairs)

  # A parallelized RDD crossed with the text-file RDD.
  zeroOneRdd <- parallelize(sc, 0:1)
  pairs <- collect(cartesian(zeroOneRdd, textRdd))
  expect_equal(
    sortKeyValueList(pairs),
    list(
      list(0, "Spark is pretty."),
      list(0, "Spark is awesome."),
      list(1, "Spark is pretty."),
      list(1, "Spark is awesome.")))

  # The text-file RDD crossed with an identity map() of itself.
  mappedRdd <- map(textRdd, function(x) { x })
  pairs <- collect(cartesian(textRdd, mappedRdd))
  expect_equal(sortKeyValueList(pairs), expectedTextPairs)

  unlink(fileName)
})
513+
471514
test_that("subtract() on RDDs", {
472515
l <- list(1, 1, 2, 2, 3, 4)
473516
rdd1 <- parallelize(sc, l)

pkg/man/cartesian.Rd

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
% Generated by roxygen2 (4.0.2): do not edit by hand
2+
\docType{methods}
3+
\name{cartesian,RDD,RDD-method}
4+
\alias{cartesian}
5+
\alias{cartesian,RDD,RDD-method}
6+
\title{Cartesian product of this RDD and another one.}
7+
\usage{
8+
\S4method{cartesian}{RDD,RDD}(x, other)
9+
10+
cartesian(x, other)
11+
}
12+
\arguments{
13+
\item{x}{An RDD.}
14+
15+
\item{other}{An RDD.}
16+
}
17+
\value{
18+
A new RDD which is the Cartesian product of these two RDDs.
19+
}
20+
\description{
21+
Return the Cartesian product of this RDD and another one,
22+
that is, the RDD of all pairs of elements (a, b) where a
23+
is in this and b is in other.
24+
}
25+
\examples{
26+
\dontrun{
27+
sc <- sparkR.init()
28+
rdd <- parallelize(sc, 1:2)
29+
sortByKey(cartesian(rdd, rdd))
30+
# list(list(1, 1), list(1, 2), list(2, 1), list(2, 2))
31+
}
32+
}
33+

0 commit comments

Comments
 (0)