Skip to content

Commit 02f81db

Browse files
committed
Adding an option to include both number of partitions and the cols
1 parent 01beaba commit 02f81db

File tree

3 files changed

+28
-16
lines changed

3 files changed

+28
-16
lines changed

R/pkg/R/DataFrame.R

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -570,14 +570,16 @@ setMethod("unpersist",
570570

571571
#' Repartition
572572
#'
573-
#' There are two different options for repartition
574-
#' Option 1
575-
#' Return a new SparkDataFrame that has exactly numPartitions partitions.
576-
#' Option 2
577-
#' Return a new SparkDataFrame which has as many partitions as the number of unique
578-
#' groups identified by column(s) values which are being specified by the input.
579-
#' If both numPartitions and columns are specified, Option 1 will be chosen.
580-
#'
573+
#' The following options for repartitioning are possible:
574+
#' \itemize{
575+
#' \item{"Option 1"} {Return a new SparkDataFrame partitioned by
576+
#' the given columns into `numPartitions`.}
577+
#' \item{"Option 2"} {Return a new SparkDataFrame that has exactly `numPartitions`.}
578+
#' \item{"Option 3"} {Return a new SparkDataFrame partitioned by the given columns,
579+
#' preserving the existing number of partitions.}
580+
#' \item{"Option 4"} {Return a new SparkDataFrame that has exactly the default
581+
#' number of numPartitions: 200.}
582+
#'}
581583
#' @param x A SparkDataFrame
582584
#' @param numPartitions The number of partitions to use.
583585
#' @param col The column by which the partitioning will be performed.
@@ -595,19 +597,29 @@ setMethod("unpersist",
595597
#' newDF <- repartition(df, 2L)
596598
#' newDF <- repartition(df, numPartitions = 2L)
597599
#' newDF <- repartition(df, col = df$"col1", df$"col2")
600+
#' newDF <- repartition(df, 3L, col = df$"col1", df$"col2")
598601
#'}
599602
setMethod("repartition",
600603
signature(x = "SparkDataFrame"),
601604
function(x, numPartitions = NULL, col = NULL, ...) {
602-
if (!is.null(numPartitions) && (class(numPartitions) == "numeric"
603-
|| class(numPartitions) == "integer")) {
604-
sdf <- callJMethod(x@sdf, "repartition", numToInt(numPartitions))
605+
if (!is.null(numPartitions) && (class(numPartitions) == "numeric" ||
606+
class(numPartitions) == "integer")) {
607+
# number of partitions and columns both are specified
608+
if (!is.null(col) && class(col) == "Column") {
609+
cols <- list(col, ...)
610+
jcol <- lapply(cols, function(c) { c@jc })
611+
sdf <- callJMethod(x@sdf, "repartition", numToInt(numPartitions), jcol)
612+
} else {
613+
# only number of partitions is specified
614+
sdf <- callJMethod(x@sdf, "repartition", numToInt(numPartitions))
615+
}
605616
} else if (!is.null(col) && class(col) == "Column") {
617+
# only columns are specified
606618
cols <- list(col, ...)
607619
jcol <- lapply(cols, function(c) { c@jc })
608620
sdf <- callJMethod(x@sdf, "repartition", jcol)
609621
} else {
610-
stop("Please specify numPartitions or at least one column")
622+
sdf <- callJMethod(x@sdf, "repartition", 200L)
611623
}
612624
dataFrame(sdf)
613625
})

R/pkg/R/RDD.R

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1025,11 +1025,11 @@ setMethod("keyBy",
10251025
setMethod("repartition",
10261026
signature(x = "RDD"),
10271027
function(x, numPartitions) {
1028-
if (!is.null(numPartitions) && (class(numPartitions) == "numeric"
1029-
|| class(numPartitions) == "integer")) {
1028+
if (!is.null(numPartitions) && (class(numPartitions) == "numeric" ||
1029+
class(numPartitions) == "integer")) {
10301030
coalesce(x, numPartitions, TRUE)
10311031
} else {
1032-
stop("Please, specify the number of partitions")
1032+
coalesce(x, 200L)
10331033
}
10341034
})
10351035

R/pkg/inst/tests/testthat/test_sparkSQL.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2106,7 +2106,7 @@ test_that("repartition by columns on DataFrame", {
21062106
},
21072107
schema)
21082108

2109-
# Number of partitions partitions is equal to 2
2109+
# Number of partitions is equal to 2
21102110
expect_equal(nrow(df1), 2)
21112111
})
21122112

0 commit comments

Comments
 (0)