Adding an option to include both number of partitions and the cols

NarineK · NarineK · commit 02f81db96c1e · 2016-05-04T16:02:50.000-07:00
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
@@ -570,14 +570,16 @@ setMethod("unpersist",
 
 #' Repartition
 #'
-#' There are two different options for repartition
-#' Option 1
-#'   Return a new SparkDataFrame that has exactly numPartitions partitions.
-#' Option 2
-#'   Return a new SparkDataFrame which has as many partitions as the number of unique
-#'   groups identified by column(s) values which are being specified by the input.
-#' If both numPartitions and columns are specified, Option 1 will be chosen.
-#'
+#' The following options for repartitioning are possible:
+#' \itemize{
+#'  \item{"Option 1"} {Return a new SparkDataFrame partitioned by
+#'                      the given columns into `numPartitions` partitions.}
+#'  \item{"Option 2"} {Return a new SparkDataFrame that has exactly `numPartitions` partitions.}
+#'  \item{"Option 3"} {Return a new SparkDataFrame partitioned by the given columns,
+#'                      using `spark.sql.shuffle.partitions` as the number of partitions.}
+#'  \item{"Option 4"} {Return a new SparkDataFrame that has exactly the default
+#'                      number of partitions: 200.}
+#'}
 #' @param x A SparkDataFrame
 #' @param numPartitions The number of partitions to use.
 #' @param col The column by which the partitioning will be performed.
@@ -595,19 +597,29 @@ setMethod("unpersist",
 #' newDF <- repartition(df, 2L)
 #' newDF <- repartition(df, numPartitions = 2L)
 #' newDF <- repartition(df, col = df$"col1", df$"col2")
+#' newDF <- repartition(df, 3L, col = df$"col1", df$"col2")
 #'}
 setMethod("repartition",
           signature(x = "SparkDataFrame"),
           function(x, numPartitions = NULL, col = NULL, ...) {
-            if (!is.null(numPartitions) && (class(numPartitions) == "numeric"
-              || class(numPartitions) == "integer")) {
-              sdf <- callJMethod(x@sdf, "repartition", numToInt(numPartitions))
+            if (!is.null(numPartitions) && (class(numPartitions) == "numeric" ||
+              class(numPartitions) == "integer")) {
+              # number of partitions and columns both are specified
+              if (!is.null(col) && class(col) == "Column") {
+                cols <- list(col, ...)
+                jcol <- lapply(cols, function(c) { c@jc })
+                sdf <- callJMethod(x@sdf, "repartition", numToInt(numPartitions), jcol)
+              } else {
+                # only number of partitions is specified
+                sdf <- callJMethod(x@sdf, "repartition", numToInt(numPartitions))
+              }
             } else if (!is.null(col) && class(col) == "Column") {
+              # only columns are specified
               cols <- list(col, ...)
               jcol <- lapply(cols, function(c) { c@jc })
               sdf <- callJMethod(x@sdf, "repartition", jcol)
             } else {
-              stop("Please specify numPartitions or at least one column")
+              sdf <- callJMethod(x@sdf, "repartition", 200L)
             }
             dataFrame(sdf)
           })
diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R
@@ -1025,11 +1025,11 @@ setMethod("keyBy",
 setMethod("repartition",
           signature(x = "RDD"),
           function(x, numPartitions) {
-            if (!is.null(numPartitions) && (class(numPartitions) == "numeric"
-              || class(numPartitions) == "integer")) {
+            if (!is.null(numPartitions) && (class(numPartitions) == "numeric" ||
+              class(numPartitions) == "integer")) {
               coalesce(x, numPartitions, TRUE)
             } else {
-              stop("Please, specify the number of partitions")
+              coalesce(x, 200L)
             }
           })
 
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -2106,7 +2106,7 @@ test_that("repartition by columns on DataFrame", {
     },
     schema)
 
-  # Number of partitions partitions is equal to 2
+  # Number of partitions is equal to 2
   expect_equal(nrow(df1), 2)
 })