
Commit df9b94a

Sun Rui authored and shivaram committed
[SPARK-7482] [SPARKR] Rename some DataFrame API methods in SparkR to match their counterparts in Scala.
Author: Sun Rui <[email protected]>

Closes #6007 from sun-rui/SPARK-7482 and squashes the following commits:

5c5cf5e [Sun Rui] Implement alias loadDF() as a new function.
3a30c10 [Sun Rui] Rename load()/save() to read.df()/write.df(). Also add loadDF()/saveDF() as aliases.
9f569d6 [Sun Rui] [SPARK-7482][SparkR] Rename some DataFrame API methods in SparkR to match their counterparts in Scala.
1 parent 208b902 commit df9b94a
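
For context, this is the renamed API as a caller sees it after this commit: a minimal sketch assuming a local SparkR session of this era, with placeholder file paths.

    library(SparkR)
    sc <- sparkR.init(master = "local")
    sqlCtx <- sparkRSQL.init(sc)

    # read.df() replaces load()/loadDF(); loadDF() is kept as an alias
    df <- read.df(sqlCtx, "path/to/people.json", "json")

    # sample() replaces sampleDF(); sample_frac() is unchanged
    half <- sample(df, withReplacement = FALSE, fraction = 0.5)

    # write.df() replaces save()/saveDF(); saveDF() is kept as an alias
    write.df(half, "path/to/people.parquet", "parquet", "overwrite")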

File tree

6 files changed, +71 −49 lines changed


R/pkg/NAMESPACE

Lines changed: 4 additions & 2 deletions
@@ -37,7 +37,7 @@ exportMethods("arrange",
               "registerTempTable",
               "rename",
               "repartition",
-              "sampleDF",
+              "sample",
               "sample_frac",
               "saveAsParquetFile",
               "saveAsTable",
@@ -53,7 +53,8 @@ exportMethods("arrange",
               "unpersist",
               "where",
               "withColumn",
-              "withColumnRenamed")
+              "withColumnRenamed",
+              "write.df")

 exportClasses("Column")

@@ -101,6 +102,7 @@ export("cacheTable",
        "jsonFile",
        "loadDF",
        "parquetFile",
+       "read.df",
        "sql",
        "table",
        "tableNames",

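The split between the two directives is the detail to notice: exportMethods() publishes S4 generics and their methods (sample, write.df), while export() publishes plain functions (read.df, with loadDF as its alias). An illustrative NAMESPACE fragment, not the full SparkR file:

    # NAMESPACE (illustrative excerpt)
    exportMethods("sample",    # S4 generics, dispatch on DataFrame
                  "write.df")
    export("read.df",          # ordinary closures, no dispatch
           "loadDF")
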
R/pkg/R/DataFrame.R

Lines changed: 22 additions & 13 deletions
@@ -294,8 +294,8 @@ setMethod("registerTempTable",
 #'\dontrun{
 #' sc <- sparkR.init()
 #' sqlCtx <- sparkRSQL.init(sc)
-#' df <- loadDF(sqlCtx, path, "parquet")
-#' df2 <- loadDF(sqlCtx, path2, "parquet")
+#' df <- read.df(sqlCtx, path, "parquet")
+#' df2 <- read.df(sqlCtx, path2, "parquet")
 #' registerTempTable(df, "table1")
 #' insertInto(df2, "table1", overwrite = TRUE)
 #'}
@@ -473,14 +473,14 @@ setMethod("distinct",
             dataFrame(sdf)
           })

-#' SampleDF
+#' Sample
 #'
 #' Return a sampled subset of this DataFrame using a random seed.
 #'
 #' @param x A SparkSQL DataFrame
 #' @param withReplacement Sampling with replacement or not
 #' @param fraction The (rough) sample target fraction
-#' @rdname sampleDF
+#' @rdname sample
 #' @aliases sample_frac
 #' @export
 #' @examples
@@ -489,10 +489,10 @@ setMethod("distinct",
 #' sqlCtx <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
 #' df <- jsonFile(sqlCtx, path)
-#' collect(sampleDF(df, FALSE, 0.5))
-#' collect(sampleDF(df, TRUE, 0.5))
+#' collect(sample(df, FALSE, 0.5))
+#' collect(sample(df, TRUE, 0.5))
 #'}
-setMethod("sampleDF",
+setMethod("sample",
           # TODO : Figure out how to send integer as java.lang.Long to JVM so
           # we can send seed as an argument through callJMethod
           signature(x = "DataFrame", withReplacement = "logical",
@@ -503,13 +503,13 @@ setMethod("sampleDF",
             dataFrame(sdf)
           })

-#' @rdname sampleDF
-#' @aliases sampleDF
+#' @rdname sample
+#' @aliases sample
 setMethod("sample_frac",
           signature(x = "DataFrame", withReplacement = "logical",
                     fraction = "numeric"),
           function(x, withReplacement, fraction) {
-            sampleDF(x, withReplacement, fraction)
+            sample(x, withReplacement, fraction)
           })

 #' Count
@@ -1303,17 +1303,17 @@ setMethod("except",
 #' @param source A name for external data source
 #' @param mode One of 'append', 'overwrite', 'error', 'ignore'
 #'
-#' @rdname saveAsTable
+#' @rdname write.df
 #' @export
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
 #' sqlCtx <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
 #' df <- jsonFile(sqlCtx, path)
-#' saveAsTable(df, "myfile")
+#' write.df(df, "myfile", "parquet", "overwrite")
 #' }
-setMethod("saveDF",
+setMethod("write.df",
           signature(df = "DataFrame", path = 'character', source = 'character',
                     mode = 'character'),
           function(df, path = NULL, source = NULL, mode = "append", ...){
@@ -1334,6 +1334,15 @@ setMethod("saveDF",
             callJMethod(df@sdf, "save", source, jmode, options)
           })

+#' @rdname write.df
+#' @aliases saveDF
+#' @export
+setMethod("saveDF",
+          signature(df = "DataFrame", path = 'character', source = 'character',
+                    mode = 'character'),
+          function(df, path = NULL, source = NULL, mode = "append", ...){
+            write.df(df, path, source, mode, ...)
+          })

 #' saveAsTable
 #'
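
The alias added at the end of this hunk is a pattern worth seeing in isolation: the old name keeps its own S4 method but simply forwards to the new one, so both spellings share a single implementation. A minimal standalone sketch; the "Frame" class and the CSV writer are hypothetical stand-ins, not SparkR code:

    setClass("Frame", representation(data = "data.frame"))

    setGeneric("write.df", function(df, path, ...) standardGeneric("write.df"))
    setGeneric("saveDF", function(df, path, ...) standardGeneric("saveDF"))

    setMethod("write.df", signature(df = "Frame", path = "character"),
              function(df, path, ...) {
                write.csv(df@data, path)  # stand-in for the real writer
              })

    # The alias forwards everything to the canonical method.
    setMethod("saveDF", signature(df = "Frame", path = "character"),
              function(df, path, ...) {
                write.df(df, path, ...)
              })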

R/pkg/R/RDD.R

Lines changed: 2 additions & 2 deletions
@@ -927,7 +927,7 @@ setMethod("takeSample", signature(x = "RDD", withReplacement = "logical",
                                                 MAXINT)))))

   # TODO(zongheng): investigate if this call is an in-place shuffle?
-  sample(samples)[1:total]
+  base::sample(samples)[1:total]
 })

 # Creates tuples of the elements in this RDD by applying a function.
@@ -996,7 +996,7 @@ setMethod("coalesce",
   if (shuffle || numPartitions > SparkR:::numPartitions(x)) {
     func <- function(partIndex, part) {
       set.seed(partIndex)  # partIndex as seed
-      start <- as.integer(sample(numPartitions, 1) - 1)
+      start <- as.integer(base::sample(numPartitions, 1) - 1)
       lapply(seq_along(part),
              function(i) {
                pos <- (start + i) %% numPartitions
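
The base:: qualification here is a direct consequence of the rename: this commit defines a sample S4 generic with formals (x, withReplacement, fraction, seed), so an unqualified sample(...) inside the package now resolves to that generic rather than to base R's sampler. A small illustration of the failure mode in a fresh session; exact error wording may differ by R version:

    setGeneric("sample", function(x, withReplacement, fraction, seed) {
      standardGeneric("sample")
    })

    # sample(10, 1) now attempts S4 dispatch on the new generic and fails:
    # Error: unable to find an inherited method for function 'sample' ...
    # Qualifying the call bypasses dispatch entirely:
    base::sample(10, 1)  # one uniform draw from 1:10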

R/pkg/R/SQLContext.R

Lines changed: 10 additions & 3 deletions
@@ -421,7 +421,7 @@ clearCache <- function(sqlCtx) {
 #' \dontrun{
 #' sc <- sparkR.init()
 #' sqlCtx <- sparkRSQL.init(sc)
-#' df <- loadDF(sqlCtx, path, "parquet")
+#' df <- read.df(sqlCtx, path, "parquet")
 #' registerTempTable(df, "table")
 #' dropTempTable(sqlCtx, "table")
 #' }
@@ -450,10 +450,10 @@ dropTempTable <- function(sqlCtx, tableName) {
 #'\dontrun{
 #' sc <- sparkR.init()
 #' sqlCtx <- sparkRSQL.init(sc)
-#' df <- load(sqlCtx, "path/to/file.json", source = "json")
+#' df <- read.df(sqlCtx, "path/to/file.json", source = "json")
 #' }

-loadDF <- function(sqlCtx, path = NULL, source = NULL, ...) {
+read.df <- function(sqlCtx, path = NULL, source = NULL, ...) {
   options <- varargsToEnv(...)
   if (!is.null(path)) {
     options[['path']] <- path
@@ -462,6 +462,13 @@ loadDF <- function(sqlCtx, path = NULL, source = NULL, ...) {
   dataFrame(sdf)
 }

+#' @aliases loadDF
+#' @export
+
+loadDF <- function(sqlCtx, path = NULL, source = NULL, ...) {
+  read.df(sqlCtx, path, source, ...)
+}
+
 #' Create an external table
 #'
 #' Creates an external table based on the dataset in a data source,
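
After this change both spellings load the same data; loadDF() is a one-line wrapper around read.df(). A usage sketch with a placeholder path:

    df  <- read.df(sqlCtx, "path/to/people.json", source = "json")
    df2 <- loadDF(sqlCtx, "path/to/people.json", source = "json")  # same result
    count(df) == count(df2)  # TRUE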

R/pkg/R/generics.R

Lines changed: 13 additions & 9 deletions
@@ -456,19 +456,19 @@ setGeneric("rename", function(x, ...) { standardGeneric("rename") })
 #' @export
 setGeneric("registerTempTable", function(x, tableName) { standardGeneric("registerTempTable") })

-#' @rdname sampleDF
+#' @rdname sample
 #' @export
-setGeneric("sample_frac",
+setGeneric("sample",
            function(x, withReplacement, fraction, seed) {
-             standardGeneric("sample_frac")
-           })
+             standardGeneric("sample")
+           })

-#' @rdname sampleDF
+#' @rdname sample
 #' @export
-setGeneric("sampleDF",
+setGeneric("sample_frac",
            function(x, withReplacement, fraction, seed) {
-             standardGeneric("sampleDF")
-           })
+             standardGeneric("sample_frac")
+           })

 #' @rdname saveAsParquetFile
 #' @export
@@ -480,7 +480,11 @@ setGeneric("saveAsTable", function(df, tableName, source, mode, ...) {
   standardGeneric("saveAsTable")
 })

-#' @rdname saveAsTable
+#' @rdname write.df
+#' @export
+setGeneric("write.df", function(df, path, source, mode, ...) { standardGeneric("write.df") })
+
+#' @rdname write.df
 #' @export
 setGeneric("saveDF", function(df, path, source, mode, ...) { standardGeneric("saveDF") })

R/pkg/inst/tests/test_sparkSQL.R

Lines changed: 20 additions & 20 deletions
@@ -209,18 +209,18 @@ test_that("registerTempTable() results in a queryable table and sql() results in
 })

 test_that("insertInto() on a registered table", {
-  df <- loadDF(sqlCtx, jsonPath, "json")
-  saveDF(df, parquetPath, "parquet", "overwrite")
-  dfParquet <- loadDF(sqlCtx, parquetPath, "parquet")
+  df <- read.df(sqlCtx, jsonPath, "json")
+  write.df(df, parquetPath, "parquet", "overwrite")
+  dfParquet <- read.df(sqlCtx, parquetPath, "parquet")

   lines <- c("{\"name\":\"Bob\", \"age\":24}",
              "{\"name\":\"James\", \"age\":35}")
   jsonPath2 <- tempfile(pattern="jsonPath2", fileext=".tmp")
   parquetPath2 <- tempfile(pattern = "parquetPath2", fileext = ".parquet")
   writeLines(lines, jsonPath2)
-  df2 <- loadDF(sqlCtx, jsonPath2, "json")
-  saveDF(df2, parquetPath2, "parquet", "overwrite")
-  dfParquet2 <- loadDF(sqlCtx, parquetPath2, "parquet")
+  df2 <- read.df(sqlCtx, jsonPath2, "json")
+  write.df(df2, parquetPath2, "parquet", "overwrite")
+  dfParquet2 <- read.df(sqlCtx, parquetPath2, "parquet")

   registerTempTable(dfParquet, "table1")
   insertInto(dfParquet2, "table1")
@@ -421,12 +421,12 @@ test_that("distinct() on DataFrames", {
   expect_true(count(uniques) == 3)
 })

-test_that("sampleDF on a DataFrame", {
+test_that("sample on a DataFrame", {
   df <- jsonFile(sqlCtx, jsonPath)
-  sampled <- sampleDF(df, FALSE, 1.0)
+  sampled <- sample(df, FALSE, 1.0)
   expect_equal(nrow(collect(sampled)), count(df))
   expect_true(inherits(sampled, "DataFrame"))
-  sampled2 <- sampleDF(df, FALSE, 0.1)
+  sampled2 <- sample(df, FALSE, 0.1)
   expect_true(count(sampled2) < 3)

   # Also test sample_frac
@@ -491,16 +491,16 @@ test_that("column calculation", {
   expect_true(count(df2) == 3)
 })

-test_that("load() from json file", {
-  df <- loadDF(sqlCtx, jsonPath, "json")
+test_that("read.df() from json file", {
+  df <- read.df(sqlCtx, jsonPath, "json")
   expect_true(inherits(df, "DataFrame"))
   expect_true(count(df) == 3)
 })

-test_that("save() as parquet file", {
-  df <- loadDF(sqlCtx, jsonPath, "json")
-  saveDF(df, parquetPath, "parquet", mode="overwrite")
-  df2 <- loadDF(sqlCtx, parquetPath, "parquet")
+test_that("write.df() as parquet file", {
+  df <- read.df(sqlCtx, jsonPath, "json")
+  write.df(df, parquetPath, "parquet", mode="overwrite")
+  df2 <- read.df(sqlCtx, parquetPath, "parquet")
   expect_true(inherits(df2, "DataFrame"))
   expect_true(count(df2) == 3)
 })
@@ -670,7 +670,7 @@ test_that("unionAll(), except(), and intersect() on a DataFrame", {
              "{\"name\":\"James\", \"age\":35}")
   jsonPath2 <- tempfile(pattern="sparkr-test", fileext=".tmp")
   writeLines(lines, jsonPath2)
-  df2 <- loadDF(sqlCtx, jsonPath2, "json")
+  df2 <- read.df(sqlCtx, jsonPath2, "json")

   unioned <- arrange(unionAll(df, df2), df$age)
   expect_true(inherits(unioned, "DataFrame"))
@@ -712,19 +712,19 @@ test_that("mutate() and rename()", {
   expect_true(columns(newDF2)[1] == "newerAge")
 })

-test_that("saveDF() on DataFrame and works with parquetFile", {
+test_that("write.df() on DataFrame and works with parquetFile", {
   df <- jsonFile(sqlCtx, jsonPath)
-  saveDF(df, parquetPath, "parquet", mode="overwrite")
+  write.df(df, parquetPath, "parquet", mode="overwrite")
   parquetDF <- parquetFile(sqlCtx, parquetPath)
   expect_true(inherits(parquetDF, "DataFrame"))
   expect_equal(count(df), count(parquetDF))
 })

 test_that("parquetFile works with multiple input paths", {
   df <- jsonFile(sqlCtx, jsonPath)
-  saveDF(df, parquetPath, "parquet", mode="overwrite")
+  write.df(df, parquetPath, "parquet", mode="overwrite")
   parquetPath2 <- tempfile(pattern = "parquetPath2", fileext = ".parquet")
-  saveDF(df, parquetPath2, "parquet", mode="overwrite")
+  write.df(df, parquetPath2, "parquet", mode="overwrite")
   parquetDF <- parquetFile(sqlCtx, parquetPath, parquetPath2)
   expect_true(inherits(parquetDF, "DataFrame"))
   expect_true(count(parquetDF) == count(df)*2)