
Commit 18a8554

Merge commit with 2 parents: 53d9d27 + 1629331


530 files changed: +17905 / -6410 lines


LICENSE

Lines changed: 1 addition & 1 deletion
@@ -263,7 +263,7 @@ The text of each license is also included at licenses/LICENSE-[project].txt.
 (New BSD license) Protocol Buffer Java API (org.spark-project.protobuf:protobuf-java:2.4.1-shaded - http://code.google.com/p/protobuf)
 (The BSD License) Fortran to Java ARPACK (net.sourceforge.f2j:arpack_combined_all:0.1 - http://f2j.sourceforge.net)
 (The BSD License) xmlenc Library (xmlenc:xmlenc:0.52 - http://xmlenc.sourceforge.net)
-(The New BSD License) Py4J (net.sf.py4j:py4j:0.10.3 - http://py4j.sourceforge.net/)
+(The New BSD License) Py4J (net.sf.py4j:py4j:0.10.4 - http://py4j.sourceforge.net/)
 (Two-clause BSD-style license) JUnit-Interface (com.novocode:junit-interface:0.10 - http://github.com/szeiger/junit-interface/)
 (BSD licence) sbt and sbt-launch-lib.bash
 (BSD 3 Clause) d3.min.js (https://github.com/mbostock/d3/blob/master/LICENSE)

R/pkg/NAMESPACE

Lines changed: 11 additions & 3 deletions
@@ -3,7 +3,7 @@
 importFrom("methods", "setGeneric", "setMethod", "setOldClass")
 importFrom("methods", "is", "new", "signature", "show")
 importFrom("stats", "gaussian", "setNames")
-importFrom("utils", "download.file", "packageVersion", "untar")
+importFrom("utils", "download.file", "object.size", "packageVersion", "untar")
 
 # Disable native libraries till we figure out how to package it
 # See SPARKR-7839
@@ -43,7 +43,9 @@ exportMethods("glm",
               "spark.isoreg",
               "spark.gaussianMixture",
               "spark.als",
-              "spark.kstest")
+              "spark.kstest",
+              "spark.logit",
+              "spark.randomForest")
 
 # Job group lifecycle management methods
 export("setJobGroup",
@@ -71,6 +73,7 @@ exportMethods("arrange",
               "covar_samp",
               "covar_pop",
               "createOrReplaceTempView",
+              "crossJoin",
               "crosstab",
               "dapply",
               "dapplyCollect",
@@ -123,6 +126,7 @@ exportMethods("arrange",
               "selectExpr",
               "show",
               "showDF",
+              "storageLevel",
               "subset",
               "summarize",
               "summary",
@@ -347,7 +351,9 @@ export("as.DataFrame",
        "uncacheTable",
        "print.summary.GeneralizedLinearRegressionModel",
        "read.ml",
-       "print.summary.KSTest")
+       "print.summary.KSTest",
+       "print.summary.RandomForestRegressionModel",
+       "print.summary.RandomForestClassificationModel")
 
 export("structField",
        "structField.jobj",
@@ -372,6 +378,8 @@ S3method(print, structField)
 S3method(print, structType)
 S3method(print, summary.GeneralizedLinearRegressionModel)
 S3method(print, summary.KSTest)
+S3method(print, summary.RandomForestRegressionModel)
+S3method(print, summary.RandomForestClassificationModel)
 S3method(structField, character)
 S3method(structField, jobj)
 S3method(structType, jobj)
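
The NAMESPACE changes surface two new ML wrappers (spark.logit, spark.randomForest) plus their summary print methods. A minimal sketch of how the new exports might be exercised, assuming a running SparkR session; the formula-style call matches the other spark.* wrappers, but the dataset and argument choices here are illustrative, not taken from the commit:

library(SparkR)
sparkR.session()

# SparkR replaces '.' in column names (see the colnames<- check in DataFrame.R),
# so iris columns become Sepal_Length, Sepal_Width, ...
df <- suppressWarnings(createDataFrame(iris))

# Random forest regression via the newly exported spark.randomForest
rfModel <- spark.randomForest(df, Sepal_Length ~ Sepal_Width, type = "regression")
summary(rfModel)  # dispatched to the new print.summary.RandomForestRegressionModel

# Logistic regression via the newly exported spark.logit; a binary label keeps
# this sketch conservative (multiclass support may vary by version)
training <- filter(df, df$Species != "setosa")
logitModel <- spark.logit(training, Species ~ Sepal_Length + Sepal_Width)
summary(logitModel)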

R/pkg/R/DataFrame.R

Lines changed: 81 additions & 21 deletions
@@ -365,7 +365,7 @@ setMethod("colnames<-",
 
   # Check if the column names have . in it
   if (any(regexec(".", value, fixed = TRUE)[[1]][1] != -1)) {
-    stop("Colum names cannot contain the '.' symbol.")
+    stop("Column names cannot contain the '.' symbol.")
   }
 
   sdf <- callJMethod(x@sdf, "toDF", as.list(value))
@@ -633,7 +633,7 @@ setMethod("persist",
 #' @param ... further arguments to be passed to or from other methods.
 #'
 #' @family SparkDataFrame functions
-#' @rdname unpersist-methods
+#' @rdname unpersist
 #' @aliases unpersist,SparkDataFrame-method
 #' @name unpersist
 #' @export
@@ -654,6 +654,32 @@ setMethod("unpersist",
     x
   })
 
+#' StorageLevel
+#'
+#' Get storagelevel of this SparkDataFrame.
+#'
+#' @param x the SparkDataFrame to get the storageLevel.
+#'
+#' @family SparkDataFrame functions
+#' @rdname storageLevel
+#' @aliases storageLevel,SparkDataFrame-method
+#' @name storageLevel
+#' @export
+#' @examples
+#'\dontrun{
+#' sparkR.session()
+#' path <- "path/to/file.json"
+#' df <- read.json(path)
+#' persist(df, "MEMORY_AND_DISK")
+#' storageLevel(df)
+#'}
+#' @note storageLevel since 2.1.0
+setMethod("storageLevel",
+          signature(x = "SparkDataFrame"),
+          function(x) {
+            storageLevelToString(callJMethod(x@sdf, "storageLevel"))
+          })
+
 #' Repartition
 #'
 #' The following options for repartition are possible:
@@ -735,7 +761,8 @@ setMethod("toJSON",
 
 #' Save the contents of SparkDataFrame as a JSON file
 #'
-#' Save the contents of a SparkDataFrame as a JSON file (one object per line). Files written out
+#' Save the contents of a SparkDataFrame as a JSON file (\href{http://jsonlines.org/}{
+#' JSON Lines text format or newline-delimited JSON}). Files written out
 #' with this method can be read back in as a SparkDataFrame using read.json().
 #'
 #' @param x A SparkDataFrame
@@ -761,7 +788,7 @@ setMethod("write.json",
           function(x, path, mode = "error", ...) {
             write <- callJMethod(x@sdf, "write")
             write <- setWriteOptions(write, mode = mode, ...)
-            invisible(callJMethod(write, "json", path))
+            invisible(handledCallJMethod(write, "json", path))
           })
 
 #' Save the contents of SparkDataFrame as an ORC file, preserving the schema.
@@ -792,7 +819,7 @@ setMethod("write.orc",
           function(x, path, mode = "error", ...) {
             write <- callJMethod(x@sdf, "write")
             write <- setWriteOptions(write, mode = mode, ...)
-            invisible(callJMethod(write, "orc", path))
+            invisible(handledCallJMethod(write, "orc", path))
           })
 
 #' Save the contents of SparkDataFrame as a Parquet file, preserving the schema.
@@ -824,7 +851,7 @@ setMethod("write.parquet",
           function(x, path, mode = "error", ...) {
             write <- callJMethod(x@sdf, "write")
             write <- setWriteOptions(write, mode = mode, ...)
-            invisible(callJMethod(write, "parquet", path))
+            invisible(handledCallJMethod(write, "parquet", path))
           })
 
 #' @rdname write.parquet
@@ -868,7 +895,7 @@ setMethod("write.text",
           function(x, path, mode = "error", ...) {
             write <- callJMethod(x@sdf, "write")
             write <- setWriteOptions(write, mode = mode, ...)
-            invisible(callJMethod(write, "text", path))
+            invisible(handledCallJMethod(write, "text", path))
           })
 
 #' Distinct
@@ -2271,12 +2298,13 @@ setMethod("dropDuplicates",
 
 #' Join
 #'
-#' Join two SparkDataFrames based on the given join expression.
+#' Joins two SparkDataFrames based on the given join expression.
 #'
 #' @param x A SparkDataFrame
 #' @param y A SparkDataFrame
 #' @param joinExpr (Optional) The expression used to perform the join. joinExpr must be a
-#' Column expression. If joinExpr is omitted, join() will perform a Cartesian join
+#' Column expression. If joinExpr is omitted, the default, inner join is attempted and an error is
+#' thrown if it would be a Cartesian Product. For Cartesian join, use crossJoin instead.
 #' @param joinType The type of join to perform. The following join types are available:
 #' 'inner', 'outer', 'full', 'fullouter', leftouter', 'left_outer', 'left',
 #' 'right_outer', 'rightouter', 'right', and 'leftsemi'. The default joinType is "inner".
@@ -2285,23 +2313,24 @@ setMethod("dropDuplicates",
 #' @aliases join,SparkDataFrame,SparkDataFrame-method
 #' @rdname join
 #' @name join
-#' @seealso \link{merge}
+#' @seealso \link{merge} \link{crossJoin}
 #' @export
 #' @examples
 #'\dontrun{
 #' sparkR.session()
 #' df1 <- read.json(path)
 #' df2 <- read.json(path2)
-#' join(df1, df2) # Performs a Cartesian
 #' join(df1, df2, df1$col1 == df2$col2) # Performs an inner join based on expression
 #' join(df1, df2, df1$col1 == df2$col2, "right_outer")
+#' join(df1, df2) # Attempts an inner join
 #' }
 #' @note join since 1.4.0
 setMethod("join",
           signature(x = "SparkDataFrame", y = "SparkDataFrame"),
           function(x, y, joinExpr = NULL, joinType = NULL) {
             if (is.null(joinExpr)) {
-              sdf <- callJMethod(x@sdf, "crossJoin", y@sdf)
+              # this may not fail until the planner checks for Cartesian join later on.
+              sdf <- callJMethod(x@sdf, "join", y@sdf)
             } else {
               if (class(joinExpr) != "Column") stop("joinExpr must be a Column")
               if (is.null(joinType)) {
@@ -2322,22 +2351,52 @@ setMethod("join",
             dataFrame(sdf)
           })
 
+#' CrossJoin
+#'
+#' Returns Cartesian Product on two SparkDataFrames.
+#'
+#' @param x A SparkDataFrame
+#' @param y A SparkDataFrame
+#' @return A SparkDataFrame containing the result of the join operation.
+#' @family SparkDataFrame functions
+#' @aliases crossJoin,SparkDataFrame,SparkDataFrame-method
+#' @rdname crossJoin
+#' @name crossJoin
+#' @seealso \link{merge} \link{join}
+#' @export
+#' @examples
+#'\dontrun{
+#' sparkR.session()
+#' df1 <- read.json(path)
+#' df2 <- read.json(path2)
+#' crossJoin(df1, df2) # Performs a Cartesian
+#' }
+#' @note crossJoin since 2.1.0
+setMethod("crossJoin",
+          signature(x = "SparkDataFrame", y = "SparkDataFrame"),
+          function(x, y) {
+            sdf <- callJMethod(x@sdf, "crossJoin", y@sdf)
+            dataFrame(sdf)
+          })
+
 #' Merges two data frames
 #'
 #' @name merge
-#' @param x the first data frame to be joined
-#' @param y the second data frame to be joined
+#' @param x the first data frame to be joined.
+#' @param y the second data frame to be joined.
 #' @param by a character vector specifying the join columns. If by is not
 #'   specified, the common column names in \code{x} and \code{y} will be used.
+#'   If by or both by.x and by.y are explicitly set to NULL or of length 0, the Cartesian
+#'   Product of x and y will be returned.
 #' @param by.x a character vector specifying the joining columns for x.
 #' @param by.y a character vector specifying the joining columns for y.
 #' @param all a boolean value setting \code{all.x} and \code{all.y}
 #'   if any of them are unset.
 #' @param all.x a boolean value indicating whether all the rows in x should
-#'   be including in the join
+#'   be including in the join.
 #' @param all.y a boolean value indicating whether all the rows in y should
-#'   be including in the join
-#' @param sort a logical argument indicating whether the resulting columns should be sorted
+#'   be including in the join.
+#' @param sort a logical argument indicating whether the resulting columns should be sorted.
 #' @param suffixes a string vector of length 2 used to make colnames of
 #'   \code{x} and \code{y} unique.
 #'   The first element is appended to each colname of \code{x}.
@@ -2351,20 +2410,21 @@ setMethod("join",
 #' @family SparkDataFrame functions
 #' @aliases merge,SparkDataFrame,SparkDataFrame-method
 #' @rdname merge
-#' @seealso \link{join}
+#' @seealso \link{join} \link{crossJoin}
 #' @export
 #' @examples
 #'\dontrun{
 #' sparkR.session()
 #' df1 <- read.json(path)
 #' df2 <- read.json(path2)
-#' merge(df1, df2) # Performs a Cartesian
+#' merge(df1, df2) # Performs an inner join by common columns
 #' merge(df1, df2, by = "col1") # Performs an inner join based on expression
 #' merge(df1, df2, by.x = "col1", by.y = "col2", all.y = TRUE)
 #' merge(df1, df2, by.x = "col1", by.y = "col2", all.x = TRUE)
 #' merge(df1, df2, by.x = "col1", by.y = "col2", all.x = TRUE, all.y = TRUE)
 #' merge(df1, df2, by.x = "col1", by.y = "col2", all = TRUE, sort = FALSE)
 #' merge(df1, df2, by = "col1", all = TRUE, suffixes = c("-X", "-Y"))
+#' merge(df1, df2, by = NULL) # Performs a Cartesian join
 #' }
 #' @note merge since 1.5.0
 setMethod("merge",
@@ -2401,7 +2461,7 @@ setMethod("merge",
               joinY <- by
             } else {
               # if by or both by.x and by.y have length 0, use Cartesian Product
-              joinRes <- join(x, y)
+              joinRes <- crossJoin(x, y)
               return (joinRes)
             }
 
@@ -3282,7 +3342,7 @@ setMethod("write.jdbc",
             jprops <- varargsToJProperties(...)
             write <- callJMethod(x@sdf, "write")
             write <- callJMethod(write, "mode", jmode)
-            invisible(callJMethod(write, "jdbc", url, tableName, jprops))
+            invisible(handledCallJMethod(write, "jdbc", url, tableName, jprops))
           })
 
 #' randomSplit
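
The join() change above is the commit's main user-visible behavior shift: join without an expression no longer silently produces a Cartesian Product. A hedged sketch of the new semantics, assuming a SparkR session; the data frames and column names are made up for illustration:

library(SparkR)
sparkR.session()

df1 <- createDataFrame(data.frame(id = 1:3, v = c("a", "b", "c")))
df2 <- createDataFrame(data.frame(id = 2:4, w = c("x", "y", "z")))

# With a join expression, join() behaves as before (an inner join here).
inner <- join(df1, df2, df1$id == df2$id)

# Without an expression, join() now attempts a plain inner join; per the code
# comment above, the planner may only reject a degenerate Cartesian join later.
# For an explicit Cartesian Product, use the new crossJoin():
cart <- crossJoin(df1, df2)
count(cart)  # 3 * 3 = 9 rows

# The new storageLevel() getter complements persist(); it reports the current
# persistence level as a string, e.g. "MEMORY_AND_DISK".
persist(df1, "MEMORY_AND_DISK")
storageLevel(df1)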

R/pkg/R/RDD.R

Lines changed: 1 addition & 1 deletion
@@ -261,7 +261,7 @@ setMethod("persistRDD",
 #' cache(rdd) # rdd@@env$isCached == TRUE
 #' unpersistRDD(rdd) # rdd@@env$isCached == FALSE
 #'}
-#' @rdname unpersist-methods
+#' @rdname unpersist
 #' @aliases unpersist,RDD-method
 #' @noRd
 setMethod("unpersistRDD",

R/pkg/R/SQLContext.R

Lines changed: 11 additions & 9 deletions
@@ -324,7 +324,8 @@ setMethod("toDF", signature(x = "RDD"),
 
 #' Create a SparkDataFrame from a JSON file.
 #'
-#' Loads a JSON file (one object per line), returning the result as a SparkDataFrame
+#' Loads a JSON file (\href{http://jsonlines.org/}{JSON Lines text format or newline-delimited JSON}
+#' ), returning the result as a SparkDataFrame
 #' It goes through the entire dataset once to determine the schema.
 #'
 #' @param path Path of file to read. A vector of multiple paths is allowed.
@@ -349,7 +350,7 @@ read.json.default <- function(path, ...) {
   paths <- as.list(suppressWarnings(normalizePath(path)))
   read <- callJMethod(sparkSession, "read")
   read <- callJMethod(read, "options", options)
-  sdf <- callJMethod(read, "json", paths)
+  sdf <- handledCallJMethod(read, "json", paths)
   dataFrame(sdf)
 }
 
@@ -421,7 +422,7 @@ read.orc <- function(path, ...) {
   path <- suppressWarnings(normalizePath(path))
   read <- callJMethod(sparkSession, "read")
   read <- callJMethod(read, "options", options)
-  sdf <- callJMethod(read, "orc", path)
+  sdf <- handledCallJMethod(read, "orc", path)
   dataFrame(sdf)
 }
 
@@ -443,7 +444,7 @@ read.parquet.default <- function(path, ...) {
   paths <- as.list(suppressWarnings(normalizePath(path)))
   read <- callJMethod(sparkSession, "read")
   read <- callJMethod(read, "options", options)
-  sdf <- callJMethod(read, "parquet", paths)
+  sdf <- handledCallJMethod(read, "parquet", paths)
   dataFrame(sdf)
 }
 
@@ -495,7 +496,7 @@ read.text.default <- function(path, ...) {
   paths <- as.list(suppressWarnings(normalizePath(path)))
   read <- callJMethod(sparkSession, "read")
   read <- callJMethod(read, "options", options)
-  sdf <- callJMethod(read, "text", paths)
+  sdf <- handledCallJMethod(read, "text", paths)
   dataFrame(sdf)
 }
 
@@ -913,12 +914,13 @@ read.jdbc <- function(url, tableName,
   } else {
     numPartitions <- numToInt(numPartitions)
   }
-  sdf <- callJMethod(read, "jdbc", url, tableName, as.character(partitionColumn),
-                     numToInt(lowerBound), numToInt(upperBound), numPartitions, jprops)
+  sdf <- handledCallJMethod(read, "jdbc", url, tableName, as.character(partitionColumn),
+                            numToInt(lowerBound), numToInt(upperBound), numPartitions, jprops)
 } else if (length(predicates) > 0) {
-  sdf <- callJMethod(read, "jdbc", url, tableName, as.list(as.character(predicates)), jprops)
+  sdf <- handledCallJMethod(read, "jdbc", url, tableName, as.list(as.character(predicates)),
+                            jprops)
 } else {
-  sdf <- callJMethod(read, "jdbc", url, tableName, jprops)
+  sdf <- handledCallJMethod(read, "jdbc", url, tableName, jprops)
 }
 dataFrame(sdf)
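
The switch from callJMethod to handledCallJMethod across these readers (and the writers in DataFrame.R) means JVM exceptions surface as concise R errors rather than raw Java stack traces. A sketch of the caller-visible effect, assuming a SparkR session; the exact error wording is illustrative, not quoted from the implementation:

library(SparkR)
sparkR.session()

# A bad path now fails with a readable R error instead of a JVM stack trace.
msg <- tryCatch(
  read.json("/no/such/path.json"),
  error = function(e) conditionMessage(e)
)
print(msg)

# Writers take the same handled path, so failures are caught the same way.
df <- createDataFrame(data.frame(a = 1:3))
path <- tempfile(fileext = ".json")
write.json(df, path)                     # first write succeeds
tryCatch(
  write.json(df, path, mode = "error"),  # second write fails: path exists
  error = function(e) message("write failed: ", conditionMessage(e))
)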
