Skip to content

Commit f2d29a1

Browse files
author
Davies Liu
committed
Merge branch 'master' of github.com:apache/spark into str_index
2 parents 515519b + 3fc0cb9 commit f2d29a1

File tree

592 files changed

+26124
-9451
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

592 files changed

+26124
-9451
lines changed

R/pkg/NAMESPACE

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@ export("print.jobj")
1212

1313
# MLlib integration
1414
exportMethods("glm",
15-
"predict")
15+
"predict",
16+
"summary")
1617

1718
# Job group lifecycle management methods
1819
export("setJobGroup",
@@ -26,7 +27,9 @@ exportMethods("arrange",
2627
"collect",
2728
"columns",
2829
"count",
30+
"crosstab",
2931
"describe",
32+
"dim",
3033
"distinct",
3134
"dropna",
3235
"dtypes",
@@ -43,11 +46,15 @@ exportMethods("arrange",
4346
"isLocal",
4447
"join",
4548
"limit",
49+
"names",
50+
"ncol",
51+
"nrow",
4652
"orderBy",
4753
"mutate",
4854
"names",
4955
"persist",
5056
"printSchema",
57+
"rbind",
5158
"registerTempTable",
5259
"rename",
5360
"repartition",
@@ -64,6 +71,7 @@ exportMethods("arrange",
6471
"summarize",
6572
"take",
6673
"unionAll",
74+
"unique",
6775
"unpersist",
6876
"where",
6977
"withColumn",

R/pkg/R/DataFrame.R

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,16 @@ setMethod("names",
255255
columns(x)
256256
})
257257

258+
#' @rdname columns
259+
setMethod("names<-",
260+
signature(x = "DataFrame"),
261+
function(x, value) {
262+
if (!is.null(value)) {
263+
sdf <- callJMethod(x@sdf, "toDF", listToSeq(as.list(value)))
264+
dataFrame(sdf)
265+
}
266+
})
267+
258268
#' Register Temporary Table
259269
#'
260270
#' Registers a DataFrame as a Temporary Table in the SQLContext
@@ -473,6 +483,18 @@ setMethod("distinct",
473483
dataFrame(sdf)
474484
})
475485

486+
#' @title Distinct rows in a DataFrame
487+
#
488+
#' @description Returns a new DataFrame containing distinct rows in this DataFrame
489+
#'
490+
#' @rdname unique
491+
#' @aliases unique
492+
setMethod("unique",
493+
signature(x = "DataFrame"),
494+
function(x) {
495+
distinct(x)
496+
})
497+
476498
#' Sample
477499
#'
478500
#' Return a sampled subset of this DataFrame using a random seed.
@@ -534,6 +556,58 @@ setMethod("count",
534556
callJMethod(x@sdf, "count")
535557
})
536558

559+
#' @title Number of rows for a DataFrame
560+
#' @description Returns the number of rows in a DataFrame
561+
#'
562+
#' @name nrow
563+
#'
564+
#' @rdname nrow
565+
#' @aliases count
566+
setMethod("nrow",
567+
signature(x = "DataFrame"),
568+
function(x) {
569+
count(x)
570+
})
571+
572+
#' Returns the number of columns in a DataFrame
573+
#'
574+
#' @param x a SparkSQL DataFrame
575+
#'
576+
#' @rdname ncol
577+
#' @export
578+
#' @examples
579+
#'\dontrun{
580+
#' sc <- sparkR.init()
581+
#' sqlContext <- sparkRSQL.init(sc)
582+
#' path <- "path/to/file.json"
583+
#' df <- jsonFile(sqlContext, path)
584+
#' ncol(df)
585+
#' }
586+
setMethod("ncol",
587+
signature(x = "DataFrame"),
588+
function(x) {
589+
length(columns(x))
590+
})
591+
592+
#' Returns the dimensions (number of rows and columns) of a DataFrame
593+
#' @param x a SparkSQL DataFrame
594+
#'
595+
#' @rdname dim
596+
#' @export
597+
#' @examples
598+
#'\dontrun{
599+
#' sc <- sparkR.init()
600+
#' sqlContext <- sparkRSQL.init(sc)
601+
#' path <- "path/to/file.json"
602+
#' df <- jsonFile(sqlContext, path)
603+
#' dim(df)
604+
#' }
605+
setMethod("dim",
606+
signature(x = "DataFrame"),
607+
function(x) {
608+
c(count(x), ncol(x))
609+
})
610+
537611
#' Collects all the elements of a Spark DataFrame and coerces them into an R data.frame.
538612
#'
539613
#' @param x A SparkSQL DataFrame
@@ -1231,6 +1305,22 @@ setMethod("unionAll",
12311305
dataFrame(unioned)
12321306
})
12331307

1308+
#' @title Union two or more DataFrames
1309+
#
1310+
#' @description Returns a new DataFrame containing rows of all parameters.
1311+
#
1312+
#' @rdname rbind
1313+
#' @aliases unionAll
1314+
setMethod("rbind",
1315+
signature(... = "DataFrame"),
1316+
function(x, ..., deparse.level = 1) {
1317+
if (nargs() == 3) {
1318+
unionAll(x, ...)
1319+
} else {
1320+
unionAll(x, Recall(..., deparse.level = 1))
1321+
}
1322+
})
1323+
12341324
#' Intersect
12351325
#'
12361326
#' Return a new DataFrame containing rows only in both this DataFrame
@@ -1322,9 +1412,11 @@ setMethod("write.df",
13221412
"org.apache.spark.sql.parquet")
13231413
}
13241414
allModes <- c("append", "overwrite", "error", "ignore")
1415+
# nolint start
13251416
if (!(mode %in% allModes)) {
13261417
stop('mode should be one of "append", "overwrite", "error", "ignore"')
13271418
}
1419+
# nolint end
13281420
jmode <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "saveMode", mode)
13291421
options <- varargsToEnv(...)
13301422
if (!is.null(path)) {
@@ -1384,9 +1476,11 @@ setMethod("saveAsTable",
13841476
"org.apache.spark.sql.parquet")
13851477
}
13861478
allModes <- c("append", "overwrite", "error", "ignore")
1479+
# nolint start
13871480
if (!(mode %in% allModes)) {
13881481
stop('mode should be one of "append", "overwrite", "error", "ignore"')
13891482
}
1483+
# nolint end
13901484
jmode <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "saveMode", mode)
13911485
options <- varargsToEnv(...)
13921486
callJMethod(df@sdf, "saveAsTable", tableName, source, jmode, options)
@@ -1554,3 +1648,31 @@ setMethod("fillna",
15541648
}
15551649
dataFrame(sdf)
15561650
})
1651+
1652+
#' crosstab
1653+
#'
1654+
#' Computes a pair-wise frequency table of the given columns. Also known as a contingency
1655+
#' table. The number of distinct values for each column should be less than 1e4. At most 1e6
1656+
#' non-zero pair frequencies will be returned.
1657+
#'
1658+
#' @param col1 name of the first column. Distinct items will make the first item of each row.
1659+
#' @param col2 name of the second column. Distinct items will make the column names of the output.
1660+
#' @return a local R data.frame representing the contingency table. The first column of each row
1661+
#' will be the distinct values of `col1` and the column names will be the distinct values
1662+
#' of `col2`. The name of the first column will be `$col1_$col2`. Pairs that have no
1663+
#' occurrences will have zero as their counts.
1664+
#'
1665+
#' @rdname statfunctions
1666+
#' @export
1667+
#' @examples
1668+
#' \dontrun{
1669+
#' df <- jsonFile(sqlCtx, "/path/to/file.json")
1670+
#' ct = crosstab(df, "title", "gender")
1671+
#' }
1672+
setMethod("crosstab",
1673+
signature(x = "DataFrame", col1 = "character", col2 = "character"),
1674+
function(x, col1, col2) {
1675+
statFunctions <- callJMethod(x@sdf, "stat")
1676+
sct <- callJMethod(statFunctions, "crosstab", col1, col2)
1677+
collect(dataFrame(sct))
1678+
})

R/pkg/R/RDD.R

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,9 @@ setMethod("initialize", "PipelinedRDD", function(.Object, prev, func, jrdd_val)
8585

8686
isPipelinable <- function(rdd) {
8787
e <- rdd@env
88+
# nolint start
8889
!(e$isCached || e$isCheckpointed)
90+
# nolint end
8991
}
9092

9193
if (!inherits(prev, "PipelinedRDD") || !isPipelinable(prev)) {
@@ -97,7 +99,8 @@ setMethod("initialize", "PipelinedRDD", function(.Object, prev, func, jrdd_val)
9799
# prev_serializedMode is used during the delayed computation of JRDD in getJRDD
98100
} else {
99101
pipelinedFunc <- function(partIndex, part) {
100-
func(partIndex, prev@func(partIndex, part))
102+
f <- prev@func
103+
func(partIndex, f(partIndex, part))
101104
}
102105
.Object@func <- cleanClosure(pipelinedFunc)
103106
.Object@prev_jrdd <- prev@prev_jrdd # maintain the pipeline
@@ -841,7 +844,7 @@ setMethod("sampleRDD",
841844
if (withReplacement) {
842845
count <- rpois(1, fraction)
843846
if (count > 0) {
844-
res[(len + 1):(len + count)] <- rep(list(elem), count)
847+
res[ (len + 1) : (len + count) ] <- rep(list(elem), count)
845848
len <- len + count
846849
}
847850
} else {
@@ -1261,12 +1264,12 @@ setMethod("pipeRDD",
12611264
signature(x = "RDD", command = "character"),
12621265
function(x, command, env = list()) {
12631266
func <- function(part) {
1264-
trim.trailing.func <- function(x) {
1267+
trim_trailing_func <- function(x) {
12651268
sub("[\r\n]*$", "", toString(x))
12661269
}
1267-
input <- unlist(lapply(part, trim.trailing.func))
1270+
input <- unlist(lapply(part, trim_trailing_func))
12681271
res <- system2(command, stdout = TRUE, input = input, env = env)
1269-
lapply(res, trim.trailing.func)
1272+
lapply(res, trim_trailing_func)
12701273
}
12711274
lapplyPartition(x, func)
12721275
})

R/pkg/R/backend.R

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,8 @@ invokeJava <- function(isStatic, objId, methodName, ...) {
110110

111111
# TODO: check the status code to output error information
112112
returnStatus <- readInt(conn)
113-
stopifnot(returnStatus == 0)
113+
if (returnStatus != 0) {
114+
stop(readString(conn))
115+
}
114116
readObject(conn)
115117
}

R/pkg/R/client.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ generateSparkSubmitArgs <- function(args, sparkHome, jars, sparkSubmitOpts, pack
4848
jars <- paste("--jars", jars)
4949
}
5050

51-
if (packages != "") {
51+
if (!identical(packages, "")) {
5252
packages <- paste("--packages", packages)
5353
}
5454

R/pkg/R/column.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ functions <- c("min", "max", "sum", "avg", "mean", "count", "abs", "sqrt",
6565
"acos", "asin", "atan", "cbrt", "ceiling", "cos", "cosh", "exp",
6666
"expm1", "floor", "log", "log10", "log1p", "rint", "sign",
6767
"sin", "sinh", "tan", "tanh", "toDegrees", "toRadians")
68-
binary_mathfunctions<- c("atan2", "hypot")
68+
binary_mathfunctions <- c("atan2", "hypot")
6969

7070
createOperator <- function(op) {
7171
setMethod(op,

R/pkg/R/context.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ parallelize <- function(sc, coll, numSlices = 1) {
121121
numSlices <- length(coll)
122122

123123
sliceLen <- ceiling(length(coll) / numSlices)
124-
slices <- split(coll, rep(1:(numSlices + 1), each = sliceLen)[1:length(coll)])
124+
slices <- split(coll, rep(1: (numSlices + 1), each = sliceLen)[1:length(coll)])
125125

126126
# Serialize each slice: obtain a list of raws, or a list of lists (slices) of
127127
# 2-tuples of raws

R/pkg/R/deserialize.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,11 +102,11 @@ readList <- function(con) {
102102

103103
readRaw <- function(con) {
104104
dataLen <- readInt(con)
105-
data <- readBin(con, raw(), as.integer(dataLen), endian = "big")
105+
readBin(con, raw(), as.integer(dataLen), endian = "big")
106106
}
107107

108108
readRawLen <- function(con, dataLen) {
109-
data <- readBin(con, raw(), as.integer(dataLen), endian = "big")
109+
readBin(con, raw(), as.integer(dataLen), endian = "big")
110110
}
111111

112112
readDeserialize <- function(con) {

R/pkg/R/generics.R

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,10 @@ setGeneric("count", function(x) { standardGeneric("count") })
5959
# @export
6060
setGeneric("countByValue", function(x) { standardGeneric("countByValue") })
6161

62+
# @rdname statfunctions
63+
# @export
64+
setGeneric("crosstab", function(x, col1, col2) { standardGeneric("crosstab") })
65+
6266
# @rdname distinct
6367
# @export
6468
setGeneric("distinct", function(x, numPartitions = 1) { standardGeneric("distinct") })
@@ -250,8 +254,10 @@ setGeneric("flatMapValues", function(X, FUN) { standardGeneric("flatMapValues")
250254

251255
# @rdname intersection
252256
# @export
253-
setGeneric("intersection", function(x, other, numPartitions = 1) {
254-
standardGeneric("intersection") })
257+
setGeneric("intersection",
258+
function(x, other, numPartitions = 1) {
259+
standardGeneric("intersection")
260+
})
255261

256262
# @rdname keys
257263
# @export
@@ -485,9 +491,7 @@ setGeneric("sample",
485491
#' @rdname sample
486492
#' @export
487493
setGeneric("sample_frac",
488-
function(x, withReplacement, fraction, seed) {
489-
standardGeneric("sample_frac")
490-
})
494+
function(x, withReplacement, fraction, seed) { standardGeneric("sample_frac") })
491495

492496
#' @rdname saveAsParquetFile
493497
#' @export
@@ -549,8 +553,8 @@ setGeneric("withColumn", function(x, colName, col) { standardGeneric("withColumn
549553

550554
#' @rdname withColumnRenamed
551555
#' @export
552-
setGeneric("withColumnRenamed", function(x, existingCol, newCol) {
553-
standardGeneric("withColumnRenamed") })
556+
setGeneric("withColumnRenamed",
557+
function(x, existingCol, newCol) { standardGeneric("withColumnRenamed") })
554558

555559

556560
###################### Column Methods ##########################
@@ -665,3 +669,7 @@ setGeneric("upper", function(x) { standardGeneric("upper") })
665669
#' @rdname glm
666670
#' @export
667671
setGeneric("glm")
672+
673+
#' @rdname rbind
674+
#' @export
675+
setGeneric("rbind", signature = "...")

0 commit comments

Comments
 (0)