
Commit 0731dd9

Merge remote-tracking branch 'upstream/master'
2 parents: a2aab3d + be80def

1,449 files changed: 94,147 additions, 26,539 deletions


.rat-excludes

Lines changed: 7 additions & 0 deletions
@@ -86,4 +86,11 @@ local-1430917381535_2
 DESCRIPTION
 NAMESPACE
 test_support/*
+.*Rd
+help/*
+html/*
+INDEX
 .lintr
+gen-java.*
+.*avpr
+org.apache.spark.sql.sources.DataSourceRegister

LICENSE

Lines changed: 1 addition & 1 deletion
@@ -948,6 +948,6 @@ The following components are provided under the MIT License. See project link for details.
      (MIT License) SLF4J LOG4J-12 Binding (org.slf4j:slf4j-log4j12:1.7.5 - http://www.slf4j.org)
      (MIT License) pyrolite (org.spark-project:pyrolite:2.0.1 - http://pythonhosted.org/Pyro4/)
      (MIT License) scopt (com.github.scopt:scopt_2.10:3.2.0 - https://github.com/scopt/scopt)
-     (The MIT License) Mockito (org.mockito:mockito-all:1.8.5 - http://www.mockito.org)
+     (The MIT License) Mockito (org.mockito:mockito-core:1.9.5 - http://www.mockito.org)
      (MIT License) jquery (https://jquery.org/license/)
      (MIT License) AnchorJS (https://github.com/bryanbraun/anchorjs)

R/README.md

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@ SparkR is an R package that provides a light-weight frontend to use Spark from R
 
 #### Build Spark
 
-Build Spark with [Maven](http://spark.apache.org/docs/latest/building-spark.html#building-with-buildmvn) and include the `-PsparkR` profile to build the R package. For example to use the default Hadoop versions you can run
+Build Spark with [Maven](http://spark.apache.org/docs/latest/building-spark.html#building-with-buildmvn) and include the `-Psparkr` profile to build the R package. For example to use the default Hadoop versions you can run
 ```
 build/mvn -DskipTests -Psparkr package
 ```

R/install-dev.bat

Lines changed: 5 additions & 0 deletions
@@ -25,3 +25,8 @@ set SPARK_HOME=%~dp0..
 MKDIR %SPARK_HOME%\R\lib
 
 R.exe CMD INSTALL --library="%SPARK_HOME%\R\lib" %SPARK_HOME%\R\pkg\
+
+rem Zip the SparkR package so that it can be distributed to worker nodes on YARN
+pushd %SPARK_HOME%\R\lib
+%JAVA_HOME%\bin\jar.exe cfM "%SPARK_HOME%\R\lib\sparkr.zip" SparkR
+popd

R/install-dev.sh

Lines changed: 2 additions & 2 deletions
@@ -34,12 +34,12 @@ LIB_DIR="$FWDIR/lib"
 
 mkdir -p $LIB_DIR
 
-pushd $FWDIR
+pushd $FWDIR > /dev/null
 
 # Generate Rd files if devtools is installed
 Rscript -e ' if("devtools" %in% rownames(installed.packages())) { library(devtools); devtools::document(pkg="./pkg", roclets=c("rd")) }'
 
 # Install SparkR to $LIB_DIR
 R CMD INSTALL --library=$LIB_DIR $FWDIR/pkg/
 
-popd
+popd > /dev/null

R/pkg/DESCRIPTION

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ Collate:
     'client.R'
     'context.R'
     'deserialize.R'
+    'mllib.R'
     'serialize.R'
     'sparkR.R'
     'utils.R'
-    'zzz.R'

R/pkg/NAMESPACE

Lines changed: 15 additions & 0 deletions
@@ -10,6 +10,11 @@ export("sparkR.init")
 export("sparkR.stop")
 export("print.jobj")
 
+# MLlib integration
+exportMethods("glm",
+              "predict",
+              "summary")
+
 # Job group lifecycle management methods
 export("setJobGroup",
        "clearJobGroup",
@@ -22,7 +27,9 @@ exportMethods("arrange",
               "collect",
               "columns",
               "count",
+              "crosstab",
               "describe",
+              "dim",
               "distinct",
               "dropna",
               "dtypes",
@@ -39,11 +46,16 @@ exportMethods("arrange",
               "isLocal",
               "join",
               "limit",
+              "merge",
+              "names",
+              "ncol",
+              "nrow",
               "orderBy",
               "mutate",
               "names",
               "persist",
               "printSchema",
+              "rbind",
               "registerTempTable",
               "rename",
               "repartition",
@@ -58,8 +70,10 @@ exportMethods("arrange",
               "show",
               "showDF",
               "summarize",
+              "summary",
               "take",
               "unionAll",
+              "unique",
               "unpersist",
               "where",
               "withColumn",
@@ -77,6 +91,7 @@ exportMethods("abs",
               "atan",
               "atan2",
               "avg",
+              "between",
               "cast",
               "cbrt",
               "ceiling",

R/pkg/R/DataFrame.R

Lines changed: 151 additions & 7 deletions
@@ -169,8 +169,8 @@ setMethod("isLocal",
 #'}
 setMethod("showDF",
           signature(x = "DataFrame"),
-          function(x, numRows = 20) {
-            s <- callJMethod(x@sdf, "showString", numToInt(numRows))
+          function(x, numRows = 20, truncate = TRUE) {
+            s <- callJMethod(x@sdf, "showString", numToInt(numRows), truncate)
             cat(s)
           })
 
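The new `truncate` argument is forwarded to the JVM `showString` call; when `TRUE` (the default), long cell values are shortened for display. A quick sketch, assuming a DataFrame `df` such as the ones in this file's roxygen examples:

```
showDF(df, numRows = 5)                    # default: long values truncated
showDF(df, numRows = 5, truncate = FALSE)  # print cell values in full
```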
@@ -255,6 +255,16 @@ setMethod("names",
             columns(x)
           })
 
+#' @rdname columns
+setMethod("names<-",
+          signature(x = "DataFrame"),
+          function(x, value) {
+            if (!is.null(value)) {
+              sdf <- callJMethod(x@sdf, "toDF", listToSeq(as.list(value)))
+              dataFrame(sdf)
+            }
+          })
+
 #' Register Temporary Table
 #'
 #' Registers a DataFrame as a Temporary Table in the SQLContext
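
With this replacement method, column renaming follows the base R idiom. A sketch, assuming `df` has exactly three columns:

```
# names<- builds a new DataFrame via toDF and rebinds df to it.
names(df) <- c("id", "name", "age")
columns(df)  # "id" "name" "age"
```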
@@ -473,6 +483,18 @@ setMethod("distinct",
             dataFrame(sdf)
           })
 
+#' @title Distinct rows in a DataFrame
+#'
+#' @description Returns a new DataFrame containing distinct rows in this DataFrame
+#'
+#' @rdname unique
+#' @aliases unique
+setMethod("unique",
+          signature(x = "DataFrame"),
+          function(x) {
+            distinct(x)
+          })
+
 #' Sample
 #'
 #' Return a sampled subset of this DataFrame using a random seed.
@@ -534,6 +556,58 @@ setMethod("count",
             callJMethod(x@sdf, "count")
           })
 
+#' @title Number of rows for a DataFrame
+#' @description Returns number of rows in a DataFrame
+#'
+#' @name nrow
+#'
+#' @rdname nrow
+#' @aliases count
+setMethod("nrow",
+          signature(x = "DataFrame"),
+          function(x) {
+            count(x)
+          })
+
+#' Returns the number of columns in a DataFrame
+#'
+#' @param x a SparkSQL DataFrame
+#'
+#' @rdname ncol
+#' @export
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' sqlContext <- sparkRSQL.init(sc)
+#' path <- "path/to/file.json"
+#' df <- jsonFile(sqlContext, path)
+#' ncol(df)
+#' }
+setMethod("ncol",
+          signature(x = "DataFrame"),
+          function(x) {
+            length(columns(x))
+          })
+
+#' Returns the dimensions (number of rows and columns) of a DataFrame
+#' @param x a SparkSQL DataFrame
+#'
+#' @rdname dim
+#' @export
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' sqlContext <- sparkRSQL.init(sc)
+#' path <- "path/to/file.json"
+#' df <- jsonFile(sqlContext, path)
+#' dim(df)
+#' }
+setMethod("dim",
+          signature(x = "DataFrame"),
+          function(x) {
+            c(count(x), ncol(x))
+          })
+
 #' Collects all the elements of a Spark DataFrame and coerces them into an R data.frame.
 #'
 #' @param x A SparkSQL DataFrame
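
Together with `unique` above, these shims let base R introspection idioms work on a distributed DataFrame. A short sketch, again assuming the `df` from the roxygen examples:

```
nrow(df)               # delegates to count(df), which runs a Spark job
ncol(df)               # length(columns(df)); no job needed
dim(df)                # c(nrow(df), ncol(df))
deduped <- unique(df)  # same as distinct(df)
```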
@@ -1205,6 +1279,15 @@ setMethod("join",
             dataFrame(sdf)
           })
 
+#' @rdname merge
+#' @aliases join
+setMethod("merge",
+          signature(x = "DataFrame", y = "DataFrame"),
+          function(x, y, joinExpr = NULL, joinType = NULL, ...) {
+            join(x, y, joinExpr, joinType)
+          })
+
+
 #' UnionAll
 #'
 #' Return a new DataFrame containing the union of rows in this DataFrame
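
Note that `merge` is a thin alias for `join`: `joinExpr` is a SparkR Column expression, not the `by =` column names that base R's `merge` accepts. A sketch with two illustrative DataFrames `df1` and `df2` sharing a `name` column:

```
# Inner join on a column-equality expression.
joined <- merge(df1, df2, df1$name == df2$name, "inner")
```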
@@ -1231,6 +1314,22 @@ setMethod("unionAll",
             dataFrame(unioned)
           })
 
+#' @title Union two or more DataFrames
+#'
+#' @description Returns a new DataFrame containing rows of all parameters.
+#'
+#' @rdname rbind
+#' @aliases unionAll
+setMethod("rbind",
+          signature(... = "DataFrame"),
+          function(x, ..., deparse.level = 1) {
+            if (nargs() == 3) {
+              unionAll(x, ...)
+            } else {
+              unionAll(x, Recall(..., deparse.level = 1))
+            }
+          })
+
 #' Intersect
 #'
 #' Return a new DataFrame containing rows only in both this DataFrame
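
`rbind` folds any number of DataFrames into nested `unionAll` calls via `Recall`; the `nargs() == 3` test detects the two-frame base case (`deparse.level` accounts for the third argument). A sketch, assuming `df1`, `df2`, and `df3` share a schema:

```
# Equivalent to unionAll(df1, unionAll(df2, df3)).
stacked <- rbind(df1, df2, df3)
```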
@@ -1314,21 +1413,23 @@ setMethod("except",
 #' write.df(df, "myfile", "parquet", "overwrite")
 #' }
 setMethod("write.df",
-          signature(df = "DataFrame", path = 'character'),
+          signature(df = "DataFrame", path = "character"),
           function(df, path, source = NULL, mode = "append", ...){
             if (is.null(source)) {
               sqlContext <- get(".sparkRSQLsc", envir = .sparkREnv)
               source <- callJMethod(sqlContext, "getConf", "spark.sql.sources.default",
                                     "org.apache.spark.sql.parquet")
             }
             allModes <- c("append", "overwrite", "error", "ignore")
+            # nolint start
             if (!(mode %in% allModes)) {
               stop('mode should be one of "append", "overwrite", "error", "ignore"')
             }
+            # nolint end
             jmode <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "saveMode", mode)
             options <- varargsToEnv(...)
             if (!is.null(path)) {
-              options[['path']] = path
+              options[["path"]] <- path
             }
             callJMethod(df@sdf, "save", source, jmode, options)
           })
@@ -1337,7 +1438,7 @@ setMethod("write.df",
 #' @aliases saveDF
 #' @export
 setMethod("saveDF",
-          signature(df = "DataFrame", path = 'character'),
+          signature(df = "DataFrame", path = "character"),
           function(df, path, source = NULL, mode = "append", ...){
             write.df(df, path, source, mode, ...)
           })
@@ -1375,18 +1476,20 @@ setMethod("saveDF",
 #' saveAsTable(df, "myfile")
 #' }
 setMethod("saveAsTable",
-          signature(df = "DataFrame", tableName = 'character', source = 'character',
-                    mode = 'character'),
+          signature(df = "DataFrame", tableName = "character", source = "character",
+                    mode = "character"),
           function(df, tableName, source = NULL, mode="append", ...){
             if (is.null(source)) {
               sqlContext <- get(".sparkRSQLsc", envir = .sparkREnv)
               source <- callJMethod(sqlContext, "getConf", "spark.sql.sources.default",
                                     "org.apache.spark.sql.parquet")
             }
             allModes <- c("append", "overwrite", "error", "ignore")
+            # nolint start
             if (!(mode %in% allModes)) {
               stop('mode should be one of "append", "overwrite", "error", "ignore"')
             }
+            # nolint end
             jmode <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "saveMode", mode)
             options <- varargsToEnv(...)
             callJMethod(df@sdf, "saveAsTable", tableName, source, jmode, options)
@@ -1430,6 +1533,19 @@ setMethod("describe",
             dataFrame(sdf)
           })
 
+#' @title Summary
+#'
+#' @description Computes statistics for numeric columns of the DataFrame
+#'
+#' @rdname summary
+#' @aliases describe
+setMethod("summary",
+          signature(x = "DataFrame"),
+          function(x) {
+            describe(x)
+          })
+
+
 #' dropna
 #'
 #' Returns a new DataFrame omitting rows with null values.
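
Like `unique` and `nrow` above, `summary` is a base-R-flavored alias, here for `describe`. A sketch:

```
# A DataFrame of count/mean/stddev/min/max per numeric column;
# collect() brings it back as a local R data.frame.
collect(summary(df))
```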
@@ -1554,3 +1670,31 @@ setMethod("fillna",
             }
             dataFrame(sdf)
           })
+
+#' crosstab
+#'
+#' Computes a pair-wise frequency table of the given columns. Also known as a contingency
+#' table. The number of distinct values for each column should be less than 1e4. At most 1e6
+#' non-zero pair frequencies will be returned.
+#'
+#' @param col1 name of the first column. Distinct items will make the first item of each row.
+#' @param col2 name of the second column. Distinct items will make the column names of the output.
+#' @return a local R data.frame representing the contingency table. The first column of each row
+#'         will be the distinct values of `col1` and the column names will be the distinct values
+#'         of `col2`. The name of the first column will be `$col1_$col2`. Pairs that have no
+#'         occurrences will have zero as their counts.
+#'
+#' @rdname statfunctions
+#' @export
+#' @examples
+#' \dontrun{
+#' df <- jsonFile(sqlCtx, "/path/to/file.json")
+#' ct <- crosstab(df, "title", "gender")
+#' }
+setMethod("crosstab",
+          signature(x = "DataFrame", col1 = "character", col2 = "character"),
+          function(x, col1, col2) {
+            statFunctions <- callJMethod(x@sdf, "stat")
+            sct <- callJMethod(statFunctions, "crosstab", col1, col2)
+            collect(dataFrame(sct))
+          })
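
A sketch of the locally collected result shape (the values are illustrative only):

```
ct <- crosstab(df, "title", "gender")
# ct is a local data.frame along the lines of:
#   title_gender male female
# 1     engineer   10      5
# 2      analyst    4      9
```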
