
Commit a704376

Merge branch 'master' into sc-4439
2 parents: 17507fa + 06e3398


380 files changed: +8971 -3286 lines


.gitignore

Lines changed: 1 addition & 0 deletions

@@ -23,6 +23,7 @@
 /lib/
 R-unit-tests.log
 R/unit-tests.out
+R/cran-check.out
 build/*.jar
 build/apache-maven*
 build/scala*

.travis.yml

Lines changed: 1 addition & 1 deletion

@@ -44,7 +44,7 @@ notifications:
 # 5. Run maven install before running lint-java.
 install:
   - export MAVEN_SKIP_RC=1
-  - build/mvn -T 4 -q -DskipTests -Pyarn -Phadoop-2.3 -Pkinesis-asl -Phive -Phive-thriftserver install
+  - build/mvn -T 4 -q -DskipTests -Pmesos -Pyarn -Phadoop-2.3 -Pkinesis-asl -Phive -Phive-thriftserver install

 # 6. Run lint-java.
 script:

LICENSE

Lines changed: 1 addition & 1 deletion

@@ -263,7 +263,7 @@ The text of each license is also included at licenses/LICENSE-[project].txt.
 (New BSD license) Protocol Buffer Java API (org.spark-project.protobuf:protobuf-java:2.4.1-shaded - http://code.google.com/p/protobuf)
 (The BSD License) Fortran to Java ARPACK (net.sourceforge.f2j:arpack_combined_all:0.1 - http://f2j.sourceforge.net)
 (The BSD License) xmlenc Library (xmlenc:xmlenc:0.52 - http://xmlenc.sourceforge.net)
-(The New BSD License) Py4J (net.sf.py4j:py4j:0.10.1 - http://py4j.sourceforge.net/)
+(The New BSD License) Py4J (net.sf.py4j:py4j:0.10.3 - http://py4j.sourceforge.net/)
 (Two-clause BSD-style license) JUnit-Interface (com.novocode:junit-interface:0.10 - http://github.com/szeiger/junit-interface/)
 (BSD licence) sbt and sbt-launch-lib.bash
 (BSD 3 Clause) d3.min.js (https://github.com/mbostock/d3/blob/master/LICENSE)

R/WINDOWS.md

Lines changed: 11 additions & 1 deletion

@@ -4,13 +4,23 @@ To build SparkR on Windows, the following steps are required

 1. Install R (>= 3.1) and [Rtools](http://cran.r-project.org/bin/windows/Rtools/). Make sure to
 include Rtools and R in `PATH`.
+
 2. Install
 [JDK7](http://www.oracle.com/technetwork/java/javase/downloads/jdk7-downloads-1880260.html) and set
 `JAVA_HOME` in the system environment variables.
+
 3. Download and install [Maven](http://maven.apache.org/download.html). Also include the `bin`
 directory in Maven in `PATH`.
+
 4. Set `MAVEN_OPTS` as described in [Building Spark](http://spark.apache.org/docs/latest/building-spark.html).
-5. Open a command shell (`cmd`) in the Spark directory and run `mvn -DskipTests -Psparkr package`
+
+5. Open a command shell (`cmd`) in the Spark directory and build Spark with [Maven](http://spark.apache.org/docs/latest/building-spark.html#building-with-buildmvn) and include the `-Psparkr` profile to build the R package. For example to use the default Hadoop versions you can run
+
+    ```bash
+    mvn.cmd -DskipTests -Psparkr package
+    ```
+
+    `.\build\mvn` is a shell script so `mvn.cmd` should be used directly on Windows.

 ## Unit tests

R/check-cran.sh

Lines changed: 15 additions & 3 deletions

@@ -43,10 +43,22 @@ $FWDIR/create-docs.sh
 "$R_SCRIPT_PATH/"R CMD build $FWDIR/pkg

 # Run check as-cran.
-# TODO(shivaram): Remove the skip tests once we figure out the install mechanism
-
 VERSION=`grep Version $FWDIR/pkg/DESCRIPTION | awk '{print $NF}'`

-"$R_SCRIPT_PATH/"R CMD check --as-cran SparkR_"$VERSION".tar.gz
+CRAN_CHECK_OPTIONS="--as-cran"
+
+if [ -n "$NO_TESTS" ]
+then
+  CRAN_CHECK_OPTIONS=$CRAN_CHECK_OPTIONS" --no-tests"
+fi
+
+if [ -n "$NO_MANUAL" ]
+then
+  CRAN_CHECK_OPTIONS=$CRAN_CHECK_OPTIONS" --no-manual"
+fi
+
+echo "Running CRAN check with $CRAN_CHECK_OPTIONS options"
+
+"$R_SCRIPT_PATH/"R CMD check $CRAN_CHECK_OPTIONS SparkR_"$VERSION".tar.gz

 popd > /dev/null

R/pkg/DESCRIPTION

Lines changed: 11 additions & 3 deletions

@@ -2,9 +2,16 @@ Package: SparkR
 Type: Package
 Title: R Frontend for Apache Spark
 Version: 2.0.0
-Date: 2016-07-07
-Author: The Apache Software Foundation
-Maintainer: Shivaram Venkataraman <[email protected]>
+Date: 2016-08-27
+Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"),
+                    email = "[email protected]"),
+             person("Xiangrui", "Meng", role = "aut",
+                    email = "[email protected]"),
+             person("Felix", "Cheung", role = "aut",
+                    email = "[email protected]"),
+             person(family = "The Apache Software Foundation", role = c("aut", "cph")))
+URL: http://www.apache.org/ http://spark.apache.org/
+BugReports: https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark#ContributingtoSpark-ContributingBugReports
 Depends:
     R (>= 3.0),
     methods
@@ -32,6 +39,7 @@ Collate:
     'deserialize.R'
     'functions.R'
     'install.R'
+    'jvm.R'
     'mllib.R'
     'serialize.R'
     'sparkR.R'

R/pkg/NAMESPACE

Lines changed: 10 additions & 1 deletion

@@ -1,5 +1,9 @@
 # Imports from base R
-importFrom(methods, setGeneric, setMethod, setOldClass)
+# Do not include stats:: "rpois", "runif" - causes error at runtime
+importFrom("methods", "setGeneric", "setMethod", "setOldClass")
+importFrom("methods", "is", "new", "signature", "show")
+importFrom("stats", "gaussian", "setNames")
+importFrom("utils", "download.file", "packageVersion", "untar")

 # Disable native libraries till we figure out how to package it
 # See SPARKR-7839
@@ -23,6 +27,7 @@ exportMethods("glm",
               "summary",
               "spark.kmeans",
               "fitted",
+              "spark.mlp",
               "spark.naiveBayes",
               "spark.survreg",
               "spark.lda",
@@ -359,4 +364,8 @@ S3method(structField, jobj)
 S3method(structType, jobj)
 S3method(structType, structField)

+export("sparkR.newJObject")
+export("sparkR.callJMethod")
+export("sparkR.callJStatic")
+
 export("install.spark")

R/pkg/R/DataFrame.R

Lines changed: 37 additions & 38 deletions

@@ -150,7 +150,7 @@ setMethod("explain",

 #' isLocal
 #'
-#' Returns True if the `collect` and `take` methods can be run locally
+#' Returns True if the \code{collect} and \code{take} methods can be run locally
 #' (without any Spark executors).
 #'
 #' @param x A SparkDataFrame
@@ -182,7 +182,7 @@ setMethod("isLocal",
 #' @param numRows the number of rows to print. Defaults to 20.
 #' @param truncate whether truncate long strings. If \code{TRUE}, strings more than
 #' 20 characters will be truncated. However, if set greater than zero,
-#' truncates strings longer than `truncate` characters and all cells
+#' truncates strings longer than \code{truncate} characters and all cells
 #' will be aligned right.
 #' @param ... further arguments to be passed to or from other methods.
 #' @family SparkDataFrame functions
@@ -212,9 +212,9 @@ setMethod("showDF",

 #' show
 #'
-#' Print the SparkDataFrame column names and types
+#' Print class and type information of a Spark object.
 #'
-#' @param object a SparkDataFrame.
+#' @param object a Spark object. Can be a SparkDataFrame, Column, GroupedData, WindowSpec.
 #'
 #' @family SparkDataFrame functions
 #' @rdname show
@@ -642,10 +642,10 @@ setMethod("unpersist",
 #' The following options for repartition are possible:
 #' \itemize{
 #'  \item{1.} {Return a new SparkDataFrame partitioned by
-#' the given columns into `numPartitions`.}
-#' \item{2.} {Return a new SparkDataFrame that has exactly `numPartitions`.}
+#' the given columns into \code{numPartitions}.}
+#' \item{2.} {Return a new SparkDataFrame that has exactly \code{numPartitions}.}
 #' \item{3.} {Return a new SparkDataFrame partitioned by the given column(s),
-#' using `spark.sql.shuffle.partitions` as number of partitions.}
+#' using \code{spark.sql.shuffle.partitions} as number of partitions.}
 #'}
 #' @param x a SparkDataFrame.
 #' @param numPartitions the number of partitions to use.
@@ -1132,9 +1132,8 @@ setMethod("take",

 #' Head
 #'
-#' Return the first NUM rows of a SparkDataFrame as a R data.frame. If NUM is NULL,
-#' then head() returns the first 6 rows in keeping with the current data.frame
-#' convention in R.
+#' Return the first \code{num} rows of a SparkDataFrame as a R data.frame. If \code{num} is not
+#' specified, then head() returns the first 6 rows as with R data.frame.
 #'
 #' @param x a SparkDataFrame.
 #' @param num the number of rows to return. Default is 6.
@@ -1406,11 +1405,11 @@ setMethod("dapplyCollect",
 #'
 #' @param cols grouping columns.
 #' @param func a function to be applied to each group partition specified by grouping
-#' column of the SparkDataFrame. The function `func` takes as argument
+#' column of the SparkDataFrame. The function \code{func} takes as argument
 #' a key - grouping columns and a data frame - a local R data.frame.
-#' The output of `func` is a local R data.frame.
+#' The output of \code{func} is a local R data.frame.
 #' @param schema the schema of the resulting SparkDataFrame after the function is applied.
-#' The schema must match to output of `func`. It has to be defined for each
+#' The schema must match to output of \code{func}. It has to be defined for each
 #' output column with preferred output column name and corresponding data type.
 #' @return A SparkDataFrame.
 #' @family SparkDataFrame functions
@@ -1497,9 +1496,9 @@ setMethod("gapply",
 #'
 #' @param cols grouping columns.
 #' @param func a function to be applied to each group partition specified by grouping
-#' column of the SparkDataFrame. The function `func` takes as argument
+#' column of the SparkDataFrame. The function \code{func} takes as argument
 #' a key - grouping columns and a data frame - a local R data.frame.
-#' The output of `func` is a local R data.frame.
+#' The output of \code{func} is a local R data.frame.
 #' @return A data.frame.
 #' @family SparkDataFrame functions
 #' @aliases gapplyCollect,SparkDataFrame-method
@@ -1657,7 +1656,7 @@ setMethod("$", signature(x = "SparkDataFrame"),
 getColumn(x, name)
 })

-#' @param value a Column or NULL. If NULL, the specified Column is dropped.
+#' @param value a Column or \code{NULL}. If \code{NULL}, the specified Column is dropped.
 #' @rdname select
 #' @name $<-
 #' @aliases $<-,SparkDataFrame-method
@@ -1747,7 +1746,7 @@ setMethod("[", signature(x = "SparkDataFrame"),
 #' @family subsetting functions
 #' @examples
 #' \dontrun{
-#' # Columns can be selected using `[[` and `[`
+#' # Columns can be selected using [[ and [
 #' df[[2]] == df[["age"]]
 #' df[,2] == df[,"age"]
 #' df[,c("name", "age")]
@@ -1792,7 +1791,7 @@ setMethod("subset", signature(x = "SparkDataFrame"),
 #' select(df, df$name, df$age + 1)
 #' select(df, c("col1", "col2"))
 #' select(df, list(df$name, df$age + 1))
-#' # Similar to R data frames columns can also be selected using `$`
+#' # Similar to R data frames columns can also be selected using $
 #' df[,df$age]
 #' }
 #' @note select(SparkDataFrame, character) since 1.4.0
@@ -2443,7 +2442,7 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) {
 #' Return a new SparkDataFrame containing the union of rows
 #'
 #' Return a new SparkDataFrame containing the union of rows in this SparkDataFrame
-#' and another SparkDataFrame. This is equivalent to `UNION ALL` in SQL.
+#' and another SparkDataFrame. This is equivalent to \code{UNION ALL} in SQL.
 #' Note that this does not remove duplicate rows across the two SparkDataFrames.
 #'
 #' @param x A SparkDataFrame
@@ -2486,7 +2485,7 @@ setMethod("unionAll",

 #' Union two or more SparkDataFrames
 #'
-#' Union two or more SparkDataFrames. This is equivalent to `UNION ALL` in SQL.
+#' Union two or more SparkDataFrames. This is equivalent to \code{UNION ALL} in SQL.
 #' Note that this does not remove duplicate rows across the two SparkDataFrames.
 #'
 #' @param x a SparkDataFrame.
@@ -2519,7 +2518,7 @@ setMethod("rbind",
 #' Intersect
 #'
 #' Return a new SparkDataFrame containing rows only in both this SparkDataFrame
-#' and another SparkDataFrame. This is equivalent to `INTERSECT` in SQL.
+#' and another SparkDataFrame. This is equivalent to \code{INTERSECT} in SQL.
 #'
 #' @param x A SparkDataFrame
 #' @param y A SparkDataFrame
@@ -2547,7 +2546,7 @@ setMethod("intersect",
 #' except
 #'
 #' Return a new SparkDataFrame containing rows in this SparkDataFrame
-#' but not in another SparkDataFrame. This is equivalent to `EXCEPT` in SQL.
+#' but not in another SparkDataFrame. This is equivalent to \code{EXCEPT} in SQL.
 #'
 #' @param x a SparkDataFrame.
 #' @param y a SparkDataFrame.
@@ -2576,8 +2575,8 @@ setMethod("except",

 #' Save the contents of SparkDataFrame to a data source.
 #'
-#' The data source is specified by the `source` and a set of options (...).
-#' If `source` is not specified, the default data source configured by
+#' The data source is specified by the \code{source} and a set of options (...).
+#' If \code{source} is not specified, the default data source configured by
 #' spark.sql.sources.default will be used.
 #'
 #' Additionally, mode is used to specify the behavior of the save operation when data already
@@ -2613,7 +2612,7 @@ setMethod("except",
 #' @note write.df since 1.4.0
 setMethod("write.df",
 signature(df = "SparkDataFrame", path = "character"),
-function(df, path, source = NULL, mode = "error", ...){
+function(df, path, source = NULL, mode = "error", ...) {
 if (is.null(source)) {
 source <- getDefaultSqlSource()
 }
@@ -2635,14 +2634,14 @@ setMethod("write.df",
 #' @note saveDF since 1.4.0
 setMethod("saveDF",
 signature(df = "SparkDataFrame", path = "character"),
-function(df, path, source = NULL, mode = "error", ...){
+function(df, path, source = NULL, mode = "error", ...) {
 write.df(df, path, source, mode, ...)
 })

 #' Save the contents of the SparkDataFrame to a data source as a table
 #'
-#' The data source is specified by the `source` and a set of options (...).
-#' If `source` is not specified, the default data source configured by
+#' The data source is specified by the \code{source} and a set of options (...).
+#' If \code{source} is not specified, the default data source configured by
 #' spark.sql.sources.default will be used.
 #'
 #' Additionally, mode is used to specify the behavior of the save operation when
@@ -2675,7 +2674,7 @@ setMethod("saveDF",
 #' @note saveAsTable since 1.4.0
 setMethod("saveAsTable",
 signature(df = "SparkDataFrame", tableName = "character"),
-function(df, tableName, source = NULL, mode="error", ...){
+function(df, tableName, source = NULL, mode="error", ...) {
 if (is.null(source)) {
 source <- getDefaultSqlSource()
 }
@@ -2752,11 +2751,11 @@ setMethod("summary",
 #' @param how "any" or "all".
 #' if "any", drop a row if it contains any nulls.
 #' if "all", drop a row only if all its values are null.
-#' if minNonNulls is specified, how is ignored.
+#' if \code{minNonNulls} is specified, how is ignored.
 #' @param minNonNulls if specified, drop rows that have less than
-#' minNonNulls non-null values.
+#' \code{minNonNulls} non-null values.
 #' This overwrites the how parameter.
-#' @param cols optional list of column names to consider. In `fillna`,
+#' @param cols optional list of column names to consider. In \code{fillna},
 #' columns specified in cols that do not have matching data
 #' type are ignored. For example, if value is a character, and
 #' subset contains a non-character column, then the non-character
@@ -2879,8 +2878,8 @@ setMethod("fillna",
 #' in your system to accommodate the contents.
 #'
 #' @param x a SparkDataFrame.
-#' @param row.names NULL or a character vector giving the row names for the data frame.
-#' @param optional If `TRUE`, converting column names is optional.
+#' @param row.names \code{NULL} or a character vector giving the row names for the data frame.
+#' @param optional If \code{TRUE}, converting column names is optional.
 #' @param ... additional arguments to pass to base::as.data.frame.
 #' @return A data.frame.
 #' @family SparkDataFrame functions
@@ -3058,7 +3057,7 @@ setMethod("str",
 #' @note drop since 2.0.0
 setMethod("drop",
 signature(x = "SparkDataFrame"),
-function(x, col, ...) {
+function(x, col) {
 stopifnot(class(col) == "character" || class(col) == "Column")

 if (class(col) == "Column") {
@@ -3218,8 +3217,8 @@ setMethod("histogram",
 #' and to not change the existing data.
 #' }
 #'
-#' @param x s SparkDataFrame.
-#' @param url JDBC database url of the form `jdbc:subprotocol:subname`.
+#' @param x a SparkDataFrame.
+#' @param url JDBC database url of the form \code{jdbc:subprotocol:subname}.
 #' @param tableName yhe name of the table in the external database.
 #' @param mode one of 'append', 'overwrite', 'error', 'ignore' save mode (it is 'error' by default).
 #' @param ... additional JDBC database connection properties.
@@ -3237,7 +3236,7 @@ setMethod("histogram",
 #' @note write.jdbc since 2.0.0
 setMethod("write.jdbc",
 signature(x = "SparkDataFrame", url = "character", tableName = "character"),
-function(x, url, tableName, mode = "error", ...){
+function(x, url, tableName, mode = "error", ...) {
 jmode <- convertToJSaveMode(mode)
 jprops <- varargsToJProperties(...)
 write <- callJMethod(x@sdf, "write")
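
The reworded `gapply`/`gapplyCollect` docs above describe `func` as receiving a grouping key plus the group's rows as a local R data.frame, and returning a local R data.frame that matches `schema`. A minimal sketch of that contract, assuming an active SparkR session (the column names and the `mtcars` toy data are illustrative only):

```r
library(SparkR)
sparkR.session()

df <- createDataFrame(mtcars)

# Schema of the data.frame returned by func: one row per group.
schema <- structType(structField("cyl", "double"),
                     structField("avg_mpg", "double"))

# func is applied once per "cyl" group; key holds the grouping value(s).
avg_by_cyl <- gapply(df, "cyl",
                     function(key, x) {
                       data.frame(key, mean(x$mpg))
                     },
                     schema)

head(collect(avg_by_cyl))
```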
