
Commit a704376

Merge branch 'master' into sc-4439
2 parents: 17507fa + 06e3398


380 files changed: +8971 -3286 lines


.gitignore

Lines changed: 1 addition & 0 deletions

@@ -23,6 +23,7 @@
 /lib/
 R-unit-tests.log
 R/unit-tests.out
+R/cran-check.out
 build/*.jar
 build/apache-maven*
 build/scala*

.travis.yml

Lines changed: 1 addition & 1 deletion

@@ -44,7 +44,7 @@ notifications:
 # 5. Run maven install before running lint-java.
 install:
   - export MAVEN_SKIP_RC=1
-  - build/mvn -T 4 -q -DskipTests -Pyarn -Phadoop-2.3 -Pkinesis-asl -Phive -Phive-thriftserver install
+  - build/mvn -T 4 -q -DskipTests -Pmesos -Pyarn -Phadoop-2.3 -Pkinesis-asl -Phive -Phive-thriftserver install

 # 6. Run lint-java.
 script:

LICENSE

Lines changed: 1 addition & 1 deletion

@@ -263,7 +263,7 @@ The text of each license is also included at licenses/LICENSE-[project].txt.
 (New BSD license) Protocol Buffer Java API (org.spark-project.protobuf:protobuf-java:2.4.1-shaded - http://code.google.com/p/protobuf)
 (The BSD License) Fortran to Java ARPACK (net.sourceforge.f2j:arpack_combined_all:0.1 - http://f2j.sourceforge.net)
 (The BSD License) xmlenc Library (xmlenc:xmlenc:0.52 - http://xmlenc.sourceforge.net)
-(The New BSD License) Py4J (net.sf.py4j:py4j:0.10.1 - http://py4j.sourceforge.net/)
+(The New BSD License) Py4J (net.sf.py4j:py4j:0.10.3 - http://py4j.sourceforge.net/)
 (Two-clause BSD-style license) JUnit-Interface (com.novocode:junit-interface:0.10 - http://github.com/szeiger/junit-interface/)
 (BSD licence) sbt and sbt-launch-lib.bash
 (BSD 3 Clause) d3.min.js (https://github.com/mbostock/d3/blob/master/LICENSE)

R/WINDOWS.md

Lines changed: 11 additions & 1 deletion

@@ -4,13 +4,23 @@ To build SparkR on Windows, the following steps are required

 1. Install R (>= 3.1) and [Rtools](http://cran.r-project.org/bin/windows/Rtools/). Make sure to
 include Rtools and R in `PATH`.
+
 2. Install
 [JDK7](http://www.oracle.com/technetwork/java/javase/downloads/jdk7-downloads-1880260.html) and set
 `JAVA_HOME` in the system environment variables.
+
 3. Download and install [Maven](http://maven.apache.org/download.html). Also include the `bin`
 directory in Maven in `PATH`.
+
 4. Set `MAVEN_OPTS` as described in [Building Spark](http://spark.apache.org/docs/latest/building-spark.html).
-5. Open a command shell (`cmd`) in the Spark directory and run `mvn -DskipTests -Psparkr package`
+
+5. Open a command shell (`cmd`) in the Spark directory and build Spark with [Maven](http://spark.apache.org/docs/latest/building-spark.html#building-with-buildmvn) and include the `-Psparkr` profile to build the R package. For example to use the default Hadoop versions you can run
+
+    ```bash
+    mvn.cmd -DskipTests -Psparkr package
+    ```
+
+    `.\build\mvn` is a shell script so `mvn.cmd` should be used directly on Windows.

 ## Unit tests

R/check-cran.sh

Lines changed: 15 additions & 3 deletions

@@ -43,10 +43,22 @@ $FWDIR/create-docs.sh
 "$R_SCRIPT_PATH/"R CMD build $FWDIR/pkg

 # Run check as-cran.
-# TODO(shivaram): Remove the skip tests once we figure out the install mechanism
-
 VERSION=`grep Version $FWDIR/pkg/DESCRIPTION | awk '{print $NF}'`

-"$R_SCRIPT_PATH/"R CMD check --as-cran SparkR_"$VERSION".tar.gz
+CRAN_CHECK_OPTIONS="--as-cran"
+
+if [ -n "$NO_TESTS" ]
+then
+  CRAN_CHECK_OPTIONS=$CRAN_CHECK_OPTIONS" --no-tests"
+fi
+
+if [ -n "$NO_MANUAL" ]
+then
+  CRAN_CHECK_OPTIONS=$CRAN_CHECK_OPTIONS" --no-manual"
+fi
+
+echo "Running CRAN check with $CRAN_CHECK_OPTIONS options"
+
+"$R_SCRIPT_PATH/"R CMD check $CRAN_CHECK_OPTIONS SparkR_"$VERSION".tar.gz

 popd > /dev/null

R/pkg/DESCRIPTION

Lines changed: 11 additions & 3 deletions

@@ -2,9 +2,16 @@ Package: SparkR
 Type: Package
 Title: R Frontend for Apache Spark
 Version: 2.0.0
-Date: 2016-07-07
-Author: The Apache Software Foundation
-Maintainer: Shivaram Venkataraman <[email protected]>
+Date: 2016-08-27
+Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"),
+                    email = "[email protected]"),
+             person("Xiangrui", "Meng", role = "aut",
+                    email = "[email protected]"),
+             person("Felix", "Cheung", role = "aut",
+                    email = "[email protected]"),
+             person(family = "The Apache Software Foundation", role = c("aut", "cph")))
+URL: http://www.apache.org/ http://spark.apache.org/
+BugReports: https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark#ContributingtoSpark-ContributingBugReports
 Depends:
     R (>= 3.0),
     methods
@@ -32,6 +39,7 @@ Collate:
     'deserialize.R'
     'functions.R'
     'install.R'
+    'jvm.R'
     'mllib.R'
     'serialize.R'
     'sparkR.R'

R/pkg/NAMESPACE

Lines changed: 10 additions & 1 deletion

@@ -1,5 +1,9 @@
 # Imports from base R
-importFrom(methods, setGeneric, setMethod, setOldClass)
+# Do not include stats:: "rpois", "runif" - causes error at runtime
+importFrom("methods", "setGeneric", "setMethod", "setOldClass")
+importFrom("methods", "is", "new", "signature", "show")
+importFrom("stats", "gaussian", "setNames")
+importFrom("utils", "download.file", "packageVersion", "untar")

 # Disable native libraries till we figure out how to package it
 # See SPARKR-7839
@@ -23,6 +27,7 @@ exportMethods("glm",
               "summary",
               "spark.kmeans",
               "fitted",
+              "spark.mlp",
               "spark.naiveBayes",
               "spark.survreg",
               "spark.lda",
@@ -359,4 +364,8 @@ S3method(structField, jobj)
 S3method(structType, jobj)
 S3method(structType, structField)

+export("sparkR.newJObject")
+export("sparkR.callJMethod")
+export("sparkR.callJStatic")
+
 export("install.spark")

R/pkg/R/DataFrame.R

Lines changed: 37 additions & 38 deletions

@@ -150,7 +150,7 @@ setMethod("explain",

 #' isLocal
 #'
-#' Returns True if the `collect` and `take` methods can be run locally
+#' Returns True if the \code{collect} and \code{take} methods can be run locally
 #' (without any Spark executors).
 #'
 #' @param x A SparkDataFrame
@@ -182,7 +182,7 @@ setMethod("isLocal",
 #' @param numRows the number of rows to print. Defaults to 20.
 #' @param truncate whether truncate long strings. If \code{TRUE}, strings more than
 #' 20 characters will be truncated. However, if set greater than zero,
-#' truncates strings longer than `truncate` characters and all cells
+#' truncates strings longer than \code{truncate} characters and all cells
 #' will be aligned right.
 #' @param ... further arguments to be passed to or from other methods.
 #' @family SparkDataFrame functions
@@ -212,9 +212,9 @@ setMethod("showDF",

 #' show
 #'
-#' Print the SparkDataFrame column names and types
+#' Print class and type information of a Spark object.
 #'
-#' @param object a SparkDataFrame.
+#' @param object a Spark object. Can be a SparkDataFrame, Column, GroupedData, WindowSpec.
 #'
 #' @family SparkDataFrame functions
 #' @rdname show
@@ -642,10 +642,10 @@ setMethod("unpersist",
 #' The following options for repartition are possible:
 #' \itemize{
 #'  \item{1.} {Return a new SparkDataFrame partitioned by
-#' the given columns into `numPartitions`.}
-#' \item{2.} {Return a new SparkDataFrame that has exactly `numPartitions`.}
+#' the given columns into \code{numPartitions}.}
+#' \item{2.} {Return a new SparkDataFrame that has exactly \code{numPartitions}.}
 #' \item{3.} {Return a new SparkDataFrame partitioned by the given column(s),
-#' using `spark.sql.shuffle.partitions` as number of partitions.}
+#' using \code{spark.sql.shuffle.partitions} as number of partitions.}
 #'}
 #' @param x a SparkDataFrame.
 #' @param numPartitions the number of partitions to use.
@@ -1132,9 +1132,8 @@ setMethod("take",

 #' Head
 #'
-#' Return the first NUM rows of a SparkDataFrame as a R data.frame. If NUM is NULL,
-#' then head() returns the first 6 rows in keeping with the current data.frame
-#' convention in R.
+#' Return the first \code{num} rows of a SparkDataFrame as a R data.frame. If \code{num} is not
+#' specified, then head() returns the first 6 rows as with R data.frame.
 #'
 #' @param x a SparkDataFrame.
 #' @param num the number of rows to return. Default is 6.
@@ -1406,11 +1405,11 @@ setMethod("dapplyCollect",
 #'
 #' @param cols grouping columns.
 #' @param func a function to be applied to each group partition specified by grouping
-#' column of the SparkDataFrame. The function `func` takes as argument
+#' column of the SparkDataFrame. The function \code{func} takes as argument
 #' a key - grouping columns and a data frame - a local R data.frame.
-#' The output of `func` is a local R data.frame.
+#' The output of \code{func} is a local R data.frame.
 #' @param schema the schema of the resulting SparkDataFrame after the function is applied.
-#' The schema must match to output of `func`. It has to be defined for each
+#' The schema must match to output of \code{func}. It has to be defined for each
 #' output column with preferred output column name and corresponding data type.
 #' @return A SparkDataFrame.
 #' @family SparkDataFrame functions
@@ -1497,9 +1496,9 @@ setMethod("gapply",
 #'
 #' @param cols grouping columns.
 #' @param func a function to be applied to each group partition specified by grouping
-#' column of the SparkDataFrame. The function `func` takes as argument
+#' column of the SparkDataFrame. The function \code{func} takes as argument
 #' a key - grouping columns and a data frame - a local R data.frame.
-#' The output of `func` is a local R data.frame.
+#' The output of \code{func} is a local R data.frame.
 #' @return A data.frame.
 #' @family SparkDataFrame functions
 #' @aliases gapplyCollect,SparkDataFrame-method
@@ -1657,7 +1656,7 @@ setMethod("$", signature(x = "SparkDataFrame"),
 getColumn(x, name)
 })

-#' @param value a Column or NULL. If NULL, the specified Column is dropped.
+#' @param value a Column or \code{NULL}. If \code{NULL}, the specified Column is dropped.
 #' @rdname select
 #' @name $<-
 #' @aliases $<-,SparkDataFrame-method
@@ -1747,7 +1746,7 @@ setMethod("[", signature(x = "SparkDataFrame"),
 #' @family subsetting functions
 #' @examples
 #' \dontrun{
-#' # Columns can be selected using `[[` and `[`
+#' # Columns can be selected using [[ and [
 #' df[[2]] == df[["age"]]
 #' df[,2] == df[,"age"]
 #' df[,c("name", "age")]
@@ -1792,7 +1791,7 @@ setMethod("subset", signature(x = "SparkDataFrame"),
 #' select(df, df$name, df$age + 1)
 #' select(df, c("col1", "col2"))
 #' select(df, list(df$name, df$age + 1))
-#' # Similar to R data frames columns can also be selected using `$`
+#' # Similar to R data frames columns can also be selected using $
 #' df[,df$age]
 #' }
 #' @note select(SparkDataFrame, character) since 1.4.0
@@ -2443,7 +2442,7 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) {
 #' Return a new SparkDataFrame containing the union of rows
 #'
 #' Return a new SparkDataFrame containing the union of rows in this SparkDataFrame
-#' and another SparkDataFrame. This is equivalent to `UNION ALL` in SQL.
+#' and another SparkDataFrame. This is equivalent to \code{UNION ALL} in SQL.
 #' Note that this does not remove duplicate rows across the two SparkDataFrames.
 #'
 #' @param x A SparkDataFrame
@@ -2486,7 +2485,7 @@ setMethod("unionAll",

 #' Union two or more SparkDataFrames
 #'
-#' Union two or more SparkDataFrames. This is equivalent to `UNION ALL` in SQL.
+#' Union two or more SparkDataFrames. This is equivalent to \code{UNION ALL} in SQL.
 #' Note that this does not remove duplicate rows across the two SparkDataFrames.
 #'
 #' @param x a SparkDataFrame.
@@ -2519,7 +2518,7 @@ setMethod("rbind",
 #' Intersect
 #'
 #' Return a new SparkDataFrame containing rows only in both this SparkDataFrame
-#' and another SparkDataFrame. This is equivalent to `INTERSECT` in SQL.
+#' and another SparkDataFrame. This is equivalent to \code{INTERSECT} in SQL.
 #'
 #' @param x A SparkDataFrame
 #' @param y A SparkDataFrame
@@ -2547,7 +2546,7 @@ setMethod("intersect",
 #' except
 #'
 #' Return a new SparkDataFrame containing rows in this SparkDataFrame
-#' but not in another SparkDataFrame. This is equivalent to `EXCEPT` in SQL.
+#' but not in another SparkDataFrame. This is equivalent to \code{EXCEPT} in SQL.
 #'
 #' @param x a SparkDataFrame.
 #' @param y a SparkDataFrame.
@@ -2576,8 +2575,8 @@ setMethod("except",

 #' Save the contents of SparkDataFrame to a data source.
 #'
-#' The data source is specified by the `source` and a set of options (...).
-#' If `source` is not specified, the default data source configured by
+#' The data source is specified by the \code{source} and a set of options (...).
+#' If \code{source} is not specified, the default data source configured by
 #' spark.sql.sources.default will be used.
 #'
 #' Additionally, mode is used to specify the behavior of the save operation when data already
@@ -2613,7 +2612,7 @@ setMethod("except",
 #' @note write.df since 1.4.0
 setMethod("write.df",
 signature(df = "SparkDataFrame", path = "character"),
-function(df, path, source = NULL, mode = "error", ...){
+function(df, path, source = NULL, mode = "error", ...) {
 if (is.null(source)) {
 source <- getDefaultSqlSource()
 }
@@ -2635,14 +2634,14 @@ setMethod("write.df",
 #' @note saveDF since 1.4.0
 setMethod("saveDF",
 signature(df = "SparkDataFrame", path = "character"),
-function(df, path, source = NULL, mode = "error", ...){
+function(df, path, source = NULL, mode = "error", ...) {
 write.df(df, path, source, mode, ...)
 })

 #' Save the contents of the SparkDataFrame to a data source as a table
 #'
-#' The data source is specified by the `source` and a set of options (...).
-#' If `source` is not specified, the default data source configured by
+#' The data source is specified by the \code{source} and a set of options (...).
+#' If \code{source} is not specified, the default data source configured by
 #' spark.sql.sources.default will be used.
 #'
 #' Additionally, mode is used to specify the behavior of the save operation when
@@ -2675,7 +2674,7 @@ setMethod("saveDF",
 #' @note saveAsTable since 1.4.0
 setMethod("saveAsTable",
 signature(df = "SparkDataFrame", tableName = "character"),
-function(df, tableName, source = NULL, mode="error", ...){
+function(df, tableName, source = NULL, mode="error", ...) {
 if (is.null(source)) {
 source <- getDefaultSqlSource()
 }
@@ -2752,11 +2751,11 @@ setMethod("summary",
 #' @param how "any" or "all".
 #' if "any", drop a row if it contains any nulls.
 #' if "all", drop a row only if all its values are null.
-#' if minNonNulls is specified, how is ignored.
+#' if \code{minNonNulls} is specified, how is ignored.
 #' @param minNonNulls if specified, drop rows that have less than
-#' minNonNulls non-null values.
+#' \code{minNonNulls} non-null values.
 #' This overwrites the how parameter.
-#' @param cols optional list of column names to consider. In `fillna`,
+#' @param cols optional list of column names to consider. In \code{fillna},
 #' columns specified in cols that do not have matching data
 #' type are ignored. For example, if value is a character, and
 #' subset contains a non-character column, then the non-character
@@ -2879,8 +2878,8 @@ setMethod("fillna",
 #' in your system to accommodate the contents.
 #'
 #' @param x a SparkDataFrame.
-#' @param row.names NULL or a character vector giving the row names for the data frame.
-#' @param optional If `TRUE`, converting column names is optional.
+#' @param row.names \code{NULL} or a character vector giving the row names for the data frame.
+#' @param optional If \code{TRUE}, converting column names is optional.
 #' @param ... additional arguments to pass to base::as.data.frame.
 #' @return A data.frame.
 #' @family SparkDataFrame functions
@@ -3058,7 +3057,7 @@ setMethod("str",
 #' @note drop since 2.0.0
 setMethod("drop",
 signature(x = "SparkDataFrame"),
-function(x, col, ...) {
+function(x, col) {
 stopifnot(class(col) == "character" || class(col) == "Column")

 if (class(col) == "Column") {
@@ -3218,8 +3217,8 @@ setMethod("histogram",
 #' and to not change the existing data.
 #' }
 #'
-#' @param x s SparkDataFrame.
-#' @param url JDBC database url of the form `jdbc:subprotocol:subname`.
+#' @param x a SparkDataFrame.
+#' @param url JDBC database url of the form \code{jdbc:subprotocol:subname}.
 #' @param tableName yhe name of the table in the external database.
 #' @param mode one of 'append', 'overwrite', 'error', 'ignore' save mode (it is 'error' by default).
 #' @param ... additional JDBC database connection properties.
@@ -3237,7 +3236,7 @@ setMethod("histogram",
 #' @note write.jdbc since 2.0.0
 setMethod("write.jdbc",
 signature(x = "SparkDataFrame", url = "character", tableName = "character"),
-function(x, url, tableName, mode = "error", ...){
+function(x, url, tableName, mode = "error", ...) {
 jmode <- convertToJSaveMode(mode)
 jprops <- varargsToJProperties(...)
 write <- callJMethod(x@sdf, "write")
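
The reworded `gapply`/`gapplyCollect` docs above describe `func` as receiving a grouping key plus the group's rows as a local R data.frame, and returning a local R data.frame that matches `schema`. A minimal sketch of that contract, assuming an active SparkR session (the column names and the `mtcars` toy data are illustrative only):

```r
library(SparkR)
sparkR.session()

df <- createDataFrame(mtcars)

# Schema of the data.frame returned by func: one row per group.
schema <- structType(structField("cyl", "double"),
                     structField("avg_mpg", "double"))

# func is applied once per "cyl" group; key holds the grouping value(s).
avg_by_cyl <- gapply(df, "cyl",
                     function(key, x) {
                       data.frame(key, mean(x$mpg))
                     },
                     schema)

head(collect(avg_by_cyl))
```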
