Commit c3f7444

Merge remote-tracking branch 'remotes/apache/master' into SPARK-19490
2 parents: f790821 + 1a45d2b

726 files changed (+21094 / -11569 lines)

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -42,6 +42,7 @@ dependency-reduced-pom.xml
 derby.log
 dev/create-release/*final
 dev/create-release/*txt
+dev/pr-deps/
 dist/
 docs/_site
 docs/api

.travis.yml

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@ notifications:
 # 5. Run maven install before running lint-java.
 install:
   - export MAVEN_SKIP_RC=1
-  - build/mvn -T 4 -q -DskipTests -Pmesos -Pyarn -Phadoop-2.3 -Pkinesis-asl -Phive -Phive-thriftserver install
+  - build/mvn -T 4 -q -DskipTests -Pmesos -Pyarn -Pkinesis-asl -Phive -Phive-thriftserver install

 # 6. Run lint-java.
 script:

R/pkg/NAMESPACE

Lines changed: 4 additions & 1 deletion
@@ -65,7 +65,8 @@ exportMethods("glm",
               "spark.logit",
               "spark.randomForest",
               "spark.gbt",
-              "spark.bisectingKmeans")
+              "spark.bisectingKmeans",
+              "spark.svmLinear")

 # Job group lifecycle management methods
 export("setJobGroup",
@@ -81,6 +82,7 @@ exportMethods("arrange",
               "as.data.frame",
               "attach",
               "cache",
+              "coalesce",
               "collect",
               "colnames",
               "colnames<-",
@@ -325,6 +327,7 @@ exportMethods("%in%",
               "toDegrees",
               "toRadians",
               "to_date",
+              "to_timestamp",
               "to_utc_timestamp",
               "translate",
               "trim",

R/pkg/R/DataFrame.R

Lines changed: 45 additions & 4 deletions
@@ -415,7 +415,7 @@ setMethod("coltypes",
               type <- PRIMITIVE_TYPES[[specialtype]]
             }
           }
-          type
+          type[[1]]
         })

 # Find which types don't have mapping to R
@@ -678,14 +678,53 @@ setMethod("storageLevel",
             storageLevelToString(callJMethod(x@sdf, "storageLevel"))
           })

+#' Coalesce
+#'
+#' Returns a new SparkDataFrame that has exactly \code{numPartitions} partitions.
+#' This operation results in a narrow dependency, e.g. if you go from 1000 partitions to 100
+#' partitions, there will not be a shuffle, instead each of the 100 new partitions will claim 10 of
+#' the current partitions. If a larger number of partitions is requested, it will stay at the
+#' current number of partitions.
+#'
+#' However, if you're doing a drastic coalesce on a SparkDataFrame, e.g. to numPartitions = 1,
+#' this may result in your computation taking place on fewer nodes than
+#' you like (e.g. one node in the case of numPartitions = 1). To avoid this,
+#' call \code{repartition}. This will add a shuffle step, but means the
+#' current upstream partitions will be executed in parallel (per whatever
+#' the current partitioning is).
+#'
+#' @param numPartitions the number of partitions to use.
+#'
+#' @family SparkDataFrame functions
+#' @rdname coalesce
+#' @name coalesce
+#' @aliases coalesce,SparkDataFrame-method
+#' @seealso \link{repartition}
+#' @export
+#' @examples
+#'\dontrun{
+#' sparkR.session()
+#' path <- "path/to/file.json"
+#' df <- read.json(path)
+#' newDF <- coalesce(df, 1L)
+#'}
+#' @note coalesce(SparkDataFrame) since 2.1.1
+setMethod("coalesce",
+          signature(x = "SparkDataFrame"),
+          function(x, numPartitions) {
+            stopifnot(is.numeric(numPartitions))
+            sdf <- callJMethod(x@sdf, "coalesce", numToInt(numPartitions))
+            dataFrame(sdf)
+          })
+
 #' Repartition
 #'
 #' The following options for repartition are possible:
 #' \itemize{
-#'  \item{1.} {Return a new SparkDataFrame partitioned by
+#'  \item{1.} {Return a new SparkDataFrame that has exactly \code{numPartitions}.}
+#'  \item{2.} {Return a new SparkDataFrame hash partitioned by
 #'             the given columns into \code{numPartitions}.}
-#'  \item{2.} {Return a new SparkDataFrame that has exactly \code{numPartitions}.}
-#'  \item{3.} {Return a new SparkDataFrame partitioned by the given column(s),
+#'  \item{3.} {Return a new SparkDataFrame hash partitioned by the given column(s),
 #'             using \code{spark.sql.shuffle.partitions} as number of partitions.}
 #'}
 #' @param x a SparkDataFrame.
@@ -697,6 +736,7 @@ setMethod("storageLevel",
 #' @rdname repartition
 #' @name repartition
 #' @aliases repartition,SparkDataFrame-method
+#' @seealso \link{coalesce}
 #' @export
 #' @examples
 #'\dontrun{
@@ -1136,6 +1176,7 @@ setMethod("collect",
             if (!is.null(PRIMITIVE_TYPES[[colType]]) && colType != "binary") {
               vec <- do.call(c, col)
               stopifnot(class(vec) != "list")
+              class(vec) <- PRIMITIVE_TYPES[[colType]]
               df[[colIndex]] <- vec
             } else {
               df[[colIndex]] <- col
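
As a usage sketch (not part of the commit), the new SparkDataFrame coalesce() method above can be contrasted with repartition(). The session setup, data, and partition counts below are invented for illustration and assume a working SparkR installation:

library(SparkR)
sparkR.session()

df <- createDataFrame(data.frame(id = 1:1000))

# coalesce() narrows the existing partitioning without a shuffle,
# so it can only reduce the number of partitions.
narrowDF <- coalesce(df, 1L)

# repartition() adds a shuffle step, which lets the upstream partitions
# run in parallel and can also increase the partition count.
wideDF <- repartition(df, numPartitions = 10L)

sparkR.session.stop()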

R/pkg/R/RDD.R

Lines changed: 2 additions & 2 deletions
@@ -1028,7 +1028,7 @@ setMethod("repartitionRDD",
           signature(x = "RDD"),
           function(x, numPartitions) {
             if (!is.null(numPartitions) && is.numeric(numPartitions)) {
-              coalesce(x, numPartitions, TRUE)
+              coalesceRDD(x, numPartitions, TRUE)
             } else {
               stop("Please, specify the number of partitions")
             }
@@ -1049,7 +1049,7 @@ setMethod("repartitionRDD",
 #' @rdname coalesce
 #' @aliases coalesce,RDD
 #' @noRd
-setMethod("coalesce",
+setMethod("coalesceRDD",
           signature(x = "RDD", numPartitions = "numeric"),
           function(x, numPartitions, shuffle = FALSE) {
             numPartitions <- numToInt(numPartitions)
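
The rename above pairs with the generics.R change later in this commit: the exported coalesce generic now has formals (x, ...) for Column and SparkDataFrame, while the internal RDD variant needs (x, numPartitions, ...). A standalone S4 sketch (not SparkR source) of why the two cannot share one generic name:

# Defining a generic a second time with different formals replaces the first
# definition, so the internal RDD variant gets its own generic name instead.
setGeneric("coalesce", function(x, ...) { standardGeneric("coalesce") })
# setGeneric("coalesce", function(x, numPartitions, ...) { ... })  # would clobber the generic above
setGeneric("coalesceRDD", function(x, numPartitions, ...) { standardGeneric("coalesceRDD") })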

R/pkg/R/functions.R

Lines changed: 97 additions & 9 deletions
@@ -286,6 +286,28 @@ setMethod("ceil",
             column(jc)
           })

+#' Returns the first column that is not NA
+#'
+#' Returns the first column that is not NA, or NA if all inputs are.
+#'
+#' @rdname coalesce
+#' @name coalesce
+#' @family normal_funcs
+#' @export
+#' @aliases coalesce,Column-method
+#' @examples \dontrun{coalesce(df$c, df$d, df$e)}
+#' @note coalesce(Column) since 2.1.1
+setMethod("coalesce",
+          signature(x = "Column"),
+          function(x, ...) {
+            jcols <- lapply(list(x, ...), function (x) {
+              stopifnot(class(x) == "Column")
+              x@jc
+            })
+            jc <- callJStatic("org.apache.spark.sql.functions", "coalesce", jcols)
+            column(jc)
+          })
+
 #' Though scala functions has "col" function, we don't expose it in SparkR
 #' because we don't want to conflict with the "col" function in the R base
 #' package and we also have "column" function exported which is an alias of "col".
@@ -297,15 +319,15 @@ col <- function(x) {
 #' Returns a Column based on the given column name
 #'
 #' Returns a Column based on the given column name.
-#
+#'
 #' @param x Character column name.
 #'
 #' @rdname column
 #' @name column
 #' @family normal_funcs
 #' @export
 #' @aliases column,character-method
-#' @examples \dontrun{column(df)}
+#' @examples \dontrun{column("name")}
 #' @note column since 1.6.0
 setMethod("column",
           signature(x = "character"),
@@ -1730,24 +1752,90 @@ setMethod("toRadians",

 #' to_date
 #'
-#' Converts the column into DateType.
+#' Converts the column into a DateType. You may optionally specify a format
+#' according to the rules in:
+#' \url{http://docs.oracle.com/javase/tutorial/i18n/format/simpleDateFormat.html}.
+#' If the string cannot be parsed according to the specified format (or default),
+#' the value of the column will be null.
+#' The default format is 'yyyy-MM-dd'.
 #'
-#' @param x Column to compute on.
+#' @param x Column to parse.
+#' @param format string to use to parse x Column to DateType. (optional)
 #'
 #' @rdname to_date
 #' @name to_date
 #' @family datetime_funcs
-#' @aliases to_date,Column-method
+#' @aliases to_date,Column,missing-method
 #' @export
-#' @examples \dontrun{to_date(df$c)}
-#' @note to_date since 1.5.0
+#' @examples
+#' \dontrun{
+#' to_date(df$c)
+#' to_date(df$c, 'yyyy-MM-dd')
+#' }
+#' @note to_date(Column) since 1.5.0
 setMethod("to_date",
-          signature(x = "Column"),
-          function(x) {
+          signature(x = "Column", format = "missing"),
+          function(x, format) {
             jc <- callJStatic("org.apache.spark.sql.functions", "to_date", x@jc)
             column(jc)
           })

+#' @rdname to_date
+#' @name to_date
+#' @family datetime_funcs
+#' @aliases to_date,Column,character-method
+#' @export
+#' @note to_date(Column, character) since 2.2.0
+setMethod("to_date",
+          signature(x = "Column", format = "character"),
+          function(x, format) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "to_date", x@jc, format)
+            column(jc)
+          })
+
+#' to_timestamp
+#'
+#' Converts the column into a TimestampType. You may optionally specify a format
+#' according to the rules in:
+#' \url{http://docs.oracle.com/javase/tutorial/i18n/format/simpleDateFormat.html}.
+#' If the string cannot be parsed according to the specified format (or default),
+#' the value of the column will be null.
+#' The default format is 'yyyy-MM-dd HH:mm:ss'.
+#'
+#' @param x Column to parse.
+#' @param format string to use to parse x Column to DateType. (optional)
+#'
+#' @rdname to_timestamp
+#' @name to_timestamp
+#' @family datetime_funcs
+#' @aliases to_timestamp,Column,missing-method
+#' @export
+#' @examples
+#' \dontrun{
+#' to_timestamp(df$c)
+#' to_timestamp(df$c, 'yyyy-MM-dd')
+#' }
+#' @note to_timestamp(Column) since 2.2.0
+setMethod("to_timestamp",
+          signature(x = "Column", format = "missing"),
+          function(x, format) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "to_timestamp", x@jc)
+            column(jc)
+          })
+
+#' @rdname to_timestamp
+#' @name to_timestamp
+#' @family datetime_funcs
+#' @aliases to_timestamp,Column,character-method
+#' @export
+#' @note to_timestamp(Column, character) since 2.2.0
+setMethod("to_timestamp",
+          signature(x = "Column", format = "character"),
+          function(x, format) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "to_timestamp", x@jc, format)
+            column(jc)
+          })
+
 #' trim
 #'
 #' Trim the spaces from both ends for the specified string column.
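
A brief, hypothetical SparkR snippet combining the column-level coalesce() with the new to_date()/to_timestamp() format arguments documented above; the column names and data are made up for the sketch:

library(SparkR)
sparkR.session()

df <- createDataFrame(data.frame(
  a = c(NA, "2017-01-02"),
  b = c("2017-02-03", NA),
  stringsAsFactors = FALSE
))

# coalesce(Column, ...) keeps the first non-NA value per row.
df <- withColumn(df, "first_non_na", coalesce(df$a, df$b))

# to_date()/to_timestamp() accept an optional Java SimpleDateFormat pattern.
df <- withColumn(df, "as_date", to_date(df$first_non_na, "yyyy-MM-dd"))
df <- withColumn(df, "as_ts", to_timestamp(df$first_non_na, "yyyy-MM-dd"))

head(df)
sparkR.session.stop()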

R/pkg/R/generics.R

Lines changed: 18 additions & 3 deletions
@@ -28,7 +28,7 @@ setGeneric("cacheRDD", function(x) { standardGeneric("cacheRDD") })
 # @rdname coalesce
 # @seealso repartition
 # @export
-setGeneric("coalesce", function(x, numPartitions, ...) { standardGeneric("coalesce") })
+setGeneric("coalesceRDD", function(x, numPartitions, ...) { standardGeneric("coalesceRDD") })

 # @rdname checkpoint-methods
 # @export
@@ -66,7 +66,7 @@ setGeneric("freqItems", function(x, cols, support = 0.01) { standardGeneric("fre
 # @rdname approxQuantile
 # @export
 setGeneric("approxQuantile",
-           function(x, col, probabilities, relativeError) {
+           function(x, cols, probabilities, relativeError) {
              standardGeneric("approxQuantile")
            })

@@ -406,6 +406,13 @@ setGeneric("attach")
 #' @export
 setGeneric("cache", function(x) { standardGeneric("cache") })

+#' @rdname coalesce
+#' @param x a Column or a SparkDataFrame.
+#' @param ... additional argument(s). If \code{x} is a Column, additional Columns can be optionally
+#'        provided.
+#' @export
+setGeneric("coalesce", function(x, ...) { standardGeneric("coalesce") })
+
 #' @rdname collect
 #' @export
 setGeneric("collect", function(x, ...) { standardGeneric("collect") })
@@ -1256,7 +1263,11 @@ setGeneric("toRadians", function(x) { standardGeneric("toRadians") })

 #' @rdname to_date
 #' @export
-setGeneric("to_date", function(x) { standardGeneric("to_date") })
+setGeneric("to_date", function(x, format) { standardGeneric("to_date") })
+
+#' @rdname to_timestamp
+#' @export
+setGeneric("to_timestamp", function(x, format) { standardGeneric("to_timestamp") })

 #' @rdname to_utc_timestamp
 #' @export
@@ -1397,6 +1408,10 @@ setGeneric("spark.randomForest",
 #' @export
 setGeneric("spark.survreg", function(data, formula) { standardGeneric("spark.survreg") })

+#' @rdname spark.svmLinear
+#' @export
+setGeneric("spark.svmLinear", function(data, formula, ...) { standardGeneric("spark.svmLinear") })
+
 #' @rdname spark.lda
 #' @export
 setGeneric("spark.posterior", function(object, newData) { standardGeneric("spark.posterior") })

R/pkg/R/install.R

Lines changed: 13 additions & 3 deletions
@@ -21,9 +21,9 @@
 #' Download and Install Apache Spark to a Local Directory
 #'
 #' \code{install.spark} downloads and installs Spark to a local directory if
-#' it is not found. The Spark version we use is the same as the SparkR version.
-#' Users can specify a desired Hadoop version, the remote mirror site, and
-#' the directory where the package is installed locally.
+#' it is not found. If SPARK_HOME is set in the environment, and that directory is found, that is
+#' returned. The Spark version we use is the same as the SparkR version. Users can specify a desired
+#' Hadoop version, the remote mirror site, and the directory where the package is installed locally.
 #'
 #' The full url of remote file is inferred from \code{mirrorUrl} and \code{hadoopVersion}.
 #' \code{mirrorUrl} specifies the remote path to a Spark folder. It is followed by a subfolder
@@ -68,6 +68,16 @@
 #' \href{http://spark.apache.org/downloads.html}{Apache Spark}
 install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL,
                           localDir = NULL, overwrite = FALSE) {
+  sparkHome <- Sys.getenv("SPARK_HOME")
+  if (isSparkRShell()) {
+    stopifnot(nchar(sparkHome) > 0)
+    message("Spark is already running in sparkR shell.")
+    return(invisible(sparkHome))
+  } else if (!is.na(file.info(sparkHome)$isdir)) {
+    message("Spark package found in SPARK_HOME: ", sparkHome)
+    return(invisible(sparkHome))
+  }
+
   version <- paste0("spark-", packageVersion("SparkR"))
   hadoopVersion <- tolower(hadoopVersion)
   hadoopVersionName <- hadoopVersionName(hadoopVersion)
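
A small sketch of how the new SPARK_HOME short-circuit behaves; the paths below are hypothetical:

library(SparkR)

# If SPARK_HOME points at an existing directory, install.spark() now returns
# it invisibly instead of downloading a fresh copy.
Sys.setenv(SPARK_HOME = "/opt/spark-2.1.0-bin-hadoop2.7")
sparkHome <- install.spark()

# With SPARK_HOME unset (or pointing at a missing directory), the original
# download-and-install path is taken.
Sys.unsetenv("SPARK_HOME")
sparkHome <- install.spark(hadoopVersion = "2.7")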
