
Commit f56e443: merge with master
2 parents: 166a6ff + 4896411

1,204 files changed: 30,649 additions, 12,449 deletions


.github/PULL_REQUEST_TEMPLATE

Lines changed: 12 additions & 0 deletions

@@ -0,0 +1,12 @@
+## What changes were proposed in this pull request?
+
+(Please fill in changes proposed in this fix)
+
+
+## How was this patch tested?
+
+(Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests)
+
+
+(If this patch involves UI changes, please attach a screenshot; otherwise, remove this)
+

LICENSE

Lines changed: 9 additions & 9 deletions

@@ -249,14 +249,14 @@ The text of each license is also included at licenses/LICENSE-[project].txt.
 (Interpreter classes (all .scala files in repl/src/main/scala
 except for Main.Scala, SparkHelper.scala and ExecutorClassLoader.scala),
 and for SerializableMapWrapper in JavaUtils.scala)
-(BSD-like) Scala Actors library (org.scala-lang:scala-actors:2.10.5 - http://www.scala-lang.org/)
-(BSD-like) Scala Compiler (org.scala-lang:scala-compiler:2.10.5 - http://www.scala-lang.org/)
-(BSD-like) Scala Compiler (org.scala-lang:scala-reflect:2.10.5 - http://www.scala-lang.org/)
-(BSD-like) Scala Library (org.scala-lang:scala-library:2.10.5 - http://www.scala-lang.org/)
-(BSD-like) Scalap (org.scala-lang:scalap:2.10.5 - http://www.scala-lang.org/)
-(BSD-style) scalacheck (org.scalacheck:scalacheck_2.10:1.10.0 - http://www.scalacheck.org)
-(BSD-style) spire (org.spire-math:spire_2.10:0.7.1 - http://spire-math.org)
-(BSD-style) spire-macros (org.spire-math:spire-macros_2.10:0.7.1 - http://spire-math.org)
+(BSD-like) Scala Actors library (org.scala-lang:scala-actors:2.11.7 - http://www.scala-lang.org/)
+(BSD-like) Scala Compiler (org.scala-lang:scala-compiler:2.11.7 - http://www.scala-lang.org/)
+(BSD-like) Scala Compiler (org.scala-lang:scala-reflect:2.11.7 - http://www.scala-lang.org/)
+(BSD-like) Scala Library (org.scala-lang:scala-library:2.11.7 - http://www.scala-lang.org/)
+(BSD-like) Scalap (org.scala-lang:scalap:2.11.7 - http://www.scala-lang.org/)
+(BSD-style) scalacheck (org.scalacheck:scalacheck_2.11:1.10.0 - http://www.scalacheck.org)
+(BSD-style) spire (org.spire-math:spire_2.11:0.7.1 - http://spire-math.org)
+(BSD-style) spire-macros (org.spire-math:spire-macros_2.11:0.7.1 - http://spire-math.org)
 (New BSD License) Kryo (com.esotericsoftware.kryo:kryo:2.21 - http://code.google.com/p/kryo/)
 (New BSD License) MinLog (com.esotericsoftware.minlog:minlog:1.2 - http://code.google.com/p/minlog/)
 (New BSD License) ReflectASM (com.esotericsoftware.reflectasm:reflectasm:1.07 - http://code.google.com/p/reflectasm/)
@@ -283,7 +283,7 @@ The text of each license is also included at licenses/LICENSE-[project].txt.
 (MIT License) SLF4J API Module (org.slf4j:slf4j-api:1.7.5 - http://www.slf4j.org)
 (MIT License) SLF4J LOG4J-12 Binding (org.slf4j:slf4j-log4j12:1.7.5 - http://www.slf4j.org)
 (MIT License) pyrolite (org.spark-project:pyrolite:2.0.1 - http://pythonhosted.org/Pyro4/)
-(MIT License) scopt (com.github.scopt:scopt_2.10:3.2.0 - https://github.com/scopt/scopt)
+(MIT License) scopt (com.github.scopt:scopt_2.11:3.2.0 - https://github.com/scopt/scopt)
 (The MIT License) Mockito (org.mockito:mockito-core:1.9.5 - http://www.mockito.org)
 (MIT License) jquery (https://jquery.org/license/)
 (MIT License) AnchorJS (https://github.com/bryanbraun/anchorjs)

R/pkg/NAMESPACE

Lines changed: 4 additions & 1 deletion

@@ -13,7 +13,9 @@ export("print.jobj")
 # MLlib integration
 exportMethods("glm",
               "predict",
-              "summary")
+              "summary",
+              "kmeans",
+              "fitted")

 # Job group lifecycle management methods
 export("setJobGroup",
@@ -109,6 +111,7 @@ exportMethods("%in%",
               "add_months",
               "alias",
               "approxCountDistinct",
+              "approxQuantile",
               "array_contains",
               "asc",
               "ascii",

R/pkg/R/functions.R

Lines changed: 3 additions & 3 deletions

@@ -1962,7 +1962,7 @@ setMethod("sha2", signature(y = "Column", x = "numeric"),

 #' shiftLeft
 #'
-#' Shift the the given value numBits left. If the given value is a long value, this function
+#' Shift the given value numBits left. If the given value is a long value, this function
 #' will return a long value else it will return an integer value.
 #'
 #' @family math_funcs
@@ -1980,7 +1980,7 @@ setMethod("shiftLeft", signature(y = "Column", x = "numeric"),

 #' shiftRight
 #'
-#' Shift the the given value numBits right. If the given value is a long value, it will return
+#' Shift the given value numBits right. If the given value is a long value, it will return
 #' a long value else it will return an integer value.
 #'
 #' @family math_funcs
@@ -1998,7 +1998,7 @@ setMethod("shiftRight", signature(y = "Column", x = "numeric"),

 #' shiftRightUnsigned
 #'
-#' Unsigned shift the the given value numBits right. If the given value is a long value,
+#' Unsigned shift the given value numBits right. If the given value is a long value,
 #' it will return a long value else it will return an integer value.
 #'
 #' @family math_funcs
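The int-versus-long distinction in these roxygen docs mirrors JVM shift semantics: the shift is performed at the column's native width, and only the result type follows the input type. As a rough illustration in plain Python (our own helpers, not SparkR code), here is what a 32-bit left shift and an unsigned right shift do, including the sign-bit behavior the shiftRightUnsigned doc alludes to:

```python
def shift_left_32(value, num_bits):
    """32-bit signed left shift, like a JVM `int << numBits`."""
    result = (value << num_bits) & 0xFFFFFFFF
    # Reinterpret the low 32 bits as a signed two's-complement integer.
    return result - 0x100000000 if result >= 0x80000000 else result

def shift_right_unsigned_32(value, num_bits):
    """32-bit unsigned (logical) right shift, like a JVM `int >>> numBits`."""
    return (value & 0xFFFFFFFF) >> num_bits

print(shift_left_32(1, 31))            # shifts into the sign bit: -2147483648
print(-8 >> 1)                         # arithmetic shift keeps the sign: -4
print(shift_right_unsigned_32(-8, 1))  # logical shift discards it: 2147483644
```

The same arithmetic with a 64-bit mask would model the long-valued case, which is why the R wrappers only need to forward `numBits` and let the JVM pick the width.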

R/pkg/R/generics.R

Lines changed: 15 additions & 0 deletions

@@ -67,6 +67,13 @@ setGeneric("crosstab", function(x, col1, col2) { standardGeneric("crosstab") })
 # @export
 setGeneric("freqItems", function(x, cols, support = 0.01) { standardGeneric("freqItems") })

+# @rdname statfunctions
+# @export
+setGeneric("approxQuantile",
+           function(x, col, probabilities, relativeError) {
+             standardGeneric("approxQuantile")
+           })
+
 # @rdname distinct
 # @export
 setGeneric("distinct", function(x, numPartitions = 1) { standardGeneric("distinct") })
@@ -1160,3 +1167,11 @@ setGeneric("predict", function(object, ...) { standardGeneric("predict") })
 #' @rdname rbind
 #' @export
 setGeneric("rbind", signature = "...")
+
+#' @rdname kmeans
+#' @export
+setGeneric("kmeans")
+
+#' @rdname fitted
+#' @export
+setGeneric("fitted")

R/pkg/R/mllib.R

Lines changed: 70 additions & 4 deletions

@@ -104,11 +104,11 @@ setMethod("predict", signature(object = "PipelineModel"),
 setMethod("summary", signature(object = "PipelineModel"),
           function(object, ...) {
             modelName <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
-                                    "getModelName", object@model)
+                                     "getModelName", object@model)
             features <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
-                                   "getModelFeatures", object@model)
+                                    "getModelFeatures", object@model)
             coefficients <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
-                                       "getModelCoefficients", object@model)
+                                        "getModelCoefficients", object@model)
             if (modelName == "LinearRegressionModel") {
               devianceResiduals <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
                                                "getModelDevianceResiduals", object@model)
@@ -119,10 +119,76 @@ setMethod("summary", signature(object = "PipelineModel"),
               colnames(coefficients) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)")
               rownames(coefficients) <- unlist(features)
               return(list(devianceResiduals = devianceResiduals, coefficients = coefficients))
-            } else {
+            } else if (modelName == "LogisticRegressionModel") {
               coefficients <- as.matrix(unlist(coefficients))
               colnames(coefficients) <- c("Estimate")
               rownames(coefficients) <- unlist(features)
               return(list(coefficients = coefficients))
+            } else if (modelName == "KMeansModel") {
+              modelSize <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
+                                       "getKMeansModelSize", object@model)
+              cluster <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
+                                     "getKMeansCluster", object@model, "classes")
+              k <- unlist(modelSize)[1]
+              size <- unlist(modelSize)[-1]
+              coefficients <- t(matrix(coefficients, ncol = k))
+              colnames(coefficients) <- unlist(features)
+              rownames(coefficients) <- 1:k
+              return(list(coefficients = coefficients, size = size, cluster = dataFrame(cluster)))
+            } else {
+              stop(paste("Unsupported model", modelName, sep = " "))
+            }
+          })
+
+#' Fit a k-means model
+#'
+#' Fit a k-means model, similarly to R's kmeans().
+#'
+#' @param x DataFrame for training
+#' @param centers Number of centers
+#' @param iter.max Maximum iteration number
+#' @param algorithm Algorithm chosen to fit the model
+#' @return A fitted k-means model
+#' @rdname kmeans
+#' @export
+#' @examples
+#'\dontrun{
+#' model <- kmeans(x, centers = 2, algorithm = "random")
+#'}
+setMethod("kmeans", signature(x = "DataFrame"),
+          function(x, centers, iter.max = 10, algorithm = c("random", "k-means||")) {
+            columnNames <- as.array(colnames(x))
+            algorithm <- match.arg(algorithm)
+            model <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers", "fitKMeans", x@sdf,
+                                 algorithm, iter.max, centers, columnNames)
+            return(new("PipelineModel", model = model))
+          })
+
+#' Get fitted result from a model
+#'
+#' Get fitted result from a model, similarly to R's fitted().
+#'
+#' @param object A fitted MLlib model
+#' @return DataFrame containing fitted values
+#' @rdname fitted
+#' @export
+#' @examples
+#'\dontrun{
+#' model <- kmeans(trainingData, 2)
+#' fitted.model <- fitted(model)
+#' showDF(fitted.model)
+#'}
+setMethod("fitted", signature(object = "PipelineModel"),
+          function(object, method = c("centers", "classes"), ...) {
+            modelName <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
+                                     "getModelName", object@model)
+
+            if (modelName == "KMeansModel") {
+              method <- match.arg(method)
+              fittedResult <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
+                                          "getKMeansCluster", object@model, method)
+              return(dataFrame(fittedResult))
+            } else {
+              stop(paste("Unsupported model", modelName, sep = " "))
             }
           })
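To make the "classes" versus "centers" distinction in the new kmeans()/fitted() pair concrete, here is a toy sketch of Lloyd's algorithm in plain Python. This is our own illustration on 1-D data, not the MLlib implementation that fitKMeans dispatches to; "labels" plays the role of fitted(model, "classes"), and mapping each point to its center value would correspond to "centers":

```python
def kmeans_1d(points, centers, iter_max=10):
    """Plain Lloyd's algorithm on 1-D data; returns (centers, labels)."""
    for _ in range(iter_max):
        # Assignment step: each point goes to its nearest center.
        labels = [min(range(len(centers)), key=lambda j: abs(p - centers[j]))
                  for p in points]
        # Update step: each center moves to the mean of its members.
        new_centers = []
        for j in range(len(centers)):
            members = [p for p, lbl in zip(points, labels) if lbl == j]
            new_centers.append(sum(members) / len(members) if members else centers[j])
        if new_centers == centers:  # converged
            break
        centers = new_centers
    return centers, labels

points = [1.0, 2.0, 0.0, 9.0, 10.0, 8.0]
centers, labels = kmeans_1d(points, centers=[0.0, 10.0])
print(centers)  # [1.0, 9.0]
print(labels)   # [0, 0, 0, 1, 1, 1]
```

The summary() branch above returns the analogous pieces for the Spark model: the k x features matrix of centers as `coefficients`, per-cluster sizes, and the per-row cluster assignments as a DataFrame.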

R/pkg/R/pairRDD.R

Lines changed: 5 additions & 5 deletions

@@ -305,11 +305,11 @@ setMethod("groupByKey",
 #' Merge values by key
 #'
 #' This function operates on RDDs where every element is of the form list(K, V) or c(K, V).
-#' and merges the values for each key using an associative reduce function.
+#' and merges the values for each key using an associative and commutative reduce function.
 #'
 #' @param x The RDD to reduce by key. Should be an RDD where each element is
 #'          list(K, V) or c(K, V).
-#' @param combineFunc The associative reduce function to use.
+#' @param combineFunc The associative and commutative reduce function to use.
 #' @param numPartitions Number of partitions to create.
 #' @return An RDD where each element is list(K, V') where V' is the merged
 #'         value
@@ -347,12 +347,12 @@ setMethod("reduceByKey",
 #' Merge values by key locally
 #'
 #' This function operates on RDDs where every element is of the form list(K, V) or c(K, V).
-#' and merges the values for each key using an associative reduce function, but return the
-#' results immediately to the driver as an R list.
+#' and merges the values for each key using an associative and commutative reduce function, but
+#' return the results immediately to the driver as an R list.
 #'
 #' @param x The RDD to reduce by key. Should be an RDD where each element is
 #'          list(K, V) or c(K, V).
-#' @param combineFunc The associative reduce function to use.
+#' @param combineFunc The associative and commutative reduce function to use.
 #' @return A list of elements of type list(K, V') where V' is the merged value for each key
 #' @seealso reduceByKey
 #' @examples
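The doc change above tightens "associative" to "associative and commutative", and the reason is visible in a local sketch of what reduceByKey does: values are first combined within each partition, then the partial results from different partitions are merged in no guaranteed order. This is our own Python illustration of that two-phase shape, not SparkR's implementation:

```python
def reduce_by_key_local(pairs, combine_func, num_partitions=2):
    """Local sketch of reduceByKey: combine within each partition,
    then merge partials across partitions.  Because partials meet in no
    guaranteed order, combine_func must be both associative and
    commutative for the result to be deterministic."""
    partitions = [{} for _ in range(num_partitions)]
    for k, v in pairs:
        part = partitions[hash(k) % num_partitions]  # shuffle by key hash
        part[k] = combine_func(part[k], v) if k in part else v
    merged = {}
    for part in partitions:
        for k, v in part.items():
            merged[k] = combine_func(merged[k], v) if k in merged else v
    return merged

pairs = [("a", 1), ("b", 2), ("a", 3), ("b", 4), ("a", 5)]
print(reduce_by_key_local(pairs, lambda x, y: x + y))  # {'a': 9, 'b': 6} (key order may vary)
```

A function like subtraction is associative over a fixed left-to-right fold but not commutative, so it can give different answers depending on how keys were partitioned, which is exactly what the corrected wording warns about.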

R/pkg/R/serialize.R

Lines changed: 1 addition & 1 deletion

@@ -54,7 +54,7 @@ writeObject <- function(con, object, writeType = TRUE) {
   # passing in vectors as arrays and instead require arrays to be passed
   # as lists.
   type <- class(object)[[1]] # class of POSIXlt is c("POSIXlt", "POSIXt")
-  # Checking types is needed here, since is.na only handles atomic vectors,
+  # Checking types is needed here, since 'is.na' only handles atomic vectors,
   # lists and pairlists
   if (type %in% c("integer", "character", "logical", "double", "numeric")) {
     if (is.na(object)) {

R/pkg/R/sparkR.R

Lines changed: 1 addition & 1 deletion

@@ -299,7 +299,7 @@ sparkRHive.init <- function(jsc = NULL) {
 #'
 #' @param sc existing spark context
 #' @param groupid the ID to be assigned to job groups
-#' @param description description for the the job group ID
+#' @param description description for the job group ID
 #' @param interruptOnCancel flag to indicate if the job is interrupted on job cancellation
 #' @examples
 #'\dontrun{

R/pkg/R/stats.R

Lines changed: 39 additions & 0 deletions

@@ -130,6 +130,45 @@ setMethod("freqItems", signature(x = "DataFrame", cols = "character"),
             collect(dataFrame(sct))
           })

+#' approxQuantile
+#'
+#' Calculates the approximate quantiles of a numerical column of a DataFrame.
+#'
+#' The result of this algorithm has the following deterministic bound:
+#' If the DataFrame has N elements and if we request the quantile at probability `p` up to error
+#' `err`, then the algorithm will return a sample `x` from the DataFrame so that the *exact* rank
+#' of `x` is close to (p * N). More precisely,
+#'   floor((p - err) * N) <= rank(x) <= ceil((p + err) * N).
+#' This method implements a variation of the Greenwald-Khanna algorithm (with some speed
+#' optimizations). The algorithm was first present in [[http://dx.doi.org/10.1145/375663.375670
+#' Space-efficient Online Computation of Quantile Summaries]] by Greenwald and Khanna.
+#'
+#' @param x A SparkSQL DataFrame.
+#' @param col The name of the numerical column.
+#' @param probabilities A list of quantile probabilities. Each number must belong to [0, 1].
+#'                      For example 0 is the minimum, 0.5 is the median, 1 is the maximum.
+#' @param relativeError The relative target precision to achieve (>= 0). If set to zero,
+#'                      the exact quantiles are computed, which could be very expensive.
+#'                      Note that values greater than 1 are accepted but give the same result as 1.
+#' @return The approximate quantiles at the given probabilities.
+#'
+#' @rdname statfunctions
+#' @name approxQuantile
+#' @export
+#' @examples
+#' \dontrun{
+#' df <- jsonFile(sqlContext, "/path/to/file.json")
+#' quantiles <- approxQuantile(df, "key", c(0.5, 0.8), 0.0)
+#' }
setMethod("approxQuantile",
+          signature(x = "DataFrame", col = "character",
+                    probabilities = "numeric", relativeError = "numeric"),
+          function(x, col, probabilities, relativeError) {
+            statFunctions <- callJMethod(x@sdf, "stat")
+            callJMethod(statFunctions, "approxQuantile", col,
+                        as.list(probabilities), relativeError)
+          })
+
 #' sampleBy
 #'
 #' Returns a stratified sample without replacement based on the fraction given on each stratum.
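The deterministic bound quoted in the approxQuantile roxygen block can be checked mechanically: given the returned sample `x`, its exact rank in the sorted data must fall within the stated window. Here is a small Python property check of that guarantee (our own illustration, not the Greenwald-Khanna summary structure itself):

```python
import math

def check_quantile_bound(data, x, p, err):
    """Check the approxQuantile guarantee: the returned sample x must have
    an exact rank within [floor((p - err) * N), ceil((p + err) * N)]."""
    n = len(data)
    rank = sorted(data).index(x) + 1  # 1-based rank of x in the sorted data
    return math.floor((p - err) * n) <= rank <= math.ceil((p + err) * n)

data = list(range(1, 101))  # 1..100, so the exact median is 50 (or 51)
print(check_quantile_bound(data, 50, p=0.5, err=0.01))  # True
print(check_quantile_bound(data, 80, p=0.5, err=0.01))  # False
```

With `relativeError = 0.0`, as in the `\dontrun` example above, the window collapses to the exact rank, which is why the docs warn that exact quantiles can be very expensive to compute.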
