
Commit 21568f9

Author: Marcelo Vanzin

Merge branch 'master' into SPARK-14134

2 parents: 81af47b + bb1fa5b

583 files changed: 22,263 additions and 16,023 deletions


LICENSE

Lines changed: 1 addition & 0 deletions
@@ -238,6 +238,7 @@ The text of each license is also included at licenses/LICENSE-[project].txt.
     (BSD 3 Clause) netlib core (com.github.fommil.netlib:core:1.1.2 - https://github.com/fommil/netlib-java/core)
     (BSD 3 Clause) JPMML-Model (org.jpmml:pmml-model:1.2.7 - https://github.com/jpmml/jpmml-model)
     (BSD License) AntLR Parser Generator (antlr:antlr:2.7.7 - http://www.antlr.org/)
+    (BSD License) ANTLR 4.5.2-1 (org.antlr:antlr4:4.5.2-1 - http://www.antlr.org/)
     (BSD licence) ANTLR ST4 4.0.4 (org.antlr:ST4:4.0.4 - http://www.stringtemplate.org)
     (BSD licence) ANTLR StringTemplate (org.antlr:stringtemplate:3.2.1 - http://www.stringtemplate.org)
     (BSD License) Javolution (javolution:javolution:5.5.1 - http://javolution.org)

R/pkg/NAMESPACE

Lines changed: 1 addition & 0 deletions
@@ -265,6 +265,7 @@ exportMethods("%in%",
               "var_samp",
               "weekofyear",
               "when",
+              "window",
               "year")

 exportClasses("GroupedData")

R/pkg/R/functions.R

Lines changed: 63 additions & 0 deletions
@@ -2131,6 +2131,69 @@ setMethod("from_unixtime", signature(x = "Column"),
             column(jc)
           })

+#' window
+#'
+#' Bucketize rows into one or more time windows given a timestamp specifying column. Window
+#' starts are inclusive but the window ends are exclusive, e.g. 12:05 will be in the window
+#' [12:05,12:10) but not in [12:00,12:05). Windows can support microsecond precision. Windows in
+#' the order of months are not supported.
+#'
+#' The time column must be of TimestampType.
+#'
+#' Durations are provided as strings, e.g. '1 second', '1 day 12 hours', '2 minutes'. Valid
+#' interval strings are 'week', 'day', 'hour', 'minute', 'second', 'millisecond', 'microsecond'.
+#' If the `slideDuration` is not provided, the windows will be tumbling windows.
+#'
+#' The startTime is the offset with respect to 1970-01-01 00:00:00 UTC with which to start
+#' window intervals. For example, in order to have hourly tumbling windows that start 15 minutes
+#' past the hour, e.g. 12:15-13:15, 13:15-14:15... provide `startTime` as `15 minutes`.
+#'
+#' The output column will be a struct called 'window' by default with the nested columns 'start'
+#' and 'end'.
+#'
+#' @family datetime_funcs
+#' @rdname window
+#' @name window
+#' @export
+#' @examples
+#'\dontrun{
+#' # One minute windows every 15 seconds 10 seconds after the minute, e.g. 09:00:10-09:01:10,
+#' # 09:00:25-09:01:25, 09:00:40-09:01:40, ...
+#' window(df$time, "1 minute", "15 seconds", "10 seconds")
+#'
+#' # One minute tumbling windows 15 seconds after the minute, e.g. 09:00:15-09:01:15,
+#' # 09:01:15-09:02:15...
+#' window(df$time, "1 minute", startTime = "15 seconds")
+#'
+#' # Thirty second windows every 10 seconds, e.g. 09:00:00-09:00:30, 09:00:10-09:00:40, ...
+#' window(df$time, "30 seconds", "10 seconds")
+#'}
+setMethod("window", signature(x = "Column"),
+          function(x, windowDuration, slideDuration = NULL, startTime = NULL) {
+            stopifnot(is.character(windowDuration))
+            if (!is.null(slideDuration) && !is.null(startTime)) {
+              stopifnot(is.character(slideDuration) && is.character(startTime))
+              jc <- callJStatic("org.apache.spark.sql.functions",
+                                "window",
+                                x@jc, windowDuration, slideDuration, startTime)
+            } else if (!is.null(slideDuration)) {
+              stopifnot(is.character(slideDuration))
+              jc <- callJStatic("org.apache.spark.sql.functions",
+                                "window",
+                                x@jc, windowDuration, slideDuration)
+            } else if (!is.null(startTime)) {
+              stopifnot(is.character(startTime))
+              jc <- callJStatic("org.apache.spark.sql.functions",
+                                "window",
+                                x@jc, windowDuration, windowDuration, startTime)
+            } else {
+              jc <- callJStatic("org.apache.spark.sql.functions",
+                                "window",
+                                x@jc, windowDuration)
+            }
+            column(jc)
+          })
+
 #' locate
 #'
 #' Locate the position of the first occurrence of substr.
R/pkg/R/generics.R

Lines changed: 4 additions & 0 deletions
@@ -1152,6 +1152,10 @@ setGeneric("var_samp", function(x) { standardGeneric("var_samp") })
 #' @export
 setGeneric("weekofyear", function(x) { standardGeneric("weekofyear") })

+#' @rdname window
+#' @export
+setGeneric("window", function(x, ...) { standardGeneric("window") })
+
 #' @rdname year
 #' @export
 setGeneric("year", function(x) { standardGeneric("year") })

R/pkg/R/mllib.R

Lines changed: 62 additions & 29 deletions
@@ -32,6 +32,11 @@ setClass("NaiveBayesModel", representation(jobj = "jobj"))
 #' @export
 setClass("AFTSurvivalRegressionModel", representation(jobj = "jobj"))

+#' @title S4 class that represents a KMeansModel
+#' @param jobj a Java object reference to the backing Scala KMeansModel
+#' @export
+setClass("KMeansModel", representation(jobj = "jobj"))
+
 #' Fits a generalized linear model
 #'
 #' Fits a generalized linear model, similarly to R's glm(). Also see the glmnet package.
@@ -154,17 +159,6 @@ setMethod("summary", signature(object = "PipelineModel"),
             colnames(coefficients) <- c("Estimate")
             rownames(coefficients) <- unlist(features)
             return(list(coefficients = coefficients))
-          } else if (modelName == "KMeansModel") {
-            modelSize <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
-                                     "getKMeansModelSize", object@model)
-            cluster <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
-                                   "getKMeansCluster", object@model, "classes")
-            k <- unlist(modelSize)[1]
-            size <- unlist(modelSize)[-1]
-            coefficients <- t(matrix(coefficients, ncol = k))
-            colnames(coefficients) <- unlist(features)
-            rownames(coefficients) <- 1:k
-            return(list(coefficients = coefficients, size = size, cluster = dataFrame(cluster)))
           } else {
             stop(paste("Unsupported model", modelName, sep = " "))
           }
@@ -213,21 +207,21 @@ setMethod("summary", signature(object = "NaiveBayesModel"),
 #' @examples
 #' \dontrun{
 #' model <- kmeans(x, centers = 2, algorithm="random")
-#'}
+#' }
 setMethod("kmeans", signature(x = "DataFrame"),
           function(x, centers, iter.max = 10, algorithm = c("random", "k-means||")) {
             columnNames <- as.array(colnames(x))
             algorithm <- match.arg(algorithm)
-            model <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers", "fitKMeans", x@sdf,
-                                 algorithm, iter.max, centers, columnNames)
-            return(new("PipelineModel", model = model))
+            jobj <- callJStatic("org.apache.spark.ml.r.KMeansWrapper", "fit", x@sdf,
+                                centers, iter.max, algorithm, columnNames)
+            return(new("KMeansModel", jobj = jobj))
           })

-#' Get fitted result from a model
+#' Get fitted result from a k-means model
 #'
-#' Get fitted result from a model, similarly to R's fitted().
+#' Get fitted result from a k-means model, similarly to R's fitted().
 #'
-#' @param object A fitted MLlib model
+#' @param object A fitted k-means model
 #' @return DataFrame containing fitted values
 #' @rdname fitted
 #' @export
@@ -237,19 +231,58 @@ setMethod("kmeans", signature(x = "DataFrame"),
 #' fitted.model <- fitted(model)
 #' showDF(fitted.model)
 #'}
-setMethod("fitted", signature(object = "PipelineModel"),
+setMethod("fitted", signature(object = "KMeansModel"),
           function(object, method = c("centers", "classes"), ...) {
-            modelName <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
-                                     "getModelName", object@model)
+            method <- match.arg(method)
+            return(dataFrame(callJMethod(object@jobj, "fitted", method)))
+          })

-            if (modelName == "KMeansModel") {
-              method <- match.arg(method)
-              fittedResult <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
-                                          "getKMeansCluster", object@model, method)
-              return(dataFrame(fittedResult))
-            } else {
-              stop(paste("Unsupported model", modelName, sep = " "))
-            }
+#' Get the summary of a k-means model
+#'
+#' Returns the summary of a k-means model produced by kmeans(),
+#' similarly to R's summary().
+#'
+#' @param object a fitted k-means model
+#' @return the model's coefficients, size and cluster
+#' @rdname summary
+#' @export
+#' @examples
+#' \dontrun{
+#' model <- kmeans(trainingData, 2)
+#' summary(model)
+#' }
+setMethod("summary", signature(object = "KMeansModel"),
+          function(object, ...) {
+            jobj <- object@jobj
+            features <- callJMethod(jobj, "features")
+            coefficients <- callJMethod(jobj, "coefficients")
+            cluster <- callJMethod(jobj, "cluster")
+            k <- callJMethod(jobj, "k")
+            size <- callJMethod(jobj, "size")
+            coefficients <- t(matrix(coefficients, ncol = k))
+            colnames(coefficients) <- unlist(features)
+            rownames(coefficients) <- 1:k
+            return(list(coefficients = coefficients, size = size, cluster = dataFrame(cluster)))
+          })
+
+#' Make predictions from a k-means model
+#'
+#' Make predictions from a model produced by kmeans().
+#'
+#' @param object A fitted k-means model
+#' @param newData DataFrame for testing
+#' @return DataFrame containing predicted labels in a column named "prediction"
+#' @rdname predict
+#' @export
+#' @examples
+#' \dontrun{
+#' model <- kmeans(trainingData, 2)
+#' predicted <- predict(model, testData)
+#' showDF(predicted)
+#' }
+setMethod("predict", signature(object = "KMeansModel"),
          function(object, newData) {
+            return(dataFrame(callJMethod(object@jobj, "transform", newData@sdf)))
           })

 #' Fit a Bernoulli naive Bayes model
R/pkg/inst/tests/testthat/test_context.R

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@ test_that("Check masked functions", {
   maskedBySparkR <- masked[funcSparkROrEmpty]
   namesOfMasked <- c("describe", "cov", "filter", "lag", "na.omit", "predict", "sd", "var",
                      "colnames", "colnames<-", "intersect", "rank", "rbind", "sample", "subset",
-                     "summary", "transform", "drop")
+                     "summary", "transform", "drop", "window")
   expect_equal(length(maskedBySparkR), length(namesOfMasked))
   expect_equal(sort(maskedBySparkR), sort(namesOfMasked))
   # above are those reported as masked when `library(SparkR)`
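This test update reflects that the new window() generic masks stats::window once SparkR is attached. A minimal sketch of checking that interactively, assuming a SparkR session; this is not part of the commit:

library(SparkR)
# "window" should now appear among the names base R reports as masked.
"window" %in% conflicts()
# The base version stays reachable through its namespace.
stats::window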

R/pkg/inst/tests/testthat/test_rdd.R

Lines changed: 8 additions & 0 deletions
@@ -791,3 +791,11 @@ test_that("sampleByKey() on pairwise RDDs", {
   expect_equal(lookup(sample, 3)[which.min(lookup(sample, 3))] >= 0, TRUE)
   expect_equal(lookup(sample, 3)[which.max(lookup(sample, 3))] <= 2000, TRUE)
 })
+
+test_that("Test correct concurrency of RRDD.compute()", {
+  rdd <- parallelize(sc, 1:1000, 100)
+  jrdd <- getJRDD(lapply(rdd, function(x) { x }), "row")
+  zrdd <- callJMethod(jrdd, "zip", jrdd)
+  count <- callJMethod(zrdd, "count")
+  expect_equal(count, 1000)
+})

R/pkg/inst/tests/testthat/test_sparkSQL.R

Lines changed: 36 additions & 0 deletions
@@ -1204,6 +1204,42 @@ test_that("greatest() and least() on a DataFrame", {
   expect_equal(collect(select(df, least(df$a, df$b)))[, 1], c(1, 3))
 })

+test_that("time windowing (window()) with all inputs", {
+  df <- createDataFrame(sqlContext, data.frame(t = c("2016-03-11 09:00:07"), v = c(1)))
+  df$window <- window(df$t, "5 seconds", "5 seconds", "0 seconds")
+  local <- collect(df)$v
+  # Not checking time windows because of possible time zone issues. Just checking that the function
+  # works
+  expect_equal(local, c(1))
+})
+
+test_that("time windowing (window()) with slide duration", {
+  df <- createDataFrame(sqlContext, data.frame(t = c("2016-03-11 09:00:07"), v = c(1)))
+  df$window <- window(df$t, "5 seconds", "2 seconds")
+  local <- collect(df)$v
+  # Not checking time windows because of possible time zone issues. Just checking that the function
+  # works
+  expect_equal(local, c(1, 1))
+})
+
+test_that("time windowing (window()) with start time", {
+  df <- createDataFrame(sqlContext, data.frame(t = c("2016-03-11 09:00:07"), v = c(1)))
+  df$window <- window(df$t, "5 seconds", startTime = "2 seconds")
+  local <- collect(df)$v
+  # Not checking time windows because of possible time zone issues. Just checking that the function
+  # works
+  expect_equal(local, c(1))
+})
+
+test_that("time windowing (window()) with just window duration", {
+  df <- createDataFrame(sqlContext, data.frame(t = c("2016-03-11 09:00:07"), v = c(1)))
+  df$window <- window(df$t, "5 seconds")
+  local <- collect(df)$v
+  # Not checking time windows because of possible time zone issues. Just checking that the function
+  # works
+  expect_equal(local, c(1))
+})
+
 test_that("when(), otherwise() and ifelse() on a DataFrame", {
   l <- list(list(a = 1, b = 2), list(a = 3, b = 4))
   df <- createDataFrame(sqlContext, l)