
Commit 09dc409

Merge branch 'master' of https://github.com/apache/spark into multiItems_2

2 parents: ae8c02d + 83670fc

526 files changed (+19735, -8959 lines)

R/pkg/NAMESPACE

Lines changed: 3 additions & 1 deletion
```diff
@@ -12,7 +12,8 @@ export("print.jobj")
 
 # MLlib integration
 exportMethods("glm",
-              "predict")
+              "predict",
+              "summary")
 
 # Job group lifecycle management methods
 export("setJobGroup",
@@ -26,6 +27,7 @@ exportMethods("arrange",
               "collect",
               "columns",
               "count",
+              "crosstab",
               "describe",
               "distinct",
               "dropna",
```

R/pkg/R/DataFrame.R

Lines changed: 28 additions & 0 deletions
```diff
@@ -1554,3 +1554,31 @@ setMethod("fillna",
           }
           dataFrame(sdf)
         })
+
+#' crosstab
+#'
+#' Computes a pair-wise frequency table of the given columns. Also known as a contingency
+#' table. The number of distinct values for each column should be less than 1e4. At most 1e6
+#' non-zero pair frequencies will be returned.
+#'
+#' @param col1 name of the first column. Distinct items will make the first item of each row.
+#' @param col2 name of the second column. Distinct items will make the column names of the output.
+#' @return a local R data.frame representing the contingency table. The first column of each row
+#'         will be the distinct values of `col1` and the column names will be the distinct values
+#'         of `col2`. The name of the first column will be `$col1_$col2`. Pairs that have no
+#'         occurrences will have zero as their counts.
+#'
+#' @rdname statfunctions
+#' @export
+#' @examples
+#' \dontrun{
+#' df <- jsonFile(sqlCtx, "/path/to/file.json")
+#' ct = crosstab(df, "title", "gender")
+#' }
+setMethod("crosstab",
+          signature(x = "DataFrame", col1 = "character", col2 = "character"),
+          function(x, col1, col2) {
+            statFunctions <- callJMethod(x@sdf, "stat")
+            sct <- callJMethod(statFunctions, "crosstab", col1, col2)
+            collect(dataFrame(sct))
+          })
```
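
A usage sketch for the new method, assuming an initialized SparkR session with a SQLContext `sqlCtx`; the file path and column names are illustrative:

```r
df <- jsonFile(sqlCtx, "/path/to/file.json")
ct <- crosstab(df, "title", "gender")
# `ct` is a local R data.frame: the first column, named "title_gender",
# holds the distinct titles; the remaining columns are the distinct
# genders, with a zero count for any pair that never occurs.
head(ct)
```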

R/pkg/R/backend.R

Lines changed: 3 additions & 1 deletion
```diff
@@ -110,6 +110,8 @@ invokeJava <- function(isStatic, objId, methodName, ...) {
 
   # TODO: check the status code to output error information
   returnStatus <- readInt(conn)
-  stopifnot(returnStatus == 0)
+  if (returnStatus != 0) {
+    stop(readString(conn))
+  }
   readObject(conn)
 }
```
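
The practical effect, sketched under the assumption that the backend writes an error string to the connection after a non-zero status (which is what the `readString(conn)` call implies): a JVM-side failure now surfaces its real message instead of a bare assertion failure.

```r
# Hypothetical failing call; the method name is illustrative.
msg <- tryCatch(
  callJMethod(df@sdf, "someFailingMethod"),
  error = function(e) conditionMessage(e)
)
# Before this change: "returnStatus == 0 is not TRUE"
# After: the error text the JVM wrote to the connection.
```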

R/pkg/R/client.R

Lines changed: 1 addition & 1 deletion
```diff
@@ -48,7 +48,7 @@ generateSparkSubmitArgs <- function(args, sparkHome, jars, sparkSubmitOpts, pack
     jars <- paste("--jars", jars)
   }
 
-  if (packages != "") {
+  if (!identical(packages, "")) {
     packages <- paste("--packages", packages)
   }
 
```
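
The reason for `identical()`: `packages` may be a character vector, and `!=` is vectorized, so the old condition could hand `if` a multi-element logical. A quick illustration:

```r
pkgs <- c("A", "B")
pkgs != ""            # c(TRUE, TRUE): a length-2 logical
identical(pkgs, "")   # a single FALSE, always safe inside if ()
# `if (pkgs != "")` warned "the condition has length > 1" on R < 4.2
# and is an error from R 4.2 onward.
```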

R/pkg/R/deserialize.R

Lines changed: 2 additions & 2 deletions
```diff
@@ -102,11 +102,11 @@ readList <- function(con) {
 
 readRaw <- function(con) {
   dataLen <- readInt(con)
-  data <- readBin(con, raw(), as.integer(dataLen), endian = "big")
+  readBin(con, raw(), as.integer(dataLen), endian = "big")
 }
 
 readRawLen <- function(con, dataLen) {
-  data <- readBin(con, raw(), as.integer(dataLen), endian = "big")
+  readBin(con, raw(), as.integer(dataLen), endian = "big")
 }
 
 readDeserialize <- function(con) {
```
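
These two hunks are pure cleanups: an R function returns the value of its last evaluated expression, so the assignment to a never-read local `data` was dead code. For example:

```r
f <- function(con, n) { data <- readBin(con, raw(), n) }  # returns the value (invisibly)
g <- function(con, n) { readBin(con, raw(), n) }          # same result, stated directly
identical(f(rawConnection(as.raw(1:4)), 2), g(rawConnection(as.raw(1:4)), 2))  # TRUE
```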

R/pkg/R/generics.R

Lines changed: 11 additions & 7 deletions
```diff
@@ -59,6 +59,10 @@ setGeneric("count", function(x) { standardGeneric("count") })
 # @export
 setGeneric("countByValue", function(x) { standardGeneric("countByValue") })
 
+# @rdname statfunctions
+# @export
+setGeneric("crosstab", function(x, col1, col2) { standardGeneric("crosstab") })
+
 # @rdname distinct
 # @export
 setGeneric("distinct", function(x, numPartitions = 1) { standardGeneric("distinct") })
@@ -250,8 +254,10 @@ setGeneric("flatMapValues", function(X, FUN) { standardGeneric("flatMapValues")
 
 # @rdname intersection
 # @export
-setGeneric("intersection", function(x, other, numPartitions = 1) {
-  standardGeneric("intersection") })
+setGeneric("intersection",
+           function(x, other, numPartitions = 1) {
+             standardGeneric("intersection")
+           })
 
 # @rdname keys
 # @export
@@ -485,9 +491,7 @@ setGeneric("sample",
 #' @rdname sample
 #' @export
 setGeneric("sample_frac",
-           function(x, withReplacement, fraction, seed) {
-             standardGeneric("sample_frac")
-           })
+           function(x, withReplacement, fraction, seed) { standardGeneric("sample_frac") })
 
 #' @rdname saveAsParquetFile
 #' @export
@@ -549,8 +553,8 @@ setGeneric("withColumn", function(x, colName, col) { standardGeneric("withColumn
 
 #' @rdname withColumnRenamed
 #' @export
-setGeneric("withColumnRenamed", function(x, existingCol, newCol) {
-  standardGeneric("withColumnRenamed") })
+setGeneric("withColumnRenamed",
+           function(x, existingCol, newCol) { standardGeneric("withColumnRenamed") })
 
 
 ###################### Column Methods ##########################
```
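
Most hunks here are lint-driven reflows; the substantive addition is the `crosstab` generic. For readers less familiar with S4, a minimal sketch of the `setGeneric`/`setMethod` pattern, using a hypothetical `data.frame` method rather than SparkR's real one (which lives in DataFrame.R and dispatches on the S4 `DataFrame` class):

```r
library(methods)
setGeneric("crosstab", function(x, col1, col2) { standardGeneric("crosstab") })
setMethod("crosstab",
          signature(x = "data.frame", col1 = "character", col2 = "character"),
          function(x, col1, col2) { table(x[[col1]], x[[col2]]) })
crosstab(data.frame(a = c("x", "x"), b = c("y", "z")), "a", "b")
```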

R/pkg/R/mllib.R

Lines changed: 27 additions & 1 deletion
```diff
@@ -27,7 +27,7 @@ setClass("PipelineModel", representation(model = "jobj"))
 #' Fits a generalized linear model, similarly to R's glm(). Also see the glmnet package.
 #'
 #' @param formula A symbolic description of the model to be fitted. Currently only a few formula
-#'                operators are supported, including '~' and '+'.
+#'                operators are supported, including '~', '+', '-', and '.'.
 #' @param data DataFrame for training
 #' @param family Error distribution. "gaussian" -> linear regression, "binomial" -> logistic reg.
 #' @param lambda Regularization parameter
@@ -71,3 +71,29 @@ setMethod("predict", signature(object = "PipelineModel"),
           function(object, newData) {
             return(dataFrame(callJMethod(object@model, "transform", newData@sdf)))
           })
+
+#' Get the summary of a model
+#'
+#' Returns the summary of a model produced by glm(), similarly to R's summary().
+#'
+#' @param model A fitted MLlib model
+#' @return a list with a 'coefficients' component, which is the matrix of coefficients. See
+#'         summary.glm for more information.
+#' @rdname glm
+#' @export
+#' @examples
+#' \dontrun{
+#' model <- glm(y ~ x, trainingData)
+#' summary(model)
+#' }
+setMethod("summary", signature(object = "PipelineModel"),
+          function(object) {
+            features <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
+                                    "getModelFeatures", object@model)
+            weights <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
+                                   "getModelWeights", object@model)
+            coefficients <- as.matrix(unlist(weights))
+            colnames(coefficients) <- c("Estimate")
+            rownames(coefficients) <- unlist(features)
+            return(list(coefficients = coefficients))
+          })
```
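
A usage sketch for the new `summary` method; the DataFrame and column names are illustrative:

```r
# Assumes an initialized SparkR session and a DataFrame `training`
# with numeric columns y and x.
model <- glm(y ~ x, data = training, family = "gaussian")
s <- summary(model)
s$coefficients
#             Estimate
# (Intercept)     ...   # a one-column "Estimate" matrix; row names are
# x               ...   # the feature names returned by the JVM wrapper
```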

R/pkg/R/pairRDD.R

Lines changed: 2 additions & 2 deletions
```diff
@@ -202,8 +202,8 @@ setMethod("partitionBy",
 
             packageNamesArr <- serialize(.sparkREnv$.packages,
                                          connection = NULL)
-            broadcastArr <- lapply(ls(.broadcastNames), function(name) {
-              get(name, .broadcastNames) })
+            broadcastArr <- lapply(ls(.broadcastNames),
+                                   function(name) { get(name, .broadcastNames) })
             jrdd <- getJRDD(x)
 
             # We create a PairwiseRRDD that extends RDD[(Int, Array[Byte])],
```

R/pkg/R/sparkR.R

Lines changed: 6 additions & 6 deletions
```diff
@@ -22,7 +22,8 @@
 connExists <- function(env) {
   tryCatch({
     exists(".sparkRCon", envir = env) && isOpen(env[[".sparkRCon"]])
-  }, error = function(err) {
+  },
+  error = function(err) {
     return(FALSE)
   })
 }
@@ -104,16 +105,13 @@ sparkR.init <- function(
     return(get(".sparkRjsc", envir = .sparkREnv))
   }
 
-  sparkMem <- Sys.getenv("SPARK_MEM", "1024m")
   jars <- suppressWarnings(normalizePath(as.character(sparkJars)))
 
   # Classpath separator is ";" on Windows
   # URI needs four /// as from http://stackoverflow.com/a/18522792
   if (.Platform$OS.type == "unix") {
-    collapseChar <- ":"
     uriSep <- "//"
   } else {
-    collapseChar <- ";"
     uriSep <- "////"
   }
 
@@ -156,7 +154,8 @@ sparkR.init <- function(
   .sparkREnv$backendPort <- backendPort
   tryCatch({
     connectBackend("localhost", backendPort)
-  }, error = function(err) {
+  },
+  error = function(err) {
     stop("Failed to connect JVM\n")
   })
 
@@ -267,7 +266,8 @@ sparkRHive.init <- function(jsc = NULL) {
   ssc <- callJMethod(sc, "sc")
   hiveCtx <- tryCatch({
     newJObject("org.apache.spark.sql.hive.HiveContext", ssc)
-  }, error = function(err) {
+  },
+  error = function(err) {
     stop("Spark SQL is not built with Hive support")
   })
 
```

R/pkg/inst/tests/test_client.R

Lines changed: 4 additions & 0 deletions
```diff
@@ -30,3 +30,7 @@ test_that("no package specified doesn't add packages flag", {
   expect_equal(gsub("[[:space:]]", "", args),
                "")
 })
+
+test_that("multiple packages don't produce a warning", {
+  expect_that(generateSparkSubmitArgs("", "", "", "", c("A", "B")), not(gives_warning()))
+})
```
