
Commit 96460f1

merge conflict

2 parents: 20a2811 + 5cd6a63
File tree: 601 files changed (+5448, −2640 lines). Only a subset of the changed files is shown below.

LICENSE

Lines changed: 46 additions & 0 deletions
@@ -853,6 +853,52 @@ and
 
 Vis.js may be distributed under either license.
 
+========================================================================
+For dagre-d3 (core/src/main/resources/org/apache/spark/ui/static/dagre-d3.min.js):
+========================================================================
+Copyright (c) 2013 Chris Pettitt
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+========================================================================
+For graphlib-dot (core/src/main/resources/org/apache/spark/ui/static/graphlib-dot.min.js):
+========================================================================
+Copyright (c) 2012-2013 Chris Pettitt
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
 ========================================================================
 BSD-style licenses
 ========================================================================

R/pkg/NAMESPACE

Lines changed: 2 additions & 0 deletions
@@ -19,9 +19,11 @@ exportMethods("arrange",
               "count",
               "describe",
               "distinct",
+              "dropna",
               "dtypes",
               "except",
               "explain",
+              "fillna",
               "filter",
               "first",
               "group_by",

R/pkg/R/DataFrame.R

Lines changed: 129 additions & 6 deletions
@@ -1314,9 +1314,8 @@ setMethod("except",
 #' write.df(df, "myfile", "parquet", "overwrite")
 #' }
 setMethod("write.df",
-          signature(df = "DataFrame", path = 'character', source = 'character',
-                    mode = 'character'),
-          function(df, path = NULL, source = NULL, mode = "append", ...){
+          signature(df = "DataFrame", path = 'character'),
+          function(df, path, source = NULL, mode = "append", ...){
             if (is.null(source)) {
               sqlContext <- get(".sparkRSQLsc", envir = .sparkREnv)
               source <- callJMethod(sqlContext, "getConf", "spark.sql.sources.default",
@@ -1338,9 +1337,8 @@ setMethod("write.df",
 #' @aliases saveDF
 #' @export
 setMethod("saveDF",
-          signature(df = "DataFrame", path = 'character', source = 'character',
-                    mode = 'character'),
-          function(df, path = NULL, source = NULL, mode = "append", ...){
+          signature(df = "DataFrame", path = 'character'),
+          function(df, path, source = NULL, mode = "append", ...){
             write.df(df, path, source, mode, ...)
           })
 
@@ -1431,3 +1429,128 @@ setMethod("describe",
             sdf <- callJMethod(x@sdf, "describe", listToSeq(colList))
             dataFrame(sdf)
           })
+
+#' dropna
+#'
+#' Returns a new DataFrame omitting rows with null values.
+#'
+#' @param x A SparkSQL DataFrame.
+#' @param how "any" or "all".
+#'            if "any", drop a row if it contains any nulls.
+#'            if "all", drop a row only if all its values are null.
+#'            if minNonNulls is specified, how is ignored.
+#' @param minNonNulls If specified, drop rows that have less than
+#'                    minNonNulls non-null values.
+#'                    This overwrites the how parameter.
+#' @param cols Optional list of column names to consider.
+#' @return A DataFrame
+#'
+#' @rdname nafunctions
+#' @export
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' sqlCtx <- sparkRSQL.init(sc)
+#' path <- "path/to/file.json"
+#' df <- jsonFile(sqlCtx, path)
+#' dropna(df)
+#' }
+setMethod("dropna",
+          signature(x = "DataFrame"),
+          function(x, how = c("any", "all"), minNonNulls = NULL, cols = NULL) {
+            how <- match.arg(how)
+            if (is.null(cols)) {
+              cols <- columns(x)
+            }
+            if (is.null(minNonNulls)) {
+              minNonNulls <- if (how == "any") { length(cols) } else { 1 }
+            }
+
+            naFunctions <- callJMethod(x@sdf, "na")
+            sdf <- callJMethod(naFunctions, "drop",
+                               as.integer(minNonNulls), listToSeq(as.list(cols)))
+            dataFrame(sdf)
+          })
+
+#' @aliases dropna
+#' @export
+setMethod("na.omit",
+          signature(x = "DataFrame"),
+          function(x, how = c("any", "all"), minNonNulls = NULL, cols = NULL) {
+            dropna(x, how, minNonNulls, cols)
+          })
+
+#' fillna
+#'
+#' Replace null values.
+#'
+#' @param x A SparkSQL DataFrame.
+#' @param value Value to replace null values with.
+#'              Should be an integer, numeric, character or named list.
+#'              If the value is a named list, then cols is ignored and
+#'              value must be a mapping from column name (character) to
+#'              replacement value. The replacement value must be an
+#'              integer, numeric or character.
+#' @param cols Optional list of column names to consider.
+#'             Columns specified in cols that do not have a matching data
+#'             type are ignored. For example, if value is a character, and
+#'             subset contains a non-character column, then the non-character
+#'             column is simply ignored.
+#' @return A DataFrame
+#'
+#' @rdname nafunctions
+#' @export
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' sqlCtx <- sparkRSQL.init(sc)
+#' path <- "path/to/file.json"
+#' df <- jsonFile(sqlCtx, path)
+#' fillna(df, 1)
+#' fillna(df, list("age" = 20, "name" = "unknown"))
+#' }
setMethod("fillna",
+          signature(x = "DataFrame"),
+          function(x, value, cols = NULL) {
+            if (!(class(value) %in% c("integer", "numeric", "character", "list"))) {
+              stop("value should be an integer, numeric, character or named list.")
+            }
+
+            if (class(value) == "list") {
+              # Check column names in the named list
+              colNames <- names(value)
+              if (length(colNames) == 0 || !all(colNames != "")) {
+                stop("value should be a named list with each name being a column name.")
+              }
+
+              # Convert the named list to an environment to be passed to the JVM
+              valueMap <- new.env()
+              for (col in colNames) {
+                # Check that each item in the named list is of a valid type
+                v <- value[[col]]
+                if (!(class(v) %in% c("integer", "numeric", "character"))) {
+                  stop("Each item in value should be an integer, numeric or character.")
+                }
+                valueMap[[col]] <- v
+              }
+
+              # When value is a named list, the caller is expected not to pass in cols
+              if (!is.null(cols)) {
+                warning("When value is a named list, cols is ignored!")
+                cols <- NULL
+              }
+
+              value <- valueMap
+            } else if (is.integer(value)) {
+              # Cast an integer to a numeric
+              value <- as.numeric(value)
+            }
+
+            naFunctions <- callJMethod(x@sdf, "na")
+            sdf <- if (length(cols) == 0) {
+              callJMethod(naFunctions, "fill", value)
+            } else {
+              callJMethod(naFunctions, "fill", value, listToSeq(as.list(cols)))
+            }
+            dataFrame(sdf)
+          })
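Taken together, dropna, na.omit and fillna expose Spark's DataFrameNaFunctions ("na") API to R. A minimal usage sketch, assuming a local SparkR session and a hypothetical people.json input (the "age" and "name" columns are illustrative):

    sc <- sparkR.init()
    sqlCtx <- sparkRSQL.init(sc)
    df <- jsonFile(sqlCtx, "path/to/people.json")   # hypothetical path

    # Drop rows containing any null, or only rows that are entirely null
    cleaned <- dropna(df, how = "any")
    sparse  <- dropna(df, how = "all")

    # Keep rows with at least 2 non-null values among the listed columns
    kept <- dropna(df, minNonNulls = 2, cols = list("age", "name"))

    # Replace nulls with one value everywhere, or per column via a named list
    filled <- fillna(df, 0)
    filled <- fillna(df, list("age" = 20, "name" = "unknown"))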

R/pkg/R/SQLContext.R

Lines changed: 5 additions & 0 deletions
@@ -457,6 +457,11 @@ read.df <- function(sqlContext, path = NULL, source = NULL, ...) {
   if (!is.null(path)) {
     options[['path']] <- path
   }
+  if (is.null(source)) {
+    sqlContext <- get(".sparkRSQLsc", envir = .sparkREnv)
+    source <- callJMethod(sqlContext, "getConf", "spark.sql.sources.default",
+                          "org.apache.spark.sql.parquet")
+  }
   sdf <- callJMethod(sqlContext, "load", source, options)
   dataFrame(sdf)
 }
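With this change read.df no longer requires a source argument: it falls back to the spark.sql.sources.default configuration, which itself defaults to Parquet. A minimal sketch, with a hypothetical Parquet path:

    df <- read.df(sqlCtx, "path/to/people.parquet")   # source resolved from spark.sql.sources.default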

R/pkg/R/generics.R

Lines changed: 20 additions & 2 deletions
@@ -396,6 +396,20 @@ setGeneric("columns", function(x) {standardGeneric("columns") })
 #' @export
 setGeneric("describe", function(x, col, ...) { standardGeneric("describe") })
 
+#' @rdname nafunctions
+#' @export
+setGeneric("dropna",
+           function(x, how = c("any", "all"), minNonNulls = NULL, cols = NULL) {
+             standardGeneric("dropna")
+           })
+
+#' @rdname nafunctions
+#' @export
+setGeneric("na.omit",
+           function(x, how = c("any", "all"), minNonNulls = NULL, cols = NULL) {
+             standardGeneric("na.omit")
+           })
+
 #' @rdname schema
 #' @export
 setGeneric("dtypes", function(x) { standardGeneric("dtypes") })
@@ -408,6 +422,10 @@ setGeneric("explain", function(x, ...) { standardGeneric("explain") })
 #' @export
 setGeneric("except", function(x, y) { standardGeneric("except") })
 
+#' @rdname nafunctions
+#' @export
+setGeneric("fillna", function(x, value, cols = NULL) { standardGeneric("fillna") })
+
 #' @rdname filter
 #' @export
 setGeneric("filter", function(x, condition) { standardGeneric("filter") })
@@ -482,11 +500,11 @@ setGeneric("saveAsTable", function(df, tableName, source, mode, ...) {
 
 #' @rdname write.df
 #' @export
-setGeneric("write.df", function(df, path, source, mode, ...) { standardGeneric("write.df") })
+setGeneric("write.df", function(df, path, ...) { standardGeneric("write.df") })
 
 #' @rdname write.df
 #' @export
-setGeneric("saveDF", function(df, path, source, mode, ...) { standardGeneric("saveDF") })
+setGeneric("saveDF", function(df, path, ...) { standardGeneric("saveDF") })
 
 #' @rdname schema
 #' @export
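Moving source and mode into "..." keeps the generics permissive while the DataFrame methods above supply the defaults. A minimal sketch of the relaxed call sites (the output path is hypothetical):

    write.df(df, "people_out")                           # default source, mode = "append"
    write.df(df, "people_out", "parquet", "overwrite")   # explicit arguments still accepted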

R/pkg/R/serialize.R

Lines changed: 9 additions & 1 deletion
@@ -160,6 +160,14 @@ writeList <- function(con, arr) {
   }
 }
 
+# Used to pass arrays where the elements can be of different types
+writeGenericList <- function(con, list) {
+  writeInt(con, length(list))
+  for (elem in list) {
+    writeObject(con, elem)
+  }
+}
+
 # Used to pass in hash maps required on Java side.
 writeEnv <- function(con, env) {
   len <- length(env)
@@ -168,7 +176,7 @@ writeEnv <- function(con, env) {
   if (len > 0) {
     writeList(con, as.list(ls(env)))
     vals <- lapply(ls(env), function(x) { env[[x]] })
-    writeList(con, as.list(vals))
+    writeGenericList(con, as.list(vals))
   }
 }
 
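writeGenericList is needed because writeList assumes all elements share one type, while the environment built by fillna can mix numerics and characters. A minimal sketch of the case it handles (these are internal SparkR SerDe functions, shown only to illustrate):

    env <- new.env()
    env[["age"]] <- 20          # numeric
    env[["name"]] <- "unknown"  # character
    # writeEnv(con, env) now serializes the values via writeGenericList,
    # tagging each element with its own type through writeObject.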

R/pkg/R/sparkR.R

Lines changed: 19 additions & 5 deletions
@@ -225,14 +225,21 @@ sparkR.init <- function(
 #' sqlContext <- sparkRSQL.init(sc)
 #'}
 
-sparkRSQL.init <- function(jsc) {
+sparkRSQL.init <- function(jsc = NULL) {
   if (exists(".sparkRSQLsc", envir = .sparkREnv)) {
     return(get(".sparkRSQLsc", envir = .sparkREnv))
   }
 
+  # If jsc is NULL, create a Spark Context
+  sc <- if (is.null(jsc)) {
+    sparkR.init()
+  } else {
+    jsc
+  }
+
   sqlContext <- callJStatic("org.apache.spark.sql.api.r.SQLUtils",
-                            "createSQLContext",
-                            jsc)
+                            "createSQLContext",
+                            sc)
   assign(".sparkRSQLsc", sqlContext, envir = .sparkREnv)
   sqlContext
 }
@@ -249,12 +256,19 @@ sparkRSQL.init <- function(jsc) {
 #' sqlContext <- sparkRHive.init(sc)
 #'}
 
-sparkRHive.init <- function(jsc) {
+sparkRHive.init <- function(jsc = NULL) {
   if (exists(".sparkRHivesc", envir = .sparkREnv)) {
     return(get(".sparkRHivesc", envir = .sparkREnv))
   }
 
-  ssc <- callJMethod(jsc, "sc")
+  # If jsc is NULL, create a Spark Context
+  sc <- if (is.null(jsc)) {
+    sparkR.init()
+  } else {
+    jsc
+  }
+
+  ssc <- callJMethod(sc, "sc")
   hiveCtx <- tryCatch({
     newJObject("org.apache.spark.sql.hive.HiveContext", ssc)
   }, error = function(err) {
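With jsc now defaulting to NULL, both entry points can bootstrap a Spark context on their own. A minimal sketch:

    sqlContext <- sparkRSQL.init()    # no sc argument; sparkR.init() is called internally
    hiveContext <- sparkRHive.init()  # likewise reuses or creates a Spark context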
