apache · nssalian · Aug 10, 2015 · Aug 10, 2015 · Aug 10, 2015 · Aug 10, 2015
diff --git a/.rat-excludes b/.rat-excludes
@@ -94,3 +94,4 @@ INDEX
 gen-java.*
 .*avpr
 org.apache.spark.sql.sources.DataSourceRegister
+.*parquet
diff --git a/R/README.md b/R/README.md
@@ -63,5 +63,7 @@ You can also run the unit-tests for SparkR by running (you need to install the [
 The `./bin/spark-submit` and `./bin/sparkR` can also be used to submit jobs to YARN clusters. You will need to set YARN conf dir before doing so. For example on CDH you can run
 ```
 export YARN_CONF_DIR=/etc/hadoop/conf
+./bin/spark-submit --master yarn --deploy-mode cluster examples/src/main/r/dataframe.R
+# or, leaving the deploy mode unspecified (use `--deploy-mode client` to request client mode explicitly):
 ./bin/spark-submit --master yarn examples/src/main/r/dataframe.R
 ```
diff --git a/R/install-dev.bat b/R/install-dev.bat
@@ -25,8 +25,3 @@ set SPARK_HOME=%~dp0..
 MKDIR %SPARK_HOME%\R\lib
 
 R.exe CMD INSTALL --library="%SPARK_HOME%\R\lib"  %SPARK_HOME%\R\pkg\
-
-rem Zip the SparkR package so that it can be distributed to worker nodes on YARN
-pushd %SPARK_HOME%\R\lib
-%JAVA_HOME%\bin\jar.exe cfM "%SPARK_HOME%\R\lib\sparkr.zip" SparkR
-popd
diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: SparkR
 Type: Package
 Title: R frontend for Spark
-Version: 1.4.0
+Version: 1.5.0
 Date: 2013-09-09
 Author: The Apache Software Foundation
 Maintainer: Shivaram Venkataraman <[email protected]>
@@ -29,6 +29,7 @@ Collate:
     'client.R'
     'context.R'
     'deserialize.R'
+    'functions.R'
     'mllib.R'
     'serialize.R'
     'sparkR.R'

diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
@@ -84,57 +84,136 @@ exportClasses("Column")
 
 exportMethods("abs",
               "acos",
+              "add_months",
               "alias",
               "approxCountDistinct",
               "asc",
+              "ascii",
               "asin",
               "atan",
               "atan2",
               "avg",
+              "base64",
               "between",
+              "bin",
+              "bitwiseNOT",
               "cast",
               "cbrt",
+              "ceil",
               "ceiling",
+              "concat",
+              "concat_ws",
               "contains",
+              "conv",
               "cos",
               "cosh",
+              "count",
               "countDistinct",
+              "crc32",
+              "date_add",
+              "date_format",
+              "date_sub",
+              "datediff",
+              "dayofmonth",
+              "dayofyear",
               "desc",
               "endsWith",
               "exp",
+              "explode",
               "expm1",
+              "expr",
+              "factorial",
+              "first",
               "floor",
+              "format_number",
+              "format_string",
+              "from_unixtime",
+              "from_utc_timestamp",
               "getField",
               "getItem",
+              "greatest",
+              "hex",
+              "hour",
               "hypot",
+              "ifelse",
+              "initcap",
+              "instr",
+              "isNaN",
               "isNotNull",
               "isNull",
               "last",
+              "last_day",
+              "least",
+              "length",
+              "levenshtein",
               "like",
+              "lit",
+              "locate",
               "log",
               "log10",
               "log1p",
+              "log2",
               "lower",
+              "lpad",
+              "ltrim",
               "max",
+              "md5",
               "mean",
               "min",
+              "minute",
+              "month",
+              "months_between",
               "n",
               "n_distinct",
+              "nanvl",
+              "negate",
+              "next_day",
+              "otherwise",
+              "pmod",
+              "quarter",
+              "rand",
+              "randn",
+              "regexp_extract",
+              "regexp_replace",
+              "reverse",
               "rint",
               "rlike",
+              "round",
+              "rpad",
+              "rtrim",
+              "second",
+              "sha1",
+              "sha2",
+              "shiftLeft",
+              "shiftRight",
+              "shiftRightUnsigned",
               "sign",
+              "signum",
               "sin",
               "sinh",
+              "size",
+              "soundex",
               "sqrt",
               "startsWith",
               "substr",
+              "substring_index",
               "sum",
               "sumDistinct",
               "tan",
               "tanh",
               "toDegrees",
               "toRadians",
-              "upper")
+              "to_date",
+              "to_utc_timestamp",
+              "translate",
+              "trim",
+              "unbase64",
+              "unhex",
+              "unix_timestamp",
+              "upper",
+              "weekofyear",
+              "when",
+              "year")
 
 exportClasses("GroupedData")
 exportMethods("agg")

diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R
@@ -60,12 +60,6 @@ operators <- list(
 )
 column_functions1 <- c("asc", "desc", "isNull", "isNotNull")
 column_functions2 <- c("like", "rlike", "startsWith", "endsWith", "getField", "getItem", "contains")
-functions <- c("min", "max", "sum", "avg", "mean", "count", "abs", "sqrt",
-               "first", "last", "lower", "upper", "sumDistinct",
-               "acos", "asin", "atan", "cbrt", "ceiling", "cos", "cosh", "exp",
-               "expm1", "floor", "log", "log10", "log1p", "rint", "sign",
-               "sin", "sinh", "tan", "tanh", "toDegrees", "toRadians")
-binary_mathfunctions <- c("atan2", "hypot")
 
 createOperator <- function(op) {
   setMethod(op,
@@ -111,33 +105,6 @@ createColumnFunction2 <- function(name) {
             })
 }
 
-createStaticFunction <- function(name) {
-  setMethod(name,
-            signature(x = "Column"),
-            function(x) {
-              if (name == "ceiling") {
-                  name <- "ceil"
-              }
-              if (name == "sign") {
-                  name <- "signum"
-              }
-              jc <- callJStatic("org.apache.spark.sql.functions", name, x@jc)
-              column(jc)
-            })
-}
-
-createBinaryMathfunctions <- function(name) {
-  setMethod(name,
-            signature(y = "Column"),
-            function(y, x) {
-              if (class(x) == "Column") {
-                x <- x@jc
-              }
-              jc <- callJStatic("org.apache.spark.sql.functions", name, y@jc, x)
-              column(jc)
-            })
-}
-
 createMethods <- function() {
   for (op in names(operators)) {
     createOperator(op)
@@ -148,12 +115,6 @@ createMethods <- function() {
   for (name in column_functions2) {
     createColumnFunction2(name)
   }
-  for (x in functions) {
-    createStaticFunction(x)
-  }
-  for (name in binary_mathfunctions) {
-    createBinaryMathfunctions(name)
-  }
 }
 
 createMethods()
@@ -243,44 +204,16 @@ setMethod("%in%",
             return(column(jc))
           })
 
-#' Approx Count Distinct
+#' otherwise
 #'
-#' @rdname column
-#' @return the approximate number of distinct items in a group.
-setMethod("approxCountDistinct",
-          signature(x = "Column"),
-          function(x, rsd = 0.95) {
-            jc <- callJStatic("org.apache.spark.sql.functions", "approxCountDistinct", x@jc, rsd)
-            column(jc)
-          })
-
-#' Count Distinct
+#' If values in the specified column are null, returns `value` instead.
+#' Can be used in conjunction with `when` to specify a default value for expressions.
 #'
 #' @rdname column
-#' @return the number of distinct items in a group.
-setMethod("countDistinct",
-          signature(x = "Column"),
-          function(x, ...) {
-            jcol <- lapply(list(...), function (x) {
-              x@jc
-            })
-            jc <- callJStatic("org.apache.spark.sql.functions", "countDistinct", x@jc,
-                              listToSeq(jcol))
+setMethod("otherwise", signature(x = "Column", value = "ANY"),
+          function(x, value) {
+            # Scalar if/else, not ifelse(): ifelse() is vectorised and strips attributes.
+            value <- if (inherits(value, "Column")) value@jc else value
+            jc <- callJMethod(x@jc, "otherwise", value)
             column(jc)
           })
-
-#' @rdname column
-#' @aliases countDistinct
-setMethod("n_distinct",
-          signature(x = "Column"),
-          function(x, ...) {
-            countDistinct(x, ...)
-          })
-
-#' @rdname column
-#' @aliases count
-setMethod("n",
-          signature(x = "Column"),
-          function(x) {
-            count(x)
-          })
diff --git a/R/pkg/R/deserialize.R b/R/pkg/R/deserialize.R
@@ -176,10 +176,14 @@ readRow <- function(inputCon) {
 
 # Take a single column as Array[Byte] and deserialize it into an atomic vector
 readCol <- function(inputCon, numRows) {
-  # sapply can not work with POSIXlt
-  do.call(c, lapply(1:numRows, function(x) {
-    value <- readObject(inputCon)
-    # Replace NULL with NA so we can coerce to vectors
-    if (is.null(value)) NA else value
-  }))
+  if (numRows > 0) {  # guard: 1:numRows would count down (1, 0) when numRows is 0
+    # lapply + c() is used here because sapply can not work with POSIXlt
+    do.call(c, lapply(1:numRows, function(x) {
+      value <- readObject(inputCon)
+      # Replace NULL with NA so we can coerce to vectors
+      if (is.null(value)) NA else value
+    }))
+  } else {
+    vector()  # no rows to read: return an empty vector
+  }
 }