apache · olarayej · Sep 24, 2015 · Sep 24, 2015 · Sep 24, 2015 · Sep 24, 2015
diff --git a/LICENSE b/LICENSE
diff --git a/NOTICE b/NOTICE
@@ -572,38 +572,3 @@ Copyright 2009-2013 The Apache Software Foundation
 
 Apache Avro IPC
 Copyright 2009-2013 The Apache Software Foundation
-
-
-Vis.js
-Copyright 2010-2015 Almende B.V.
-
-Vis.js is dual licensed under both
-
-  * The Apache 2.0 License
-    http://www.apache.org/licenses/LICENSE-2.0
-
-    and
-
-  * The MIT License
-    http://opensource.org/licenses/MIT
-
-Vis.js may be distributed under either license.
-
-
-Vis.js uses and redistributes the following third-party libraries:
-
-- component-emitter
-  https://github.com/component/emitter
-  The MIT License
-
-- hammer.js
-  http://hammerjs.github.io/
-  The MIT License
-
-- moment.js
-  http://momentjs.com/
-  The MIT License
-
-- keycharm
-  https://github.com/AlexDM0/keycharm
-  The MIT License
diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION
@@ -34,4 +34,5 @@ Collate:
     'serialize.R'
     'sparkR.R'
     'stats.R'
+    'types.R'
     'utils.R'
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
@@ -23,9 +23,11 @@ export("setJobGroup",
 exportClasses("DataFrame")
 
 exportMethods("arrange",
+              "as.data.frame",
               "attach",
               "cache",
               "collect",
+              "coltypes",
               "columns",
               "count",
               "cov",
@@ -264,4 +266,3 @@ export("structField",
        "structType.structField",
        "print.structType")
 
-export("as.data.frame")
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
@@ -2152,3 +2152,51 @@ setMethod("with",
             newEnv <- assignNewEnv(data)
             eval(substitute(expr), envir = newEnv, enclos = newEnv)
           })
+
+#' Returns the column types of a DataFrame.
+#'
+#' Each Spark data type reported by dtypes() is mapped to the name of the
+#' corresponding R type via PRIMITIVE_TYPES. Complex types (those whose name starts
+#' with a name in COMPLEX_TYPES) have no R equivalent and are returned unchanged.
+#'
+#' @name coltypes
+#' @title Get column types of a DataFrame
+#' @param x (DataFrame)
+#' @return value (character) A character vector with the column types of the given DataFrame
+#' @rdname coltypes
+setMethod("coltypes",
+          signature(x = "DataFrame"),
+          function(x) {
+            # Get the Spark data type of each column: the second element of every
+            # entry returned by dtypes()
+            types <- vapply(dtypes(x), function(field) { field[[2]] }, character(1))
+
+            # Map Spark data types to R data types using PRIMITIVE_TYPES and COMPLEX_TYPES
+            rTypes <- vapply(types, USE.NAMES = FALSE, FUN.VALUE = character(1),
+                             FUN = function(sparkType) {
+              # Check for primitive types
+              type <- PRIMITIVE_TYPES[[sparkType]]
+              if (is.null(type)) {
+                # Check for complex types, whose names carry the element types as a
+                # suffix (e.g. "map<string,string>"), hence the prefix match
+                typeName <- Filter(function(t) { substring(sparkType, 1, nchar(t)) == t },
+                                   names(COMPLEX_TYPES))
+                if (length(typeName) > 0) {
+                  # Use the first match so that `[[` always gets a single name
+                  type <- COMPLEX_TYPES[[typeName[[1]]]]
+                } else {
+                  stop(paste0("Unsupported data type: ", sparkType))
+                }
+              }
+              # COMPLEX_TYPES holds logical NA; coerce so every result is a character
+              as.character(type)
+            })
+
+            # Find which types don't have mapping to R
+            naIndices <- which(is.na(rTypes))
+
+            # Assign the original Spark data types to the unmatched ones
+            rTypes[naIndices] <- types[naIndices]
+
+            rTypes
+          })
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
@@ -1027,7 +1027,6 @@ setGeneric("weekofyear", function(x) { standardGeneric("weekofyear") })
 #' @export
 setGeneric("year", function(x) { standardGeneric("year") })
 
-
 #' @rdname glm
 #' @export
 setGeneric("glm")
@@ -1047,3 +1046,7 @@ setGeneric("attach")
 #' @rdname with
 #' @export
 setGeneric("with")
+
+#' @rdname coltypes
+#' @export
+setGeneric("coltypes", function(x) { standardGeneric("coltypes") })
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
@@ -27,7 +27,7 @@ setClass("PipelineModel", representation(model = "jobj"))
 #' Fits a generalized linear model, similarly to R's glm(). Also see the glmnet package.
 #'
 #' @param formula A symbolic description of the model to be fitted. Currently only a few formula
-#'                operators are supported, including '~', '.', ':', '+', and '-'.
+#'                operators are supported, including '~', '+', '-', and '.'.
 #' @param data DataFrame for training
 #' @param family Error distribution. "gaussian" -> linear regression, "binomial" -> logistic reg.
 #' @param lambda Regularization parameter
@@ -41,8 +41,7 @@ setClass("PipelineModel", representation(model = "jobj"))
 #' sqlContext <- sparkRSQL.init(sc)
 #' data(iris)
 #' df <- createDataFrame(sqlContext, iris)
-#' model <- glm(Sepal_Length ~ Sepal_Width, df, family="gaussian")
-#' summary(model)
+#' model <- glm(Sepal_Length ~ Sepal_Width, df)
 #'}
 setMethod("glm", signature(formula = "formula", family = "ANY", data = "DataFrame"),
           function(formula, family = c("gaussian", "binomial"), data, lambda = 0, alpha = 0,

diff --git a/R/pkg/R/schema.R b/R/pkg/R/schema.R
@@ -115,20 +115,7 @@ structField.jobj <- function(x) {
 }
 
 checkType <- function(type) {
-  primtiveTypes <- c("byte",
-                     "integer",
-                     "float",
-                     "double",
-                     "numeric",
-                     "character",
-                     "string",
-                     "binary",
-                     "raw",
-                     "logical",
-                     "boolean",
-                     "timestamp",
-                     "date")
-  if (type %in% primtiveTypes) {
+  if (type %in% names(PRIMITIVE_TYPES)) {
     return()
   } else {
     # Check complex types

diff --git a/R/pkg/R/types.R b/R/pkg/R/types.R
@@ -0,0 +1,43 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# types.R. This file handles the data type mapping between Spark and R
+
+# The primitive data types, where names(PRIMITIVE_TYPES) are Scala types whereas
+# values are equivalent R types. This is stored in an environment to allow for
+# more efficient look up (environments use hashmaps).
+PRIMITIVE_TYPES <- as.environment(list(
+  "byte"="integer",
+  "tinyint"="integer",
+  "smallint"="integer",
+  "integer"="integer",
+  "bigint"="numeric",
+  "float"="numeric",
+  "double"="numeric",
+  "decimal"="numeric",
+  "string"="character",
+  "binary"="raw",
+  "boolean"="logical",
+  "timestamp"="POSIXct",
+  "date"="Date"))
+
+# The complex data types. These do not have any direct mapping to R's types.
+COMPLEX_TYPES <- list(
+  "map"=NA,
+  "array"=NA,
+  "struct"=NA)
+
+# The full list of data types.
+DATA_TYPES <- as.environment(c(as.list(PRIMITIVE_TYPES), COMPLEX_TYPES))
diff --git a/R/pkg/inst/tests/test_mllib.R b/R/pkg/inst/tests/test_mllib.R
@@ -61,14 +61,6 @@ test_that("dot minus and intercept vs native glm", {
   expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
 })
 
-test_that("feature interaction vs native glm", {
-  training <- createDataFrame(sqlContext, iris)
-  model <- glm(Sepal_Width ~ Species:Sepal_Length, data = training)
-  vals <- collect(select(predict(model, training), "prediction"))
-  rVals <- predict(glm(Sepal.Width ~ Species:Sepal.Length, data = iris), iris)
-  expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-})
-
 test_that("summary coefficients match with native glm", {
   training <- createDataFrame(sqlContext, iris)
   stats <- summary(glm(Sepal_Width ~ Sepal_Length + Species, data = training, solver = "l-bfgs"))
@@ -77,7 +69,7 @@ test_that("summary coefficients match with native glm", {
   expect_true(all(abs(rCoefs - coefs) < 1e-6))
   expect_true(all(
     as.character(stats$features) ==
-    c("(Intercept)", "Sepal_Length", "Species_versicolor", "Species_virginica")))
+    c("(Intercept)", "Sepal_Length", "Species__versicolor", "Species__virginica")))
 })
 
 test_that("summary coefficients match with native glm of family 'binomial'", {

diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
@@ -695,13 +695,6 @@ test_that("select with column", {
   expect_equal(columns(df3), c("x"))
   expect_equal(count(df3), 3)
   expect_equal(collect(select(df3, "x"))[[1, 1]], "x")
-
-  df4 <- select(df, c("name", "age"))
-  expect_equal(columns(df4), c("name", "age"))
-  expect_equal(count(df4), 3)
-
-  expect_error(select(df, c("name", "age"), "name"),
-                "To select multiple columns, use a character vector or list for col")
 })
 
 test_that("subsetting", {
@@ -1467,8 +1460,9 @@ test_that("SQL error message is returned from JVM", {
   expect_equal(grepl("Table not found: blah", retError), TRUE)
 })
 
+irisDF <- createDataFrame(sqlContext, iris)
+
 test_that("Method as.data.frame as a synonym for collect()", {
-  irisDF <- createDataFrame(sqlContext, iris)
   expect_equal(as.data.frame(irisDF), collect(irisDF))
   irisDF2 <- irisDF[irisDF$Species == "setosa", ]
   expect_equal(as.data.frame(irisDF2), collect(irisDF2))
@@ -1503,6 +1497,27 @@ test_that("with() on a DataFrame", {
   expect_equal(nrow(sum2), 35)
 })
 
+test_that("Method coltypes() to get R's data types of a DataFrame", {
+  expect_equal(coltypes(irisDF), c(rep("numeric", 4), "character"))
+
+  data <- data.frame(c1 = c(1, 2, 3),
+                     c2 = c(TRUE, FALSE, TRUE),
+                     c3 = c("2015/01/01 10:00:00", "2015/01/02 10:00:00", "2015/01/03 10:00:00"))
+
+  schema <- structType(structField("c1", "byte"),
+                       structField("c2", "boolean"),
+                       structField("c3", "timestamp"))
+
+  # Test primitive types
+  DF <- createDataFrame(sqlContext, data, schema)
+  expect_equal(coltypes(DF), c("integer", "logical", "POSIXct"))
+
+  # Test complex types
+  x <- createDataFrame(sqlContext, list(list(as.environment(
+    list("a" = "b", "c" = "d", "e" = "f")))))
+  expect_equal(coltypes(x), "map<string,string>")
+})
+
 unlink(parquetPath)
 unlink(jsonPath)
 unlink(jsonPathNa)
diff --git a/README.md b/README.md
@@ -59,7 +59,7 @@ will run the Pi example locally.
 
 You can set the MASTER environment variable when running examples to submit
 examples to a cluster. This can be a mesos:// or spark:// URL,
-"yarn" to run on YARN, and "local" to run
+"yarn-cluster" or "yarn-client" to run on YARN, and "local" to run
 locally with one thread, or "local[N]" to run locally with N threads. You
 can also use an abbreviated class name if the class is in the `examples`
 package. For instance:

diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java b/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java
@@ -192,7 +192,7 @@ private long[] writePartitionedFile(File outputFile) throws IOException {
         } finally {
           Closeables.close(in, copyThrewException);
         }
-        if (!partitionWriters[i].fileSegment().file().delete()) {
+        if (!blockManager.diskBlockManager().getFile(partitionWriters[i].blockId()).delete()) {
           logger.error("Unable to delete file for partition {}", i);
         }
       }

diff --git a/core/src/main/java/org/apache/spark/util/collection/TimSort.java b/core/src/main/java/org/apache/spark/util/collection/TimSort.java
@@ -15,24 +15,6 @@
  * limitations under the License.
  */
 
-/*
- * Based on TimSort.java from the Android Open Source Project
- *
- *  Copyright (C) 2008 The Android Open Source Project
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *       http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
 package org.apache.spark.util.collection;
 
 import java.util.Comparator;

diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java
@@ -21,7 +21,6 @@
 
 import org.apache.spark.annotation.Private;
 import org.apache.spark.unsafe.Platform;
-import org.apache.spark.unsafe.types.ByteArray;
 import org.apache.spark.unsafe.types.UTF8String;
 import org.apache.spark.util.Utils;
 
@@ -63,7 +62,21 @@ public int compare(long aPrefix, long bPrefix) {
     }
 
     public static long computePrefix(byte[] bytes) {
-      return ByteArray.getPrefix(bytes);
+      if (bytes == null) {
+        return 0L;
+      } else {
+        /**
+         * TODO: If a wrapper for BinaryType is created (SPARK-8786),
+         * the code below will move into that wrapper class.
+         */
+        final int minLen = Math.min(bytes.length, 8);
+        long p = 0;
+        for (int i = 0; i < minLen; ++i) {
+          p |= (128L + Platform.getByte(bytes, Platform.BYTE_ARRAY_OFFSET + i))
+              << (56 - 8 * i);
+        }
+        return p;
+      }
     }
   }