From bfc47e6beafb9517fac1d6c66cb1fd4354c1f4e6 Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Mon, 9 Nov 2015 23:12:20 +0000 Subject: [PATCH 1/5] All changes from PR 8904 --- R/pkg/DESCRIPTION | 1 + R/pkg/NAMESPACE | 6 ++--- R/pkg/R/DataFrame.R | 44 ++++++++++++++++++++++++++++++++ R/pkg/R/generics.R | 4 +++ R/pkg/R/schema.R | 15 +---------- R/pkg/R/types.R | 43 +++++++++++++++++++++++++++++++ R/pkg/inst/tests/test_sparkSQL.R | 18 ++++++++++++- 7 files changed, 113 insertions(+), 18 deletions(-) create mode 100644 R/pkg/R/types.R diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 3d6edb70ec98..369714f7b99c 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -34,4 +34,5 @@ Collate: 'serialize.R' 'sparkR.R' 'stats.R' + 'types.R' 'utils.R' diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 56b8ed0bf271..52fd6c9f76c5 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -23,9 +23,11 @@ export("setJobGroup", exportClasses("DataFrame") exportMethods("arrange", + "as.data.frame", "attach", "cache", "collect", + "coltypes", "columns", "count", "cov", @@ -262,6 +264,4 @@ export("structField", "structType", "structType.jobj", "structType.structField", - "print.structType") - -export("as.data.frame") + "print.structType") \ No newline at end of file diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 44ce9414da5c..0977a1597213 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2152,3 +2152,47 @@ setMethod("with", newEnv <- assignNewEnv(data) eval(substitute(expr), envir = newEnv, enclos = newEnv) }) + +#' Returns the column types of a DataFrame. +#' +#' @name coltypes +#' @title Get column types of a DataFrame +#' @param x (DataFrame) +#' @return value (character) A character vector with the column types of the given DataFrame +#' @rdname coltypes +setMethod("coltypes", + signature(x = "DataFrame"), + function(x) { + # Get the data types of the DataFrame by invoking dtypes() function + types <- sapply(dtypes(x), function(x) {x[[2]]}) + + # Map Spark data types into R's data types using DATA_TYPES environment + rTypes <- sapply(types, USE.NAMES=F, FUN=function(x) { + + # Check for primitive types + type <- PRIMITIVE_TYPES[[x]] + + if (is.null(type)) { + # Check for complex types + for (t in names(COMPLEX_TYPES)) { + if (substring(x, 1, nchar(t)) == t) { + type <- COMPLEX_TYPES[[t]] + break + } + } + + if (is.null(type)) { + stop(paste("Unsupported data type: ", x)) + } + } + type + }) + + # Find which types don't have mapping to R + naIndices <- which(is.na(rTypes)) + + # Assign the original scala data types to the unmatched ones + rTypes[naIndices] <- types[naIndices] + + rTypes + }) \ No newline at end of file diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 083d37fee28a..cdfdafd1b3f4 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -1047,3 +1047,7 @@ setGeneric("attach") #' @rdname with #' @export setGeneric("with") + +#' @rdname coltypes +#' @export +setGeneric("coltypes", function(x) { standardGeneric("coltypes") }) \ No newline at end of file diff --git a/R/pkg/R/schema.R b/R/pkg/R/schema.R index 6f0e9a94e9bf..12093da1baa1 100644 --- a/R/pkg/R/schema.R +++ b/R/pkg/R/schema.R @@ -115,20 +115,7 @@ structField.jobj <- function(x) { } checkType <- function(type) { - primtiveTypes <- c("byte", - "integer", - "float", - "double", - "numeric", - "character", - "string", - "binary", - "raw", - "logical", - "boolean", - "timestamp", - "date") - if (type %in% primtiveTypes) { + if (type %in% names(PRIMITIVE_TYPES)) { return() } else { # Check complex types diff --git a/R/pkg/R/types.R b/R/pkg/R/types.R new file mode 100644 index 000000000000..1828c23ab0f6 --- /dev/null +++ b/R/pkg/R/types.R @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# types.R. This file handles the data type mapping between Spark and R + +# The primitive data types, where names(PRIMITIVE_TYPES) are Scala types whereas +# values are equivalent R types. This is stored in an environment to allow for +# more efficient look up (environments use hashmaps). +PRIMITIVE_TYPES <- as.environment(list( + "byte"="integer", + "tinyint"="integer", + "smallint"="integer", + "integer"="integer", + "bigint"="numeric", + "float"="numeric", + "double"="numeric", + "decimal"="numeric", + "string"="character", + "binary"="raw", + "boolean"="logical", + "timestamp"="POSIXct", + "date"="Date")) + +# The complex data types. These do not have any direct mapping to R's types. +COMPLEX_TYPES <- list( + "map"=NA, + "array"=NA, + "struct"=NA) + +# The full list of data types. +DATA_TYPES <- as.environment(c(as.list(PRIMITIVE_TYPES), COMPLEX_TYPES)) diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index fbdb9a8f1ef6..77d09e18253a 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -1467,8 +1467,9 @@ test_that("SQL error message is returned from JVM", { expect_equal(grepl("Table not found: blah", retError), TRUE) }) +irisDF <- createDataFrame(sqlContext, iris) + test_that("Method as.data.frame as a synonym for collect()", { - irisDF <- createDataFrame(sqlContext, iris) expect_equal(as.data.frame(irisDF), collect(irisDF)) irisDF2 <- irisDF[irisDF$Species == "setosa", ] expect_equal(as.data.frame(irisDF2), collect(irisDF2)) @@ -1503,6 +1504,21 @@ test_that("with() on a DataFrame", { expect_equal(nrow(sum2), 35) }) +test_that("Method coltypes() to get R's data types of a DataFrame", { + expect_equal(coltypes(irisDF), c(rep("numeric", 4), "character")) + + data <- data.frame(c1=c(1,2,3), + c2=c(T,F,T), + c3=c("2015/01/01 10:00:00", "2015/01/02 10:00:00", "2015/01/03 10:00:00")) + + schema <- structType(structField("c1", "byte"), + structField("c3", "boolean"), + structField("c4", "timestamp")) + + DF <- createDataFrame(sqlContext, data, schema) + expect_equal(coltypes(DF), c("integer", "logical", "timestamp")) +}) + unlink(parquetPath) unlink(jsonPath) unlink(jsonPathNa) From 635bbe77160780223bbd14e99dfaabc2f09f4a07 Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Tue, 10 Nov 2015 00:46:33 +0000 Subject: [PATCH 2/5] Added tests --- R/pkg/inst/tests/test_sparkSQL.R | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index 77d09e18253a..2fe59cb7919e 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -1515,8 +1515,14 @@ test_that("Method coltypes() to get R's data types of a DataFrame", { structField("c3", "boolean"), structField("c4", "timestamp")) + # Test primitive types DF <- createDataFrame(sqlContext, data, schema) - expect_equal(coltypes(DF), c("integer", "logical", "timestamp")) + expect_equal(coltypes(DF), c("integer", "logical", "POSIXct")) + + # Test complex types + x <- createDataFrame(sqlContext, list(list(as.environment( + list("a"="b", "c"="d", "e"="f"))))) + expect_equal(coltypes(x), "map") }) unlink(parquetPath) From 0eabaf89310d8f08a4094511ebf9a1cc4893ad34 Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Tue, 10 Nov 2015 01:00:07 +0000 Subject: [PATCH 3/5] Removed white space --- R/pkg/R/DataFrame.R | 14 +++++++------- R/pkg/inst/tests/test_sparkSQL.R | 8 ++++---- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 0977a1597213..c3dec22a90e4 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2165,13 +2165,13 @@ setMethod("coltypes", function(x) { # Get the data types of the DataFrame by invoking dtypes() function types <- sapply(dtypes(x), function(x) {x[[2]]}) - + # Map Spark data types into R's data types using DATA_TYPES environment rTypes <- sapply(types, USE.NAMES=F, FUN=function(x) { - + # Check for primitive types type <- PRIMITIVE_TYPES[[x]] - + if (is.null(type)) { # Check for complex types for (t in names(COMPLEX_TYPES)) { @@ -2180,19 +2180,19 @@ setMethod("coltypes", break } } - + if (is.null(type)) { stop(paste("Unsupported data type: ", x)) } } type }) - + # Find which types don't have mapping to R naIndices <- which(is.na(rTypes)) - + # Assign the original scala data types to the unmatched ones rTypes[naIndices] <- types[naIndices] - + rTypes }) \ No newline at end of file diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index 2fe59cb7919e..06f52d021cff 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -1506,19 +1506,19 @@ test_that("with() on a DataFrame", { test_that("Method coltypes() to get R's data types of a DataFrame", { expect_equal(coltypes(irisDF), c(rep("numeric", 4), "character")) - + data <- data.frame(c1=c(1,2,3), c2=c(T,F,T), c3=c("2015/01/01 10:00:00", "2015/01/02 10:00:00", "2015/01/03 10:00:00")) - + schema <- structType(structField("c1", "byte"), structField("c3", "boolean"), structField("c4", "timestamp")) - + # Test primitive types DF <- createDataFrame(sqlContext, data, schema) expect_equal(coltypes(DF), c("integer", "logical", "POSIXct")) - + # Test complex types x <- createDataFrame(sqlContext, list(list(as.environment( list("a"="b", "c"="d", "e"="f"))))) From a031aed7ee65a29137f09a61e7cc13fafdcaee10 Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Tue, 10 Nov 2015 01:44:29 +0000 Subject: [PATCH 4/5] Fixed lookup in schema.R --- R/pkg/R/schema.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/schema.R b/R/pkg/R/schema.R index 12093da1baa1..c6ddb562270b 100644 --- a/R/pkg/R/schema.R +++ b/R/pkg/R/schema.R @@ -115,7 +115,7 @@ structField.jobj <- function(x) { } checkType <- function(type) { - if (type %in% names(PRIMITIVE_TYPES)) { + if (!is.null(PRIMITIVE_TYPES[[type]])) { return() } else { # Check complex types From 01366a6a83ca26da6d6e1233306cf6148d5d051f Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Tue, 10 Nov 2015 06:46:48 +0000 Subject: [PATCH 5/5] Docs improvements for coltypes() --- R/pkg/R/DataFrame.R | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index c3dec22a90e4..0b7bcfc3321b 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2157,9 +2157,14 @@ setMethod("with", #' #' @name coltypes #' @title Get column types of a DataFrame +#' @family dataframe_funcs #' @param x (DataFrame) #' @return value (character) A character vector with the column types of the given DataFrame #' @rdname coltypes +#' @examples \dontrun{ +#' irisDF <- createDataFrame(sqlContext, iris) +#' coltypes(irisDF) +#' } setMethod("coltypes", signature(x = "DataFrame"), function(x) {