From a6bc8d9708496d16395a05e506ae3a2a29b0ec08 Mon Sep 17 00:00:00 2001 From: Sun Rui Date: Tue, 17 Nov 2015 22:10:44 +0800 Subject: [PATCH 1/6] [SPARK-11781][SPARKR] SparkR has problem in inferring type of raw type. --- R/pkg/R/DataFrame.R | 25 +++++++++++++++---------- R/pkg/R/SQLContext.R | 2 +- R/pkg/inst/tests/test_sparkSQL.R | 6 ++++++ 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 8a13e7a36766..4c4edc527ec8 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -700,25 +700,30 @@ setMethod("collect", # data of complex type can be held. But getting a cell from a column # of list type returns a list instead of a vector. So for columns of # non-complex type, append them as vector. + # + # For columns of complex type, be careful to access them. + # Get a column of complex type returns a list. + # Get a cell from a column of complex type returns a list instead of a vector. col <- listCols[[colIndex]] if (length(col) <= 0) { df[[names[colIndex]]] <- col } else { # TODO: more robust check on column of primitive types - vec <- do.call(c, col) - if (class(vec) != "list") { - df[[names[colIndex]]] <- vec + if (!any(sapply(col, function(e) { length(e) > 1 }))) { + vec <- do.call(c, col) + if (class(vec) != "list") { + df[[names[colIndex]]] <- vec + } else { + df[[names[colIndex]]] <- col + } } else { - # For columns of complex type, be careful to access them. - # Get a column of complex type returns a list. - # Get a cell from a column of complex type returns a list instead of a vector. df[[names[colIndex]]] <- col - } + } + } } + df } - df - } - }) + }) #' Limit #' diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index a62b25fde926..85541c8e2244 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -63,7 +63,7 @@ infer_type <- function(x) { }) type <- Reduce(paste0, type) type <- paste0("struct<", substr(type, 1, nchar(type) - 1), ">") - } else if (length(x) > 1) { + } else if (length(x) > 1 && type != "binary") { paste0("array<", infer_type(x[[1]]), ">") } else { type diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index 3f4f319fe745..07872ae03a55 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -72,6 +72,8 @@ test_that("infer types and check types", { expect_equal(infer_type(e), "map") expect_error(checkType("map"), "Key type in a map must be string or character") + + expect_equal(infer_type(as.raw(c(1, 2 ,3))), "binary") }) test_that("structType and structField", { @@ -250,6 +252,10 @@ test_that("create DataFrame from list or data.frame", { mtcarsdf <- createDataFrame(sqlContext, mtcars) expect_equivalent(collect(mtcarsdf), mtcars) + + bytes <- as.raw(c(1, 2, 3)) + df <- createDataFrame(sqlContext, list(list(bytes))) + expect_equal(collect(df)[[1]][[1]], bytes) }) test_that("create DataFrame with different data types", { From 7fd0d0938c35151c9665865b63ee238fab9d30a6 Mon Sep 17 00:00:00 2001 From: Sun Rui Date: Wed, 18 Nov 2015 10:43:31 +0800 Subject: [PATCH 2/6] Minor style fix. --- R/pkg/inst/tests/test_sparkSQL.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index 07872ae03a55..1808bbffc672 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -73,7 +73,7 @@ test_that("infer types and check types", { expect_error(checkType("map"), "Key type in a map must be string or character") - expect_equal(infer_type(as.raw(c(1, 2 ,3))), "binary") + expect_equal(infer_type(as.raw(c(1, 2, 3))), "binary") }) test_that("structType and structField", { From 8ed731e9b6023c89181ef5e209531a7724f80e58 Mon Sep 17 00:00:00 2001 From: Sun Rui Date: Wed, 18 Nov 2015 11:21:48 +0800 Subject: [PATCH 3/6] Fix R style. --- R/pkg/inst/tests/test_sparkSQL.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index 1808bbffc672..5bd146a19a4b 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -72,7 +72,7 @@ test_that("infer types and check types", { expect_equal(infer_type(e), "map") expect_error(checkType("map"), "Key type in a map must be string or character") - + expect_equal(infer_type(as.raw(c(1, 2, 3))), "binary") }) From 2af83b7787ad7c3bb867131c9f28be3f25198277 Mon Sep 17 00:00:00 2001 From: Sun Rui Date: Mon, 23 Nov 2015 20:09:17 +0800 Subject: [PATCH 4/6] Use schema to determine if a collected column can be coerced into an atomic vector. --- R/pkg/R/DataFrame.R | 20 +++++++++----------- R/pkg/R/types.R | 37 +++++++++++++++++++++---------------- 2 files changed, 30 insertions(+), 27 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 4c4edc527ec8..57789dc04528 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -676,8 +676,8 @@ setMethod("dim", setMethod("collect", signature(x = "DataFrame"), function(x, stringsAsFactors = FALSE) { - names <- columns(x) - ncol <- length(names) + dtypes <- dtypes(x) + ncol <- length(dtypes) if (ncol <= 0) { # empty data.frame with 0 columns and 0 rows data.frame() @@ -705,19 +705,17 @@ setMethod("collect", # Get a column of complex type returns a list. # Get a cell from a column of complex type returns a list instead of a vector. col <- listCols[[colIndex]] + colName <- dtypes[[colIndex]][[1]] if (length(col) <= 0) { - df[[names[colIndex]]] <- col + df[[colName]] <- col } else { - # TODO: more robust check on column of primitive types - if (!any(sapply(col, function(e) { length(e) > 1 }))) { + colType <- dtypes[[colIndex]][[2]] + if (!is.null(PRIMITIVE_TYPES[[colType]]) && colType != "binary") { vec <- do.call(c, col) - if (class(vec) != "list") { - df[[names[colIndex]]] <- vec - } else { - df[[names[colIndex]]] <- col - } + stopifnot (class(vec) != "list") + df[[colName]] <- vec } else { - df[[names[colIndex]]] <- col + df[[colName]] <- col } } } diff --git a/R/pkg/R/types.R b/R/pkg/R/types.R index 1828c23ab0f6..677b9d8e6bd9 100644 --- a/R/pkg/R/types.R +++ b/R/pkg/R/types.R @@ -19,25 +19,30 @@ # values are equivalent R types. This is stored in an environment to allow for # more efficient look up (environments use hashmaps). PRIMITIVE_TYPES <- as.environment(list( - "byte"="integer", - "tinyint"="integer", - "smallint"="integer", - "integer"="integer", - "bigint"="numeric", - "float"="numeric", - "double"="numeric", - "decimal"="numeric", - "string"="character", - "binary"="raw", - "boolean"="logical", - "timestamp"="POSIXct", - "date"="Date")) + "tinyint" = "integer", + "smallint" = "integer", + "int" = "integer", + "bigint" = "numeric", + "float" = "numeric", + "double" = "numeric", + "decimal" = "numeric", + "string" = "character", + "binary" = "raw", + "boolean" = "logical", + "timestamp" = "POSIXct", + "date" = "Date", + # following types are not SQL types returned by dtypes(). They are listed here for usage + # by checkType() in schema.R. + # TODO: refactor checkType() in schema.R. + "byte" = "integer", + "integer" = "integer" + )) # The complex data types. These do not have any direct mapping to R's types. COMPLEX_TYPES <- list( - "map"=NA, - "array"=NA, - "struct"=NA) + "map" = NA, + "array" = NA, + "struct" = NA) # The full list of data types. DATA_TYPES <- as.environment(c(as.list(PRIMITIVE_TYPES), COMPLEX_TYPES)) From f073c3aede9fd258d2354db04d5eae7c14e40c25 Mon Sep 17 00:00:00 2001 From: Sun Rui Date: Fri, 27 Nov 2015 09:44:13 +0800 Subject: [PATCH 5/6] Fix R style. --- R/pkg/R/DataFrame.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 57789dc04528..bada4812de32 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -712,7 +712,7 @@ setMethod("collect", colType <- dtypes[[colIndex]][[2]] if (!is.null(PRIMITIVE_TYPES[[colType]]) && colType != "binary") { vec <- do.call(c, col) - stopifnot (class(vec) != "list") + stopifnot(class(vec) != "list") df[[colName]] <- vec } else { df[[colName]] <- col From 0b1af635a88a3ebe10c5f5ba0b007dfe936ab51b Mon Sep 17 00:00:00 2001 From: Sun Rui Date: Sun, 29 Nov 2015 15:39:34 +0800 Subject: [PATCH 6/6] Add a comment. --- R/pkg/R/DataFrame.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index bada4812de32..4cb0bd27d0f4 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -710,6 +710,7 @@ setMethod("collect", df[[colName]] <- col } else { colType <- dtypes[[colIndex]][[2]] + # Note that "binary" columns behave like complex types. if (!is.null(PRIMITIVE_TYPES[[colType]]) && colType != "binary") { vec <- do.call(c, col) stopifnot(class(vec) != "list")