Commit d2405cb

Oscar D. Lara Yejas authored and shivaram committed
[SPARK-10863][SPARKR] Method coltypes() (New version)
This is a follow up on PR #8984, as the corresponding branch for such PR was damaged.

Author: Oscar D. Lara Yejas <[email protected]>

Closes #9579 from olarayej/SPARK-10863_NEW14.

(cherry picked from commit 47735cd)
Signed-off-by: Shivaram Venkataraman <[email protected]>
1 parent 7c4ade0 commit d2405cb

7 files changed: +124 −18 lines changed


R/pkg/DESCRIPTION

Lines changed: 1 addition & 0 deletions
@@ -34,4 +34,5 @@ Collate:
     'serialize.R'
     'sparkR.R'
     'stats.R'
+    'types.R'
     'utils.R'

R/pkg/NAMESPACE

Lines changed: 3 additions & 3 deletions
@@ -23,9 +23,11 @@ export("setJobGroup",
 exportClasses("DataFrame")
 
 exportMethods("arrange",
+              "as.data.frame",
               "attach",
               "cache",
               "collect",
+              "coltypes",
               "columns",
               "count",
               "cov",
@@ -262,6 +264,4 @@ export("structField",
        "structType",
        "structType.jobj",
        "structType.structField",
-       "print.structType")
-
-export("as.data.frame")
+       "print.structType")

R/pkg/R/DataFrame.R

Lines changed: 49 additions & 0 deletions
@@ -2152,3 +2152,52 @@ setMethod("with",
             newEnv <- assignNewEnv(data)
             eval(substitute(expr), envir = newEnv, enclos = newEnv)
           })
+
+#' Returns the column types of a DataFrame.
+#'
+#' @name coltypes
+#' @title Get column types of a DataFrame
+#' @family dataframe_funcs
+#' @param x (DataFrame)
+#' @return value (character) A character vector with the column types of the given DataFrame
+#' @rdname coltypes
+#' @examples \dontrun{
+#' irisDF <- createDataFrame(sqlContext, iris)
+#' coltypes(irisDF)
+#' }
+setMethod("coltypes",
+          signature(x = "DataFrame"),
+          function(x) {
+            # Get the data types of the DataFrame by invoking dtypes() function
+            types <- sapply(dtypes(x), function(x) {x[[2]]})
+
+            # Map Spark data types into R's data types using DATA_TYPES environment
+            rTypes <- sapply(types, USE.NAMES=F, FUN=function(x) {
+
+              # Check for primitive types
+              type <- PRIMITIVE_TYPES[[x]]
+
+              if (is.null(type)) {
+                # Check for complex types
+                for (t in names(COMPLEX_TYPES)) {
+                  if (substring(x, 1, nchar(t)) == t) {
+                    type <- COMPLEX_TYPES[[t]]
+                    break
+                  }
+                }
+
+                if (is.null(type)) {
+                  stop(paste("Unsupported data type: ", x))
+                }
+              }
+              type
+            })
+
+            # Find which types don't have mapping to R
+            naIndices <- which(is.na(rTypes))
+
+            # Assign the original scala data types to the unmatched ones
+            rTypes[naIndices] <- types[naIndices]
+
+            rTypes
+          })

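For context, here is a minimal usage sketch of the new method, mirroring the roxygen example added above. It assumes a local SparkR session started the way this Spark version expects (sparkR.init() and sparkRSQL.init()); the printed vector matches the expectation in the new test.

library(SparkR)

sc <- sparkR.init(master = "local")
sqlContext <- sparkRSQL.init(sc)

# Each Spark column type is mapped to an R type; a Spark type with no
# R equivalent is reported under its original Scala name instead.
irisDF <- createDataFrame(sqlContext, iris)
coltypes(irisDF)
# c("numeric", "numeric", "numeric", "numeric", "character")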
R/pkg/R/generics.R

Lines changed: 4 additions & 0 deletions
@@ -1047,3 +1047,7 @@ setGeneric("attach")
 #' @rdname with
 #' @export
 setGeneric("with")
+
+#' @rdname coltypes
+#' @export
+setGeneric("coltypes", function(x) { standardGeneric("coltypes") })

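The split between generics.R and DataFrame.R follows the usual S4 pattern: the generic is declared once, and each class registers its own method. A toy sketch with a hypothetical class "Foo" (not part of the patch) illustrates the dispatch:

setGeneric("coltypes", function(x) { standardGeneric("coltypes") })

# Hypothetical class used only to show method dispatch
setClass("Foo", representation(types = "character"))
setMethod("coltypes", signature(x = "Foo"), function(x) { x@types })

coltypes(new("Foo", types = c("integer", "character")))
# [1] "integer"   "character"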
R/pkg/R/schema.R

Lines changed: 1 addition & 14 deletions
@@ -115,20 +115,7 @@ structField.jobj <- function(x) {
 }
 
 checkType <- function(type) {
-  primtiveTypes <- c("byte",
-                     "integer",
-                     "float",
-                     "double",
-                     "numeric",
-                     "character",
-                     "string",
-                     "binary",
-                     "raw",
-                     "logical",
-                     "boolean",
-                     "timestamp",
-                     "date")
-  if (type %in% primtiveTypes) {
+  if (!is.null(PRIMITIVE_TYPES[[type]])) {
     return()
   } else {
     # Check complex types

R/pkg/R/types.R

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# types.R. This file handles the data type mapping between Spark and R
+
+# The primitive data types, where names(PRIMITIVE_TYPES) are Scala types whereas
+# values are equivalent R types. This is stored in an environment to allow for
+# more efficient look up (environments use hashmaps).
+PRIMITIVE_TYPES <- as.environment(list(
+  "byte"="integer",
+  "tinyint"="integer",
+  "smallint"="integer",
+  "integer"="integer",
+  "bigint"="numeric",
+  "float"="numeric",
+  "double"="numeric",
+  "decimal"="numeric",
+  "string"="character",
+  "binary"="raw",
+  "boolean"="logical",
+  "timestamp"="POSIXct",
+  "date"="Date"))
+
+# The complex data types. These do not have any direct mapping to R's types.
+COMPLEX_TYPES <- list(
+  "map"=NA,
+  "array"=NA,
+  "struct"=NA)
+
+# The full list of data types.
+DATA_TYPES <- as.environment(c(as.list(PRIMITIVE_TYPES), COMPLEX_TYPES))

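To make the lookup strategy concrete, here is a standalone sketch (the helper name lookupRType is ours, not part of the patch) that reproduces the two-step resolution coltypes() performs against these tables, assuming types.R has been sourced: an exact environment lookup for primitive types, then a prefix match for complex types such as "map<string,string>".

lookupRType <- function(x) {
  # Exact lookup in the hashed environment of primitive types
  type <- PRIMITIVE_TYPES[[x]]
  if (is.null(type)) {
    # Complex types come back from dtypes() as e.g. "map<string,string>",
    # so only their prefix is compared
    for (t in names(COMPLEX_TYPES)) {
      if (substring(x, 1, nchar(t)) == t) {
        type <- COMPLEX_TYPES[[t]]
        break
      }
    }
    if (is.null(type)) {
      stop(paste("Unsupported data type: ", x))
    }
  }
  type
}

lookupRType("bigint")              # "numeric"
lookupRType("map<string,string>")  # NA: coltypes() then keeps the Scala name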
R/pkg/inst/tests/test_sparkSQL.R

Lines changed: 23 additions & 1 deletion
@@ -1467,8 +1467,9 @@ test_that("SQL error message is returned from JVM", {
   expect_equal(grepl("Table not found: blah", retError), TRUE)
 })
 
+irisDF <- createDataFrame(sqlContext, iris)
+
 test_that("Method as.data.frame as a synonym for collect()", {
-  irisDF <- createDataFrame(sqlContext, iris)
   expect_equal(as.data.frame(irisDF), collect(irisDF))
   irisDF2 <- irisDF[irisDF$Species == "setosa", ]
   expect_equal(as.data.frame(irisDF2), collect(irisDF2))
@@ -1503,6 +1504,27 @@ test_that("with() on a DataFrame", {
   expect_equal(nrow(sum2), 35)
 })
 
+test_that("Method coltypes() to get R's data types of a DataFrame", {
+  expect_equal(coltypes(irisDF), c(rep("numeric", 4), "character"))
+
+  data <- data.frame(c1=c(1,2,3),
+                     c2=c(T,F,T),
+                     c3=c("2015/01/01 10:00:00", "2015/01/02 10:00:00", "2015/01/03 10:00:00"))
+
+  schema <- structType(structField("c1", "byte"),
+                       structField("c3", "boolean"),
+                       structField("c4", "timestamp"))
+
+  # Test primitive types
+  DF <- createDataFrame(sqlContext, data, schema)
+  expect_equal(coltypes(DF), c("integer", "logical", "POSIXct"))
+
+  # Test complex types
+  x <- createDataFrame(sqlContext, list(list(as.environment(
+    list("a"="b", "c"="d", "e"="f")))))
+  expect_equal(coltypes(x), "map<string,string>")
+})
+
 unlink(parquetPath)
 unlink(jsonPath)
 unlink(jsonPathNa)
