Skip to content

Commit 08a74e5

Browse files
author
Marcelo Vanzin
committed
Merge branch 'master' into SPARK-11563
2 parents 09d03e7 + db51652 commit 08a74e5

File tree

558 files changed

+14722
-5320
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

558 files changed

+14722
-5320
lines changed

.rat-excludes

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,4 +82,5 @@ INDEX
8282
gen-java.*
8383
.*avpr
8484
org.apache.spark.sql.sources.DataSourceRegister
85+
org.apache.spark.scheduler.SparkHistoryListenerFactory
8586
.*parquet

LICENSE

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
21
Apache License
32
Version 2.0, January 2004
43
http://www.apache.org/licenses/
@@ -237,7 +236,7 @@ The following components are provided under a BSD-style license. See project lin
237236
The text of each license is also included at licenses/LICENSE-[project].txt.
238237

239238
(BSD 3 Clause) netlib core (com.github.fommil.netlib:core:1.1.2 - https://github.com/fommil/netlib-java/core)
240-
(BSD 3 Clause) JPMML-Model (org.jpmml:pmml-model:1.1.15 - https://github.com/jpmml/jpmml-model)
239+
(BSD 3 Clause) JPMML-Model (org.jpmml:pmml-model:1.2.7 - https://github.com/jpmml/jpmml-model)
241240
(BSD 3-clause style license) jblas (org.jblas:jblas:1.2.4 - http://jblas.org/)
242241
(BSD License) AntLR Parser Generator (antlr:antlr:2.7.7 - http://www.antlr.org/)
243242
(BSD licence) ANTLR ST4 4.0.4 (org.antlr:ST4:4.0.4 - http://www.stringtemplate.org)

R/pkg/NAMESPACE

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,10 @@ exportMethods("arrange",
2727
"attach",
2828
"cache",
2929
"collect",
30+
"colnames",
31+
"colnames<-",
3032
"coltypes",
33+
"coltypes<-",
3134
"columns",
3235
"count",
3336
"cov",
@@ -56,6 +59,7 @@ exportMethods("arrange",
5659
"mutate",
5760
"na.omit",
5861
"names",
62+
"names<-",
5963
"ncol",
6064
"nrow",
6165
"orderBy",
@@ -123,15 +127,17 @@ exportMethods("%in%",
123127
"count",
124128
"countDistinct",
125129
"crc32",
126-
"cumeDist",
130+
"cume_dist",
127131
"date_add",
128132
"date_format",
129133
"date_sub",
130134
"datediff",
131135
"dayofmonth",
132136
"dayofyear",
133-
"denseRank",
137+
"decode",
138+
"dense_rank",
134139
"desc",
140+
"encode",
135141
"endsWith",
136142
"exp",
137143
"explode",
@@ -188,7 +194,7 @@ exportMethods("%in%",
188194
"next_day",
189195
"ntile",
190196
"otherwise",
191-
"percentRank",
197+
"percent_rank",
192198
"pmod",
193199
"quarter",
194200
"rand",
@@ -200,7 +206,7 @@ exportMethods("%in%",
200206
"rint",
201207
"rlike",
202208
"round",
203-
"rowNumber",
209+
"row_number",
204210
"rpad",
205211
"rtrim",
206212
"second",
@@ -221,6 +227,7 @@ exportMethods("%in%",
221227
"stddev",
222228
"stddev_pop",
223229
"stddev_samp",
230+
"struct",
224231
"sqrt",
225232
"startsWith",
226233
"substr",
@@ -276,4 +283,4 @@ export("structField",
276283
"structType",
277284
"structType.jobj",
278285
"structType.structField",
279-
"print.structType")
286+
"print.structType")

R/pkg/R/DataFrame.R

Lines changed: 136 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,7 @@ setMethod("dtypes",
254254
#' @family DataFrame functions
255255
#' @rdname columns
256256
#' @name columns
257+
257258
#' @export
258259
#' @examples
259260
#'\dontrun{
@@ -262,6 +263,7 @@ setMethod("dtypes",
262263
#' path <- "path/to/file.json"
263264
#' df <- jsonFile(sqlContext, path)
264265
#' columns(df)
266+
#' colnames(df)
265267
#'}
266268
setMethod("columns",
267269
signature(x = "DataFrame"),
@@ -290,6 +292,121 @@ setMethod("names<-",
290292
}
291293
})
292294

295+
#' @rdname columns
296+
#' @name colnames
297+
setMethod("colnames",
298+
signature(x = "DataFrame"),
299+
function(x) {
300+
columns(x)
301+
})
302+
303+
#' @rdname columns
304+
#' @name colnames<-
305+
setMethod("colnames<-",
306+
signature(x = "DataFrame", value = "character"),
307+
function(x, value) {
308+
sdf <- callJMethod(x@sdf, "toDF", as.list(value))
309+
dataFrame(sdf)
310+
})
311+
312+
#' coltypes
313+
#'
314+
#' Get column types of a DataFrame
315+
#'
316+
#' @param x A SparkSQL DataFrame
317+
#' @return value A character vector with the column types of the given DataFrame
318+
#' @rdname coltypes
319+
#' @name coltypes
320+
#' @family DataFrame functions
321+
#' @export
322+
#' @examples
323+
#'\dontrun{
324+
#' irisDF <- createDataFrame(sqlContext, iris)
325+
#' coltypes(irisDF)
326+
#'}
327+
setMethod("coltypes",
328+
signature(x = "DataFrame"),
329+
function(x) {
330+
# Get the data types of the DataFrame by invoking dtypes() function
331+
types <- sapply(dtypes(x), function(x) {x[[2]]})
332+
333+
# Map Spark data types into R's data types using DATA_TYPES environment
334+
rTypes <- sapply(types, USE.NAMES=F, FUN=function(x) {
335+
# Check for primitive types
336+
type <- PRIMITIVE_TYPES[[x]]
337+
338+
if (is.null(type)) {
339+
# Check for complex types
340+
for (t in names(COMPLEX_TYPES)) {
341+
if (substring(x, 1, nchar(t)) == t) {
342+
type <- COMPLEX_TYPES[[t]]
343+
break
344+
}
345+
}
346+
347+
if (is.null(type)) {
348+
stop(paste("Unsupported data type: ", x))
349+
}
350+
}
351+
type
352+
})
353+
354+
# Find which types don't have mapping to R
355+
naIndices <- which(is.na(rTypes))
356+
357+
# Assign the original scala data types to the unmatched ones
358+
rTypes[naIndices] <- types[naIndices]
359+
360+
rTypes
361+
})
362+
363+
#' coltypes
364+
#'
365+
#' Set the column types of a DataFrame.
366+
#'
367+
#' @param x A SparkSQL DataFrame
368+
#' @param value A character vector with the target column types for the given
369+
#' DataFrame. Column types can be one of integer, numeric/double, character, logical, or NA
370+
#' to keep that column as-is.
371+
#' @rdname coltypes
372+
#' @name coltypes<-
373+
#' @export
374+
#' @examples
375+
#'\dontrun{
376+
#' sc <- sparkR.init()
377+
#' sqlContext <- sparkRSQL.init(sc)
378+
#' path <- "path/to/file.json"
379+
#' df <- jsonFile(sqlContext, path)
380+
#' coltypes(df) <- c("character", "integer")
381+
#' coltypes(df) <- c(NA, "numeric")
382+
#'}
383+
setMethod("coltypes<-",
384+
signature(x = "DataFrame", value = "character"),
385+
function(x, value) {
386+
cols <- columns(x)
387+
ncols <- length(cols)
388+
if (length(value) == 0) {
389+
stop("Cannot set types of an empty DataFrame with no Column")
390+
}
391+
if (length(value) != ncols) {
392+
stop("Length of type vector should match the number of columns for DataFrame")
393+
}
394+
newCols <- lapply(seq_len(ncols), function(i) {
395+
col <- getColumn(x, cols[i])
396+
if (!is.na(value[i])) {
397+
stype <- rToSQLTypes[[value[i]]]
398+
if (is.null(stype)) {
399+
stop("Only atomic type is supported for column types")
400+
}
401+
cast(col, stype)
402+
} else {
403+
col
404+
}
405+
})
406+
nx <- select(x, newCols)
407+
dataFrame(nx@sdf)
408+
})
409+
293410
#' Register Temporary Table
294411
#'
295412
#' Registers a DataFrame as a Temporary Table in the SQLContext
@@ -676,8 +793,8 @@ setMethod("dim",
676793
setMethod("collect",
677794
signature(x = "DataFrame"),
678795
function(x, stringsAsFactors = FALSE) {
679-
names <- columns(x)
680-
ncol <- length(names)
796+
dtypes <- dtypes(x)
797+
ncol <- length(dtypes)
681798
if (ncol <= 0) {
682799
# empty data.frame with 0 columns and 0 rows
683800
data.frame()
@@ -700,25 +817,29 @@ setMethod("collect",
700817
# data of complex type can be held. But getting a cell from a column
701818
# of list type returns a list instead of a vector. So for columns of
702819
# non-complex type, append them as vector.
820+
#
821+
# For columns of complex type, be careful to access them.
822+
# Get a column of complex type returns a list.
823+
# Get a cell from a column of complex type returns a list instead of a vector.
703824
col <- listCols[[colIndex]]
704825
if (length(col) <= 0) {
705-
df[[names[colIndex]]] <- col
826+
df[[colIndex]] <- col
706827
} else {
707-
# TODO: more robust check on column of primitive types
708-
vec <- do.call(c, col)
709-
if (class(vec) != "list") {
710-
df[[names[colIndex]]] <- vec
828+
colType <- dtypes[[colIndex]][[2]]
829+
# Note that "binary" columns behave like complex types.
830+
if (!is.null(PRIMITIVE_TYPES[[colType]]) && colType != "binary") {
831+
vec <- do.call(c, col)
832+
stopifnot(class(vec) != "list")
833+
df[[colIndex]] <- vec
711834
} else {
712-
# For columns of complex type, be careful to access them.
713-
# Get a column of complex type returns a list.
714-
# Get a cell from a column of complex type returns a list instead of a vector.
715-
df[[names[colIndex]]] <- col
716-
}
835+
df[[colIndex]] <- col
836+
}
837+
}
717838
}
839+
names(df) <- names(x)
840+
df
718841
}
719-
df
720-
}
721-
})
842+
})
722843

723844
#' Limit
724845
#'
@@ -2102,52 +2223,3 @@ setMethod("with",
21022223
newEnv <- assignNewEnv(data)
21032224
eval(substitute(expr), envir = newEnv, enclos = newEnv)
21042225
})
2105-
2106-
#' Returns the column types of a DataFrame.
2107-
#'
2108-
#' @name coltypes
2109-
#' @title Get column types of a DataFrame
2110-
#' @family dataframe_funcs
2111-
#' @param x (DataFrame)
2112-
#' @return value (character) A character vector with the column types of the given DataFrame
2113-
#' @rdname coltypes
2114-
#' @examples \dontrun{
2115-
#' irisDF <- createDataFrame(sqlContext, iris)
2116-
#' coltypes(irisDF)
2117-
#' }
2118-
setMethod("coltypes",
2119-
signature(x = "DataFrame"),
2120-
function(x) {
2121-
# Get the data types of the DataFrame by invoking dtypes() function
2122-
types <- sapply(dtypes(x), function(x) {x[[2]]})
2123-
2124-
# Map Spark data types into R's data types using DATA_TYPES environment
2125-
rTypes <- sapply(types, USE.NAMES=F, FUN=function(x) {
2126-
2127-
# Check for primitive types
2128-
type <- PRIMITIVE_TYPES[[x]]
2129-
2130-
if (is.null(type)) {
2131-
# Check for complex types
2132-
for (t in names(COMPLEX_TYPES)) {
2133-
if (substring(x, 1, nchar(t)) == t) {
2134-
type <- COMPLEX_TYPES[[t]]
2135-
break
2136-
}
2137-
}
2138-
2139-
if (is.null(type)) {
2140-
stop(paste("Unsupported data type: ", x))
2141-
}
2142-
}
2143-
type
2144-
})
2145-
2146-
# Find which types don't have mapping to R
2147-
naIndices <- which(is.na(rTypes))
2148-
2149-
# Assign the original scala data types to the unmatched ones
2150-
rTypes[naIndices] <- types[naIndices]
2151-
2152-
rTypes
2153-
})

0 commit comments

Comments
 (0)