Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
dab0565
Method str()
Nov 11, 2015
b5129cd
Fixed R style issues
Nov 11, 2015
d825d2c
Fixed R style issues
Nov 11, 2015
daa3d41
Fixed style. Added more specific tests
Nov 14, 2015
5b4f6b1
Replaced %++% by paste0
Nov 14, 2015
6d226e9
Removed white space
Nov 14, 2015
05bb4aa
Merge branch 'master' of https://github.com/apache/spark into SPARK-1…
Nov 16, 2015
b74288b
Style changes and added quotes to character fields
Nov 16, 2015
db96730
Fixed R style issues
Nov 16, 2015
992bf89
Merge branch 'master' of https://github.com/apache/spark into SPARK-1…
Nov 18, 2015
6bb5bd4
Merged str function
Nov 18, 2015
4135029
Update generics.R
Nov 18, 2015
a995f6e
Update generics.R
Nov 18, 2015
7f4adbc
Update generics.R
Nov 18, 2015
957b3c2
Update generics.R
Nov 19, 2015
1705432
Added tests for utils:::str
Nov 20, 2015
cfdfc01
Renamed dataFrame for localDF for clarity
Nov 23, 2015
5d7deb8
Merge branch 'master' of https://github.com/apache/spark into SPARK-1…
Nov 23, 2015
7b8a563
Merge branch 'master' of https://github.com/apache/spark into SPARK-1…
Dec 7, 2015
4b416cc
Merged str() code after master update
Dec 8, 2015
8140e20
Added max_char_per_row constant
Dec 8, 2015
6a7ff1b
Removed caching logic. Updated tests
Dec 17, 2015
cfb85e4
Updated str() tests
Dec 17, 2015
0ac7384
Fixed tests
Dec 17, 2015
a7141cc
Fixed tests
Dec 17, 2015
74c9651
Removed unnecessary initialization
Dec 18, 2015
1428925
Added back transform()
Dec 18, 2015
40a5202
Removed space
Dec 18, 2015
5bdf3f9
Removed space
Dec 18, 2015
38c21f3
Removed space
Dec 18, 2015
2701898
Removed duplicate with() declaration and re-ordered generics for Data…
Jan 12, 2016
2a8115d
Made spacing uniform for types.R
Jan 12, 2016
0ffcb4f
Changed rdname of transform for mutate
Jan 13, 2016
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions R/pkg/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,7 @@ export("as.DataFrame",
"parquetFile",
"read.df",
"sql",
"str",
"table",
"tableNames",
"tables",
Expand Down
73 changes: 73 additions & 0 deletions R/pkg/R/DataFrame.R
Original file line number Diff line number Diff line change
Expand Up @@ -2223,3 +2223,76 @@ setMethod("with",
newEnv <- assignNewEnv(data)
eval(substitute(expr), envir = newEnv, enclos = newEnv)
})

#' Display the structure of a DataFrame, including column names, column types, as well as a
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the first line should be a title? cc @felixcheung

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's OK since he has the @name and @title tags below.
Apparently this is the doc style adopted in DataFrame.R - though we should make it consistent across source files at some point.

#' a small sample of rows.
#' @name str
#' @title Compactly display the structure of a dataset
#' @rdname str
#' @family DataFrame functions
#' @param object a DataFrame
#' @examples \dontrun{
#' # Create a DataFrame from the Iris dataset
#' irisDF <- createDataFrame(sqlContext, iris)
#'
#' # Show the structure of the DataFrame
#' str(irisDF)
#' }
# S4 method `str` for DataFrame: prints a header line with the class name and
# the number of columns, then (when head() returned at least one row) one line
# per column showing its name, a short type label and the values from head().
# Output is written with cat(); the method has no explicit return value.
setMethod("str",
signature(object = "DataFrame"),
function(object) {

# Display limits, kept local to this method:
#   MAX_CHAR_PER_ROW - each column line is cut to this many characters
#   MAX_COLS         - at most this many columns are fetched and listed
# TODO: consider exposing these as package-level settings.
MAX_CHAR_PER_ROW <- 120
MAX_COLS <- 100

# Get the column names and types of the DataFrame
# (the local variable `names` shadows the base function name from here on)
names <- names(object)
types <- coltypes(object)

# Get the first elements of the dataset. Limit number of columns accordingly
localDF <- if (ncol(object) > MAX_COLS) {
head(object[, c(1:MAX_COLS)])
} else {
head(object)
}

# The number of observations will not be displayed as computing the
# number of rows is a very expensive operation
cat(paste0("'", class(object), "': ", length(names), " variables:\n"))

# Column lines are printed only when the sample holds at least one row;
# otherwise the header above is the whole output.
if (nrow(localDF) > 0) {
for (i in 1 : ncol(localDF)) {
# Get the first elements for each column.
# Values of character columns are wrapped in double quotes.

firstElements <- if (types[i] == "character") {
paste(paste0("\"", localDF[,i], "\""), collapse = " ")
} else {
paste(localDF[,i], collapse = " ")
}

# Add the corresponding number of spaces for alignment: pad up to the
# longest name in `names` (all columns of `object`, not only those shown)
spaces <- paste(rep(" ", max(nchar(names) - nchar(names[i]))), collapse="")

# Get the short type. For 'character', it would be 'chr';
# for 'numeric', it's 'num', etc.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We have a 100 char line limit. I think the comments here or in line 2230 should fit in one line ?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Combining those two lines will end up in 106 characters

# SHORT_TYPES is defined outside this method. When it has no entry for the
# type (lookup gives NULL), use the first three characters of the type name.
dataType <- SHORT_TYPES[[types[i]]]
if (is.null(dataType)) {
dataType <- substring(types[i], 1, 3)
}

# Concatenate the colnames, coltypes, and first
# elements of each column
line <- paste0(" $ ", names[i], spaces, ": ",
dataType, " ",firstElements)

# Chop off extra characters if this is too long
cat(substr(line, 1, MAX_CHAR_PER_ROW))
cat("\n")
}

# Tell the user when columns were left out because of MAX_COLS.
# The message starts with a blank line and has no trailing newline.
if (ncol(localDF) < ncol(object)) {
cat(paste0("\nDisplaying first ", ncol(localDF), " columns only."))
}
}
})
36 changes: 18 additions & 18 deletions R/pkg/R/generics.R
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,6 @@ setGeneric("subtractByKey",
setGeneric("value", function(bcast) { standardGeneric("value") })



#################### DataFrame Methods ########################

#' @rdname agg
Expand All @@ -389,6 +388,14 @@ setGeneric("agg", function (x, ...) { standardGeneric("agg") })
#' @export
setGeneric("arrange", function(x, col, ...) { standardGeneric("arrange") })

#' @rdname as.data.frame
#' @export
setGeneric("as.data.frame")

#' @rdname attach
#' @export
setGeneric("attach")

#' @rdname columns
#' @export
setGeneric("colnames", function(x, do.NULL = TRUE, prefix = "col") { standardGeneric("colnames") })
Expand Down Expand Up @@ -529,13 +536,12 @@ setGeneric("saveAsTable", function(df, tableName, source, mode, ...) {
standardGeneric("saveAsTable")
})

#' @rdname withColumn
#' @export
setGeneric("transform", function(`_data`, ...) {standardGeneric("transform") })
setGeneric("str")

#' @rdname write.df
#' @rdname mutate
#' @export
setGeneric("write.df", function(df, path, ...) { standardGeneric("write.df") })
setGeneric("transform", function(`_data`, ...) {standardGeneric("transform") })

#' @rdname write.df
#' @export
Expand Down Expand Up @@ -581,6 +587,10 @@ setGeneric("unionAll", function(x, y) { standardGeneric("unionAll") })
#' @export
setGeneric("where", function(x, condition) { standardGeneric("where") })

#' @rdname with
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are there two 'with' declarations in this file?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice catch. I have fixed it.

#' @export
setGeneric("with")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed this and also re-ordered generics declaration for attach and as.data.frame.


#' @rdname withColumn
#' @export
setGeneric("withColumn", function(x, colName, col) { standardGeneric("withColumn") })
Expand All @@ -590,6 +600,9 @@ setGeneric("withColumn", function(x, colName, col) { standardGeneric("withColumn
setGeneric("withColumnRenamed",
function(x, existingCol, newCol) { standardGeneric("withColumnRenamed") })

#' @rdname write.df
#' @export
setGeneric("write.df", function(df, path, ...) { standardGeneric("write.df") })

###################### Column Methods ##########################

Expand Down Expand Up @@ -1093,7 +1106,6 @@ setGeneric("weekofyear", function(x) { standardGeneric("weekofyear") })
#' @export
setGeneric("year", function(x) { standardGeneric("year") })


#' @rdname glm
#' @export
setGeneric("glm")
Expand All @@ -1105,15 +1117,3 @@ setGeneric("predict", function(object, ...) { standardGeneric("predict") })
#' @rdname rbind
#' @export
setGeneric("rbind", signature = "...")

#' @rdname as.data.frame
#' @export
setGeneric("as.data.frame")

#' @rdname attach
#' @export
setGeneric("attach")

#' @rdname with
#' @export
setGeneric("with")
21 changes: 17 additions & 4 deletions R/pkg/R/types.R
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,23 @@ COMPLEX_TYPES <- list(
# The full list of data types.
DATA_TYPES <- as.environment(c(as.list(PRIMITIVE_TYPES), COMPLEX_TYPES))

# Abbreviated labels for column types, keyed by the R type name.
# Stored as an environment so a label is fetched with SHORT_TYPES[[type]].
SHORT_TYPES <- as.environment(as.list(c(
  character = "chr",
  logical   = "logi",
  POSIXct   = "POSIXct",
  integer   = "int",
  numeric   = "num",
  raw       = "raw",
  Date      = "Date",
  map       = "map",
  array     = "array",
  struct    = "struct"
)))

# An environment for mapping R to Scala, names are R types and values are Scala types.
rToSQLTypes <- as.environment(list(
"integer" = "integer", # in R, integer is 32bit
"numeric" = "double", # in R, numeric == double which is 64bit
"double" = "double",
"integer" = "integer", # in R, integer is 32bit
"numeric" = "double", # in R, numeric == double which is 64bit
"double" = "double",
"character" = "string",
"logical" = "boolean"))
"logical" = "boolean"))
31 changes: 31 additions & 0 deletions R/pkg/inst/tests/testthat/test_sparkSQL.R
Original file line number Diff line number Diff line change
Expand Up @@ -1725,6 +1725,37 @@ test_that("Method coltypes() to get and set R's data types of a DataFrame", {
"Only atomic type is supported for column types")
})

# Tests for str() on a DataFrame: exact output for a small dataset, the column
# cap on a wide dataset, and unchanged behaviour of str() on a local data.frame.
test_that("Method str()", {
  # Structure of Iris, with renamed columns and an extra logical column
  iris2 <- iris
  colnames(iris2) <- c("Sepal_Length", "Sepal_Width", "Petal_Length", "Petal_Width", "Species")
  iris2$col <- TRUE
  irisDF2 <- createDataFrame(sqlContext, iris2)

  # One header line plus one line per column
  out <- capture.output(str(irisDF2))
  expect_equal(length(out), 7)
  expect_equal(out[1], "'DataFrame': 6 variables:")

  # str() pads every column name with spaces up to the longest name
  # (12 characters here), so shorter names are followed by extra spaces
  # before the colon.
  expect_equal(out[2], " $ Sepal_Length: num 5.1 4.9 4.7 4.6 5 5.4")
  expect_equal(out[3], " $ Sepal_Width : num 3.5 3 3.2 3.1 3.6 3.9")
  expect_equal(out[4], " $ Petal_Length: num 1.4 1.4 1.3 1.5 1.4 1.7")
  expect_equal(out[5], " $ Petal_Width : num 0.2 0.2 0.2 0.2 0.2 0.4")
  expect_equal(out[6], paste0(" $ Species     : chr \"setosa\" \"setosa\" \"",
                              "setosa\" \"setosa\" \"setosa\" \"setosa\""))
  expect_equal(out[7], " $ col         : logi TRUE TRUE TRUE TRUE TRUE TRUE")

  # A random dataset with many columns (200). This checks that str() limits
  # the number of columns it lists, so only the number of output lines is
  # verified: 1 header + 100 columns + 1 blank line + 1 "Displaying first ..."
  # message = 103.
  x <- runif(200, 1, 10)
  df <- data.frame(t(as.matrix(data.frame(x, x, x, x, x, x, x, x, x))))
  DF <- createDataFrame(sqlContext, df)
  out <- capture.output(str(DF))
  expect_equal(length(out), 103)

  # str() on a local data.frame must give the same output as utils::str.
  # str is exported by utils, so `::` is used rather than `:::`.
  expect_equal(capture.output(utils::str(iris)), capture.output(str(iris)))
})

unlink(parquetPath)
unlink(jsonPath)
unlink(jsonPathNa)