From dab05655cb517d8ca86a1c61ed5c8c28240ade1a Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Wed, 11 Nov 2015 01:18:29 +0000 Subject: [PATCH 01/29] Method str() --- R/pkg/NAMESPACE | 1 + R/pkg/R/DataFrame.R | 102 ++++++++++++++++++++++++++++++- R/pkg/R/generics.R | 6 +- R/pkg/R/types.R | 13 ++++ R/pkg/inst/tests/test_sparkSQL.R | 16 +++++ 5 files changed, 135 insertions(+), 3 deletions(-) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 52fd6c9f76c54..b787d5e38e21f 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -252,6 +252,7 @@ export("as.DataFrame", "parquetFile", "read.df", "sql", + "str", "table", "tableNames", "tables", diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index cc868069d1e5a..57731a17f4d2b 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2200,4 +2200,104 @@ setMethod("coltypes", rTypes[naIndices] <- types[naIndices] rTypes - }) \ No newline at end of file + }) + +#' Display the structure of a DataFrame, including column names, column types, as well as a +#' a small sample of rows. +#' @name str +#' @title Compactly display the structure of a dataset +#' @rdname str_data_frame +#' @family dataframe_funcs +#' @param x a DataFrame +#' @examples \dontrun{ +#' +#' # Create a DataFrame from the Iris dataset +#' irisDF <- createDataFrame(sqlContext, iris) +#' +#' # Show the structure of the DataFrame +#' str(irisDF) +#' +#' } +setMethod("str", signature="DataFrame", definition= + function(object) { + + # A synonym for easily concatenating strings + "%++%" <- function(x, y) { + paste(x, y, sep = "") + } + + # TODO: These could be made global parameters, though in R it's not the case + DEFAULT_HEAD_ROWS <- 6 + MAX_CHAR_PER_ROW <- 120 + MAX_COLS <- 100 + + # Get the column names and types of the DataFrame + names <- names(object) + types <- coltypes(object) + + # Get the number of rows. + # TODO: Ideally, this should be cached + cachedCount <- nrow(object) + + # Get the first elements of the dataset. 
Limit number of columns accordingly + dataFrame <- if (ncol(object) > MAX_COLS) { + head(object[, c(1:MAX_COLS)], DEFAULT_HEAD_ROWS) + } else { + head(object, DEFAULT_HEAD_ROWS) + } + + # The number of observations will be displayed only if the number + # of rows of the dataset has already been cached. + if (!is.null(cachedCount)) { + cat("'" %++% class(object) %++% "': " %++% cachedCount %++% " obs. of " %++% + length(names) %++% " variables:\n") + } else { + cat("'" %++% class(object) %++% "': " %++% length(names) %++% " variables:\n") + } + + # Whether the ... should be printed at the end of each row + ellipsis <- FALSE + + # Add ellipsis (i.e., "...") if there are more rows than shown + if (!is.null(cachedCount)) { + if (nrow(object) > DEFAULT_HEAD_ROWS) { + ellipsis <- TRUE + } + } + + if (nrow(dataFrame) > 0) { + for (i in 1 : ncol(dataFrame)) { + firstElements <- "" + + # Get the first elements for each column + if (types[i] == "chr") { + firstElements <- paste("\"" %++% dataFrame[,i] %++% "\"", collapse = " ") + } else { + firstElements <- paste(dataFrame[,i], collapse = " ") + } + + # Add the corresponding number of spaces for alignment + spaces <- paste(rep(" ", max(nchar(names) - nchar(names[i]))), collapse="") + + # Get the short type. For 'character', it would be 'chr'; 'for numeric', it's 'num', etc. 
+ dataType <- SHORT_TYPES[[types[i]]] + if (is.null(dataType)) { + dataType <- substring(types[i], 1, 3) + } + + # Concatenate the colnames, coltypes, and first elements of each column + line <- " $ " %++% names[i] %++% spaces %++% ": " %++% dataType %++% " " %++% firstElements + + # Chop off extra characters if this is too long + cat(substr(line, 1, MAX_CHAR_PER_ROW)) + if (ellipsis) { + cat(" ...") + } + cat("\n") + } + + if (ncol(dataFrame) < ncol(object)) { + cat("\nDisplaying first " %++% ncol(dataFrame) %++% " columns only.") + } + } + }) \ No newline at end of file diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 89731affeb898..cca47c4e8c6c5 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -1027,7 +1027,6 @@ setGeneric("weekofyear", function(x) { standardGeneric("weekofyear") }) #' @export setGeneric("year", function(x) { standardGeneric("year") }) - #' @rdname glm #' @export setGeneric("glm") @@ -1050,4 +1049,7 @@ setGeneric("with") #' @rdname coltypes #' @export -setGeneric("coltypes", function(x) { standardGeneric("coltypes") }) \ No newline at end of file +setGeneric("coltypes", function(x) { standardGeneric("coltypes") }) + +#' @export +setGeneric("str") \ No newline at end of file diff --git a/R/pkg/R/types.R b/R/pkg/R/types.R index 1828c23ab0f6d..1958dad4500a8 100644 --- a/R/pkg/R/types.R +++ b/R/pkg/R/types.R @@ -41,3 +41,16 @@ COMPLEX_TYPES <- list( # The full list of data types. 
DATA_TYPES <- as.environment(c(as.list(PRIMITIVE_TYPES), COMPLEX_TYPES)) + +SHORT_TYPES <- as.environment(list( + "character"="chr", + "logical"="logi", + "POSIXct"="POSIXct", + "integer"="int", + "numeric"="num", + "raw"="raw", + "Date"="Date", + "map"="map", + "array"="array", + "struct"="struct" +)) \ No newline at end of file diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index 06f52d021cff8..37ae3bbb653ab 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -1525,6 +1525,22 @@ test_that("Method coltypes() to get R's data types of a DataFrame", { expect_equal(coltypes(x), "map") }) +test_that("Method str()", { + # Structure of Iiris + iris2 <- iris + iris2$col <- TRUE + irisDF2 <- createDataFrame(sqlContext, iris2) + out <- capture.output(str(irisDF2)) + expect_equal(length(out), 7) + + # A random dataset with many columns + x <- runif(200, 1, 10) + df <- data.frame(t(as.matrix(data.frame(x,x,x,x,x,x,x,x,x)))) + DF <- createDataFrame(sqlContext, df) + out <- capture.output(str(DF)) + expect_equal(length(out), 103) +}) + unlink(parquetPath) unlink(jsonPath) unlink(jsonPathNa) From b5129cd12711a678527b927999bf8f9ce5aa73fa Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Wed, 11 Nov 2015 01:40:19 +0000 Subject: [PATCH 02/29] Fixed R style issues --- R/pkg/R/DataFrame.R | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 57731a17f4d2b..0fe0fb83790d0 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2249,7 +2249,7 @@ setMethod("str", signature="DataFrame", definition= # The number of observations will be displayed only if the number # of rows of the dataset has already been cached. if (!is.null(cachedCount)) { - cat("'" %++% class(object) %++% "': " %++% cachedCount %++% " obs. of " %++% + cat("'" %++% class(object) %++% "': " %++% cachedCount %++% " obs. 
of " %++% length(names) %++% " variables:\n") } else { cat("'" %++% class(object) %++% "': " %++% length(names) %++% " variables:\n") @@ -2274,19 +2274,22 @@ setMethod("str", signature="DataFrame", definition= firstElements <- paste("\"" %++% dataFrame[,i] %++% "\"", collapse = " ") } else { firstElements <- paste(dataFrame[,i], collapse = " ") - } + } # Add the corresponding number of spaces for alignment spaces <- paste(rep(" ", max(nchar(names) - nchar(names[i]))), collapse="") - # Get the short type. For 'character', it would be 'chr'; 'for numeric', it's 'num', etc. + # Get the short type. For 'character', it would be 'chr'; + # 'for numeric', it's 'num', etc. dataType <- SHORT_TYPES[[types[i]]] if (is.null(dataType)) { dataType <- substring(types[i], 1, 3) } - # Concatenate the colnames, coltypes, and first elements of each column - line <- " $ " %++% names[i] %++% spaces %++% ": " %++% dataType %++% " " %++% firstElements + # Concatenate the colnames, coltypes, and first + # elements of each column + line <- " $ " %++% names[i] %++% spaces %++% ": " %++% + dataType %++% " " %++% firstElements # Chop off extra characters if this is too long cat(substr(line, 1, MAX_CHAR_PER_ROW)) From d825d2c9f151056869f49a05f8a81d8036b40597 Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Wed, 11 Nov 2015 01:45:19 +0000 Subject: [PATCH 03/29] Fixed R style issues --- R/pkg/R/DataFrame.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 0fe0fb83790d0..53b12724624b0 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2279,14 +2279,14 @@ setMethod("str", signature="DataFrame", definition= # Add the corresponding number of spaces for alignment spaces <- paste(rep(" ", max(nchar(names) - nchar(names[i]))), collapse="") - # Get the short type. For 'character', it would be 'chr'; + # Get the short type. For 'character', it would be 'chr'; # 'for numeric', it's 'num', etc. 
dataType <- SHORT_TYPES[[types[i]]] if (is.null(dataType)) { dataType <- substring(types[i], 1, 3) } - # Concatenate the colnames, coltypes, and first + # Concatenate the colnames, coltypes, and first # elements of each column line <- " $ " %++% names[i] %++% spaces %++% ": " %++% dataType %++% " " %++% firstElements From daa3d41a78f6373c8a71340f0494ab7fb7ee6947 Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Sat, 14 Nov 2015 01:59:09 +0000 Subject: [PATCH 04/29] Fixed style. Added more specific tests --- R/pkg/R/DataFrame.R | 145 +++++++++++++++---------------- R/pkg/R/generics.R | 41 +++++---- R/pkg/inst/tests/test_sparkSQL.R | 17 +++- 3 files changed, 105 insertions(+), 98 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 53b12724624b0..45e7c5f8ef477 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2218,89 +2218,88 @@ setMethod("coltypes", #' str(irisDF) #' #' } -setMethod("str", signature="DataFrame", definition= - function(object) { +setMethod("str", + signature(object = "DataFrame"), + function(object) { - # A synonym for easily concatenating strings - "%++%" <- function(x, y) { - paste(x, y, sep = "") - } + # A synonym for easily concatenating strings + "%++%" <- function(x, y) { + paste(x, y, sep = "") + } - # TODO: These could be made global parameters, though in R it's not the case - DEFAULT_HEAD_ROWS <- 6 - MAX_CHAR_PER_ROW <- 120 - MAX_COLS <- 100 - - # Get the column names and types of the DataFrame - names <- names(object) - types <- coltypes(object) - - # Get the number of rows. - # TODO: Ideally, this should be cached - cachedCount <- nrow(object) - - # Get the first elements of the dataset. Limit number of columns accordingly - dataFrame <- if (ncol(object) > MAX_COLS) { - head(object[, c(1:MAX_COLS)], DEFAULT_HEAD_ROWS) - } else { - head(object, DEFAULT_HEAD_ROWS) - } - - # The number of observations will be displayed only if the number - # of rows of the dataset has already been cached. 
- if (!is.null(cachedCount)) { - cat("'" %++% class(object) %++% "': " %++% cachedCount %++% " obs. of " %++% - length(names) %++% " variables:\n") - } else { - cat("'" %++% class(object) %++% "': " %++% length(names) %++% " variables:\n") - } + # TODO: These could be made global parameters, though in R it's not the case + DEFAULT_HEAD_ROWS <- 6 + MAX_CHAR_PER_ROW <- 120 + MAX_COLS <- 100 + + # Get the column names and types of the DataFrame + names <- names(object) + types <- coltypes(object) + + # Get the number of rows. + # TODO: Ideally, this should be cached + cachedCount <- nrow(object) + + # Get the first elements of the dataset. Limit number of columns accordingly + dataFrame <- if (ncol(object) > MAX_COLS) { + head(object[, c(1:MAX_COLS)], DEFAULT_HEAD_ROWS) + } else { + head(object, DEFAULT_HEAD_ROWS) + } + + # The number of observations will be displayed only if the number + # of rows of the dataset has already been cached. + if (!is.null(cachedCount)) { + cat("'" %++% class(object) %++% "': " %++% cachedCount %++% " obs. of " %++% + length(names) %++% " variables:\n") + } else { + cat("'" %++% class(object) %++% "': " %++% length(names) %++% " variables:\n") + } - # Whether the ... should be printed at the end of each row - ellipsis <- FALSE + # Whether the ... 
should be printed at the end of each row + ellipsis <- FALSE - # Add ellipsis (i.e., "...") if there are more rows than shown - if (!is.null(cachedCount)) { - if (nrow(object) > DEFAULT_HEAD_ROWS) { - ellipsis <- TRUE - } - } + # Add ellipsis (i.e., "...") if there are more rows than shown + if (!is.null(cachedCount) && (cachedCount > DEFAULT_HEAD_ROWS)) { + ellipsis <- TRUE + } - if (nrow(dataFrame) > 0) { - for (i in 1 : ncol(dataFrame)) { - firstElements <- "" + if (nrow(dataFrame) > 0) { + for (i in 1 : ncol(dataFrame)) { + firstElements <- "" - # Get the first elements for each column - if (types[i] == "chr") { - firstElements <- paste("\"" %++% dataFrame[,i] %++% "\"", collapse = " ") - } else { - firstElements <- paste(dataFrame[,i], collapse = " ") - } + # Get the first elements for each column + if (types[i] == "chr") { + firstElements <- paste("\"" %++% dataFrame[,i] %++% "\"", collapse = " ") + } else { + firstElements <- paste(dataFrame[,i], collapse = " ") + } - # Add the corresponding number of spaces for alignment - spaces <- paste(rep(" ", max(nchar(names) - nchar(names[i]))), collapse="") + # Add the corresponding number of spaces for alignment + spaces <- paste(rep(" ", max(nchar(names) - nchar(names[i]))), collapse="") - # Get the short type. For 'character', it would be 'chr'; - # 'for numeric', it's 'num', etc. - dataType <- SHORT_TYPES[[types[i]]] - if (is.null(dataType)) { - dataType <- substring(types[i], 1, 3) - } + # Get the short type. For 'character', it would be 'chr'; + # 'for numeric', it's 'num', etc. 
+ dataType <- SHORT_TYPES[[types[i]]] + if (is.null(dataType)) { + dataType <- substring(types[i], 1, 3) + } - # Concatenate the colnames, coltypes, and first - # elements of each column - line <- " $ " %++% names[i] %++% spaces %++% ": " %++% - dataType %++% " " %++% firstElements + # Concatenate the colnames, coltypes, and first + # elements of each column + line <- " $ " %++% names[i] %++% spaces %++% ": " %++% + dataType %++% " " %++% firstElements - # Chop off extra characters if this is too long - cat(substr(line, 1, MAX_CHAR_PER_ROW)) - if (ellipsis) { - cat(" ...") - } - cat("\n") + # Chop off extra characters if this is too long + cat(substr(line, 1, MAX_CHAR_PER_ROW)) + if (ellipsis) { + cat(" ...") } + cat("\n") + } - if (ncol(dataFrame) < ncol(object)) { - cat("\nDisplaying first " %++% ncol(dataFrame) %++% " columns only.") - } + if (ncol(dataFrame) < ncol(object)) { + cat("\nDisplaying first " %++% ncol(dataFrame) %++% " columns only.") } - }) \ No newline at end of file + } + }) \ No newline at end of file diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index cca47c4e8c6c5..e48d5341e2dc2 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -388,7 +388,6 @@ setGeneric("subtractByKey", setGeneric("value", function(bcast) { standardGeneric("value") }) - #################### DataFrame Methods ######################## #' @rdname agg @@ -399,6 +398,18 @@ setGeneric("agg", function (x, ...) { standardGeneric("agg") }) #' @export setGeneric("arrange", function(x, col, ...) 
{ standardGeneric("arrange") }) +#' @rdname as.data.frame +#' @export +setGeneric("as.data.frame") + +#' @rdname attach +#' @export +setGeneric("attach") + +#' @rdname coltypes +#' @export +setGeneric("coltypes", function(x) { standardGeneric("coltypes") }) + #' @rdname schema #' @export setGeneric("columns", function(x) {standardGeneric("columns") }) @@ -579,6 +590,10 @@ setGeneric("unionAll", function(x, y) { standardGeneric("unionAll") }) #' @export setGeneric("where", function(x, condition) { standardGeneric("where") }) +#' @rdname with +#' @export +setGeneric("with") + #' @rdname withColumn #' @export setGeneric("withColumn", function(x, colName, col) { standardGeneric("withColumn") }) @@ -971,6 +986,9 @@ setGeneric("size", function(x) { standardGeneric("size") }) #' @export setGeneric("soundex", function(x) { standardGeneric("soundex") }) +#' @export +setGeneric("str") + #' @rdname substring_index #' @export setGeneric("substring_index", function(x, delim, count) { standardGeneric("substring_index") }) @@ -1033,23 +1051,4 @@ setGeneric("glm") #' @rdname rbind #' @export -setGeneric("rbind", signature = "...") - -#' @rdname as.data.frame -#' @export -setGeneric("as.data.frame") - -#' @rdname attach -#' @export -setGeneric("attach") - -#' @rdname with -#' @export -setGeneric("with") - -#' @rdname coltypes -#' @export -setGeneric("coltypes", function(x) { standardGeneric("coltypes") }) - -#' @export -setGeneric("str") \ No newline at end of file +setGeneric("rbind", signature = "...") \ No newline at end of file diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index 37ae3bbb653ab..4b97fa7fec3b2 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -1526,14 +1526,23 @@ test_that("Method coltypes() to get R's data types of a DataFrame", { }) test_that("Method str()", { - # Structure of Iiris + # Structure of Iris iris2 <- iris iris2$col <- TRUE irisDF2 <- createDataFrame(sqlContext, iris2) out <- 
capture.output(str(irisDF2)) expect_equal(length(out), 7) - - # A random dataset with many columns + expect_equal(out[1], "'DataFrame': 150 obs. of 6 variables:") + expect_equal(out[2], " $ Sepal_Length: num 5.1 4.9 4.7 4.6 5 5.4 ...") + expect_equal(out[3], " $ Sepal_Width : num 3.5 3 3.2 3.1 3.6 3.9 ...") + expect_equal(out[4], " $ Petal_Length: num 1.4 1.4 1.3 1.5 1.4 1.7 ...") + expect_equal(out[5], " $ Petal_Width : num 0.2 0.2 0.2 0.2 0.2 0.4 ...") + expect_equal(out[6], " $ Species : chr setosa setosa setosa setosa setosa setosa ...") + expect_equal(out[7], " $ col : logi TRUE TRUE TRUE TRUE TRUE TRUE ...") + + # A random dataset with many columns. This test is to check str limits + # the number of columns. Therefore, it will suffice to check for the + # number of returned rows x <- runif(200, 1, 10) df <- data.frame(t(as.matrix(data.frame(x,x,x,x,x,x,x,x,x)))) DF <- createDataFrame(sqlContext, df) @@ -1543,4 +1552,4 @@ test_that("Method str()", { unlink(parquetPath) unlink(jsonPath) -unlink(jsonPathNa) +unlink(jsonPathNa) \ No newline at end of file From 5b4f6b1a38ab51fef74eea2eef3f3699a72a331a Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Sat, 14 Nov 2015 02:16:25 +0000 Subject: [PATCH 05/29] Replaced %++% by paste0 --- R/pkg/R/DataFrame.R | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 45e7c5f8ef477..a5a7977f6c06e 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2222,11 +2222,6 @@ setMethod("str", signature(object = "DataFrame"), function(object) { - # A synonym for easily concatenating strings - "%++%" <- function(x, y) { - paste(x, y, sep = "") - } - # TODO: These could be made global parameters, though in R it's not the case DEFAULT_HEAD_ROWS <- 6 MAX_CHAR_PER_ROW <- 120 @@ -2250,10 +2245,10 @@ setMethod("str", # The number of observations will be displayed only if the number # of rows of the dataset has already been cached. 
if (!is.null(cachedCount)) { - cat("'" %++% class(object) %++% "': " %++% cachedCount %++% " obs. of " %++% - length(names) %++% " variables:\n") + cat(paste0("'", class(object), "': ", cachedCount, " obs. of ", + length(names), " variables:\n")) } else { - cat("'" %++% class(object) %++% "': " %++% length(names) %++% " variables:\n") + cat(paste0("'", class(object), "': ", length(names), " variables:\n")) } # Whether the ... should be printed at the end of each row @@ -2270,7 +2265,7 @@ setMethod("str", # Get the first elements for each column if (types[i] == "chr") { - firstElements <- paste("\"" %++% dataFrame[,i] %++% "\"", collapse = " ") + firstElements <- paste(paste0("\"", dataFrame[,i], "\""), collapse = " ") } else { firstElements <- paste(dataFrame[,i], collapse = " ") } @@ -2287,8 +2282,8 @@ setMethod("str", # Concatenate the colnames, coltypes, and first # elements of each column - line <- " $ " %++% names[i] %++% spaces %++% ": " %++% - dataType %++% " " %++% firstElements + line <- paste0(" $ ", names[i], spaces, ": ", + dataType, " ",firstElements) # Chop off extra characters if this is too long cat(substr(line, 1, MAX_CHAR_PER_ROW)) @@ -2299,7 +2294,7 @@ setMethod("str", } if (ncol(dataFrame) < ncol(object)) { - cat("\nDisplaying first " %++% ncol(dataFrame) %++% " columns only.") + cat(paste0("\nDisplaying first ", ncol(dataFrame), " columns only.")) } } }) \ No newline at end of file From 6d226e9a662d4890657848f8f7181a85c82a17c6 Mon Sep 17 00:00:00 2001 From: "Oscar D. 
Lara Yejas" Date: Sat, 14 Nov 2015 02:19:06 +0000 Subject: [PATCH 06/29] Removed white space --- R/pkg/inst/tests/test_sparkSQL.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index 4b97fa7fec3b2..2d8bf7a18c02f 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -1539,7 +1539,7 @@ test_that("Method str()", { expect_equal(out[5], " $ Petal_Width : num 0.2 0.2 0.2 0.2 0.2 0.4 ...") expect_equal(out[6], " $ Species : chr setosa setosa setosa setosa setosa setosa ...") expect_equal(out[7], " $ col : logi TRUE TRUE TRUE TRUE TRUE TRUE ...") - + # A random dataset with many columns. This test is to check str limits # the number of columns. Therefore, it will suffice to check for the # number of returned rows From b74288b4e2a06e06d37fdab7a0a2e539a5bcb5f0 Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Mon, 16 Nov 2015 20:05:29 +0000 Subject: [PATCH 07/29] Style changes and added quotes to character fields --- R/pkg/R/DataFrame.R | 17 +++++++---------- R/pkg/inst/tests/test_sparkSQL.R | 2 +- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 7df4b93860230..3e24cebe6a723 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2204,24 +2204,21 @@ setMethod("coltypes", #' a small sample of rows. 
#' @name str #' @title Compactly display the structure of a dataset -#' @rdname str_data_frame -#' @family dataframe_funcs -#' @param x a DataFrame +#' @rdname str +#' @family DataFrame functions +#' @param object a DataFrame #' @examples \dontrun{ -#' #' # Create a DataFrame from the Iris dataset #' irisDF <- createDataFrame(sqlContext, iris) #' #' # Show the structure of the DataFrame #' str(irisDF) -#' #' } setMethod("str", signature(object = "DataFrame"), function(object) { # TODO: These could be made global parameters, though in R it's not the case - DEFAULT_HEAD_ROWS <- 6 MAX_CHAR_PER_ROW <- 120 MAX_COLS <- 100 @@ -2235,9 +2232,9 @@ setMethod("str", # Get the first elements of the dataset. Limit number of columns accordingly dataFrame <- if (ncol(object) > MAX_COLS) { - head(object[, c(1:MAX_COLS)], DEFAULT_HEAD_ROWS) + head(object[, c(1:MAX_COLS)]) } else { - head(object, DEFAULT_HEAD_ROWS) + head(object) } # The number of observations will be displayed only if the number @@ -2253,7 +2250,7 @@ setMethod("str", ellipsis <- FALSE # Add ellipsis (i.e., "...") if there are more rows than shown - if (!is.null(cachedCount) && (cachedCount > DEFAULT_HEAD_ROWS)) { + if (!is.null(cachedCount) && (cachedCount > 6)) { ellipsis <- TRUE } @@ -2262,7 +2259,7 @@ setMethod("str", firstElements <- "" # Get the first elements for each column - if (types[i] == "chr") { + if (types[i] == "character") { firstElements <- paste(paste0("\"", dataFrame[,i], "\""), collapse = " ") } else { firstElements <- paste(dataFrame[,i], collapse = " ") diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index 3e3ad77025f03..c2b2b9188a49a 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -1604,7 +1604,7 @@ test_that("Method str()", { expect_equal(out[3], " $ Sepal_Width : num 3.5 3 3.2 3.1 3.6 3.9 ...") expect_equal(out[4], " $ Petal_Length: num 1.4 1.4 1.3 1.5 1.4 1.7 ...") expect_equal(out[5], " $ Petal_Width : num 0.2 0.2 0.2 
0.2 0.2 0.4 ...") - expect_equal(out[6], " $ Species : chr setosa setosa setosa setosa setosa setosa ...") + expect_equal(out[6], " $ Species : chr \"setosa\" \"setosa\" \"setosa\" \"setosa\" \"setosa\" \"setosa\" ...") expect_equal(out[7], " $ col : logi TRUE TRUE TRUE TRUE TRUE TRUE ...") # A random dataset with many columns. This test is to check str limits From db96730fbcccb516f246f245a0a0ec6fa20f360b Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Mon, 16 Nov 2015 21:38:38 +0000 Subject: [PATCH 08/29] Fixed R style issues --- R/pkg/inst/tests/test_sparkSQL.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index c2b2b9188a49a..d79a6717b4f63 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -1604,7 +1604,8 @@ test_that("Method str()", { expect_equal(out[3], " $ Sepal_Width : num 3.5 3 3.2 3.1 3.6 3.9 ...") expect_equal(out[4], " $ Petal_Length: num 1.4 1.4 1.3 1.5 1.4 1.7 ...") expect_equal(out[5], " $ Petal_Width : num 0.2 0.2 0.2 0.2 0.2 0.4 ...") - expect_equal(out[6], " $ Species : chr \"setosa\" \"setosa\" \"setosa\" \"setosa\" \"setosa\" \"setosa\" ...") + expect_equal(out[6], paste0(" $ Species : chr \"setosa\" \"setosa\" \"", + "setosa\" \"setosa\" \"setosa\" \"setosa\" ...")) expect_equal(out[7], " $ col : logi TRUE TRUE TRUE TRUE TRUE TRUE ...") # A random dataset with many columns. This test is to check str limits From 6bb5bd4e6e36605d901b58eedb54d315f768ecc4 Mon Sep 17 00:00:00 2001 From: "Oscar D. 
Lara Yejas" Date: Wed, 18 Nov 2015 15:24:49 -0800 Subject: [PATCH 09/29] Merged str function --- R/pkg/R/DataFrame.R | 94 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 34177e3cdd94f..d98ec070b71a6 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2199,3 +2199,97 @@ setMethod("coltypes", rTypes }) + +#' Display the structure of a DataFrame, including column names, column types, as well as a +#' a small sample of rows. +#' @name str +#' @title Compactly display the structure of a dataset +#' @rdname str +#' @family DataFrame functions +#' @param object a DataFrame +#' @examples \dontrun{ +#' # Create a DataFrame from the Iris dataset +#' irisDF <- createDataFrame(sqlContext, iris) +#' +#' # Show the structure of the DataFrame +#' str(irisDF) +#' } +setMethod("str", + signature(object = "DataFrame"), + function(object) { + + # TODO: These could be made global parameters, though in R it's not the case + MAX_CHAR_PER_ROW <- 120 + MAX_COLS <- 100 + + # Get the column names and types of the DataFrame + names <- names(object) + types <- coltypes(object) + + # Get the number of rows. + # TODO: Ideally, this should be cached + cachedCount <- nrow(object) + + # Get the first elements of the dataset. Limit number of columns accordingly + dataFrame <- if (ncol(object) > MAX_COLS) { + head(object[, c(1:MAX_COLS)]) + } else { + head(object) + } + + # The number of observations will be displayed only if the number + # of rows of the dataset has already been cached. + if (!is.null(cachedCount)) { + cat(paste0("'", class(object), "': ", cachedCount, " obs. of ", + length(names), " variables:\n")) + } else { + cat(paste0("'", class(object), "': ", length(names), " variables:\n")) + } + + # Whether the ... 
should be printed at the end of each row + ellipsis <- FALSE + + # Add ellipsis (i.e., "...") if there are more rows than shown + if (!is.null(cachedCount) && (cachedCount > 6)) { + ellipsis <- TRUE + } + + if (nrow(dataFrame) > 0) { + for (i in 1 : ncol(dataFrame)) { + firstElements <- "" + + # Get the first elements for each column + if (types[i] == "character") { + firstElements <- paste(paste0("\"", dataFrame[,i], "\""), collapse = " ") + } else { + firstElements <- paste(dataFrame[,i], collapse = " ") + } + + # Add the corresponding number of spaces for alignment + spaces <- paste(rep(" ", max(nchar(names) - nchar(names[i]))), collapse="") + + # Get the short type. For 'character', it would be 'chr'; + # 'for numeric', it's 'num', etc. + dataType <- SHORT_TYPES[[types[i]]] + if (is.null(dataType)) { + dataType <- substring(types[i], 1, 3) + } + + # Concatenate the colnames, coltypes, and first + # elements of each column + line <- paste0(" $ ", names[i], spaces, ": ", + dataType, " ",firstElements) + + # Chop off extra characters if this is too long + cat(substr(line, 1, MAX_CHAR_PER_ROW)) + if (ellipsis) { + cat(" ...") + } + cat("\n") + } + + if (ncol(dataFrame) < ncol(object)) { + cat(paste0("\nDisplaying first ", ncol(dataFrame), " columns only.")) + } + } + }) From 41350293774de09953495a930722585e61fd17ca Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Wed, 18 Nov 2015 15:28:44 -0800 Subject: [PATCH 10/29] Update generics.R --- R/pkg/R/generics.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 738dd9bc5df3f..5be6e9fb9cbbd 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -1099,3 +1099,6 @@ setGeneric("with") #' @rdname coltypes #' @export setGeneric("coltypes", function(x) { standardGeneric("coltypes") }) + ++#' @export ++setGeneric("str") From a995f6e665354308a52c4f4c44e047a5aa946d9a Mon Sep 17 00:00:00 2001 From: "Oscar D. 
Lara Yejas" Date: Wed, 18 Nov 2015 15:30:31 -0800 Subject: [PATCH 11/29] Update generics.R --- R/pkg/R/generics.R | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 5be6e9fb9cbbd..ea3d9c67e7a3f 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -396,6 +396,10 @@ setGeneric("attach") #' @export setGeneric("coltypes", function(x) { standardGeneric("coltypes") }) +#' @rdname coltypes +#' @export +setGeneric("coltypes", function(x) { standardGeneric("coltypes") }) + #' @rdname schema #' @export setGeneric("columns", function(x) {standardGeneric("columns") }) @@ -520,10 +524,17 @@ setGeneric("saveAsTable", function(df, tableName, source, mode, ...) { standardGeneric("saveAsTable") }) +#' @export +setGeneric("str") + #' @rdname withColumn #' @export setGeneric("transform", function(`_data`, ...) {standardGeneric("transform") }) +#' @rdname with +#' @export +setGeneric("with") + #' @rdname write.df #' @export setGeneric("write.df", function(df, path, ...) { standardGeneric("write.df") }) @@ -1083,22 +1094,3 @@ setGeneric("predict", function(object, ...) { standardGeneric("predict") }) #' @rdname rbind #' @export setGeneric("rbind", signature = "...") - -#' @rdname as.data.frame -#' @export -setGeneric("as.data.frame") - -#' @rdname attach -#' @export -setGeneric("attach") - -#' @rdname with -#' @export -setGeneric("with") - -#' @rdname coltypes -#' @export -setGeneric("coltypes", function(x) { standardGeneric("coltypes") }) - -+#' @export -+setGeneric("str") From 7f4adbcb9bc5926de4ed67a95e56692894c33054 Mon Sep 17 00:00:00 2001 From: "Oscar D. 
Lara Yejas" Date: Wed, 18 Nov 2015 15:38:03 -0800 Subject: [PATCH 12/29] Update generics.R --- R/pkg/R/generics.R | 4 ---- 1 file changed, 4 deletions(-) diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index ea3d9c67e7a3f..9a01c29c7a868 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -396,10 +396,6 @@ setGeneric("attach") #' @export setGeneric("coltypes", function(x) { standardGeneric("coltypes") }) -#' @rdname coltypes -#' @export -setGeneric("coltypes", function(x) { standardGeneric("coltypes") }) - #' @rdname schema #' @export setGeneric("columns", function(x) {standardGeneric("columns") }) From 957b3c29b60d49ff6f2bc39e75b4cc21b6018c8d Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Thu, 19 Nov 2015 14:54:01 -0800 Subject: [PATCH 13/29] Update generics.R --- R/pkg/R/generics.R | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 9a01c29c7a868..cd0a8cbba20f8 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -523,18 +523,6 @@ setGeneric("saveAsTable", function(df, tableName, source, mode, ...) { #' @export setGeneric("str") -#' @rdname withColumn -#' @export -setGeneric("transform", function(`_data`, ...) {standardGeneric("transform") }) - -#' @rdname with -#' @export -setGeneric("with") - -#' @rdname write.df -#' @export -setGeneric("write.df", function(df, path, ...) { standardGeneric("write.df") }) - #' @rdname write.df #' @export setGeneric("saveDF", function(df, path, ...) { standardGeneric("saveDF") }) @@ -571,6 +559,10 @@ setGeneric("toJSON", function(x) { standardGeneric("toJSON") }) setGeneric("toRDD", function(x) { standardGeneric("toRDD") }) +#' @rdname withColumn +#' @export +setGeneric("transform", function(`_data`, ...) 
{standardGeneric("transform") }) + #' @rdname unionAll #' @export setGeneric("unionAll", function(x, y) { standardGeneric("unionAll") }) @@ -592,6 +584,9 @@ setGeneric("withColumn", function(x, colName, col) { standardGeneric("withColumn setGeneric("withColumnRenamed", function(x, existingCol, newCol) { standardGeneric("withColumnRenamed") }) +#' @rdname write.df +#' @export +setGeneric("write.df", function(df, path, ...) { standardGeneric("write.df") }) ###################### Column Methods ########################## From 1705432c6c785ec0c48cd318c2d539ad8aca41a4 Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Thu, 19 Nov 2015 16:12:11 -0800 Subject: [PATCH 14/29] Added tests for utils:::str --- R/pkg/inst/tests/test_sparkSQL.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index 4d3ee72bae4e0..3ba7132ee74e1 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -1630,8 +1630,11 @@ test_that("Method str()", { DF <- createDataFrame(sqlContext, df) out <- capture.output(str(DF)) expect_equal(length(out), 103) + + # Test utils:::str + expect_equal(capture.output(utils:::str(iris)), capture.output(str(iris))) }) unlink(parquetPath) unlink(jsonPath) -unlink(jsonPathNa) \ No newline at end of file +unlink(jsonPathNa) From cfdfc013b2084d9141e52226407db4930d506fc2 Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Mon, 23 Nov 2015 11:10:53 -0800 Subject: [PATCH 15/29] Renamed dataFrame for localDF for clarity --- R/pkg/R/DataFrame.R | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index d98ec070b71a6..29e8da72b42f7 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2231,7 +2231,7 @@ setMethod("str", cachedCount <- nrow(object) # Get the first elements of the dataset. 
Limit number of columns accordingly - dataFrame <- if (ncol(object) > MAX_COLS) { + localDF <- if (ncol(object) > MAX_COLS) { head(object[, c(1:MAX_COLS)]) } else { head(object) @@ -2254,15 +2254,15 @@ setMethod("str", ellipsis <- TRUE } - if (nrow(dataFrame) > 0) { - for (i in 1 : ncol(dataFrame)) { + if (nrow(localDF) > 0) { + for (i in 1 : ncol(localDF)) { firstElements <- "" # Get the first elements for each column if (types[i] == "character") { - firstElements <- paste(paste0("\"", dataFrame[,i], "\""), collapse = " ") + firstElements <- paste(paste0("\"", localDF[,i], "\""), collapse = " ") } else { - firstElements <- paste(dataFrame[,i], collapse = " ") + firstElements <- paste(localDF[,i], collapse = " ") } # Add the corresponding number of spaces for alignment @@ -2288,8 +2288,8 @@ setMethod("str", cat("\n") } - if (ncol(dataFrame) < ncol(object)) { - cat(paste0("\nDisplaying first ", ncol(dataFrame), " columns only.")) + if (ncol(localDF) < ncol(object)) { + cat(paste0("\nDisplaying first ", ncol(localDF), " columns only.")) } } }) From 4b416cc0804c4f3d721eebcdbc3e94c18e66f4de Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Mon, 7 Dec 2015 17:32:31 -0800 Subject: [PATCH 16/29] Merged str() code after master update --- R/pkg/R/DataFrame.R | 93 +++++++++++++++++++++++++++++++++++++++++++++ R/pkg/R/types.R | 13 +++++++ 2 files changed, 106 insertions(+) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 81b4e6b91d8a2..f8323b16272a0 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2223,3 +2223,96 @@ setMethod("with", newEnv <- assignNewEnv(data) eval(substitute(expr), envir = newEnv, enclos = newEnv) }) + +#' Display the structure of a DataFrame, including column names, column types, as well as +#' a small sample of rows.
+#' @name str +#' @title Compactly display the structure of a dataset +#' @rdname str +#' @family DataFrame functions +#' @param object a DataFrame +#' @examples \dontrun{ +#' # Create a DataFrame from the Iris dataset +#' irisDF <- createDataFrame(sqlContext, iris) +#' +#' # Show the structure of the DataFrame +#' str(irisDF) +#' } +setMethod("str", + signature(object = "DataFrame"), + function(object) { + + # TODO: These could be made global parameters, though in R it's not the case + MAX_COLS <- 100 + + # Get the column names and types of the DataFrame + names <- names(object) + types <- coltypes(object) + + # Get the number of rows. + # TODO: Ideally, this should be cached + cachedCount <- nrow(object) + + # Get the first elements of the dataset. Limit number of columns accordingly + localDF <- if (ncol(object) > MAX_COLS) { + head(object[, c(1:MAX_COLS)]) + } else { + head(object) + } + + # The number of observations will be displayed only if the number + # of rows of the dataset has already been cached. + if (!is.null(cachedCount)) { + cat(paste0("'", class(object), "': ", cachedCount, " obs. of ", + length(names), " variables:\n")) + } else { + cat(paste0("'", class(object), "': ", length(names), " variables:\n")) + } + + # Whether the ... should be printed at the end of each row + ellipsis <- FALSE + + # Add ellipsis (i.e., "...") if there are more rows than shown + if (!is.null(cachedCount) && (cachedCount > 6)) { + ellipsis <- TRUE + } + + if (nrow(localDF) > 0) { + for (i in 1 : ncol(localDF)) { + firstElements <- "" + + # Get the first elements for each column + if (types[i] == "character") { + firstElements <- paste(paste0("\"", localDF[,i], "\""), collapse = " ") + } else { + firstElements <- paste(localDF[,i], collapse = " ") + } + + # Add the corresponding number of spaces for alignment + spaces <- paste(rep(" ", max(nchar(names) - nchar(names[i]))), collapse="") + + # Get the short type. 
For 'character', it would be 'chr'; + # for 'numeric', it's 'num', etc. + dataType <- SHORT_TYPES[[types[i]]] + if (is.null(dataType)) { + dataType <- substring(types[i], 1, 3) + } + + # Concatenate the colnames, coltypes, and first + # elements of each column + line <- paste0(" $ ", names[i], spaces, ": ", + dataType, " ",firstElements) + + # Chop off extra characters if this is too long + cat(substr(line, 1, MAX_CHAR_PER_ROW)) + if (ellipsis) { + cat(" ...") + } + cat("\n") + } + + if (ncol(localDF) < ncol(object)) { + cat(paste0("\nDisplaying first ", ncol(localDF), " columns only.")) + } + } + }) diff --git a/R/pkg/R/types.R b/R/pkg/R/types.R index 1f06af7e904fe..537b1bd277e67 100644 --- a/R/pkg/R/types.R +++ b/R/pkg/R/types.R @@ -47,6 +47,19 @@ COMPLEX_TYPES <- list( # The full list of data types. DATA_TYPES <- as.environment(c(as.list(PRIMITIVE_TYPES), COMPLEX_TYPES)) +SHORT_TYPES <- as.environment(list( + "character"="chr", + "logical"="logi", + "POSIXct"="POSIXct", + "integer"="int", + "numeric"="num", + "raw"="raw", + "Date"="Date", + "map"="map", + "array"="array", + "struct"="struct" +)) + # An environment for mapping R to Scala, names are R types and values are Scala types. rToSQLTypes <- as.environment(list( "integer" = "integer", # in R, integer is 32bit From 8140e20a6eff13e0869176f747f72d897832c329 Mon Sep 17 00:00:00 2001 From: "Oscar D.
Lara Yejas" Date: Tue, 8 Dec 2015 10:32:14 -0800 Subject: [PATCH 17/29] Added max_char_per_row constant --- R/pkg/R/DataFrame.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index f8323b16272a0..ae81e7fd4db5c 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2243,6 +2243,7 @@ setMethod("str", function(object) { # TODO: These could be made global parameters, though in R it's not the case + MAX_CHAR_PER_ROW <- 120 MAX_COLS <- 100 # Get the column names and types of the DataFrame From 6a7ff1b16ce3bcde2c52144d4893e11002b36047 Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Thu, 17 Dec 2015 11:33:44 -0800 Subject: [PATCH 18/29] Removed caching logic. Updated tests --- R/pkg/R/DataFrame.R | 28 ++++------------------- R/pkg/inst/tests/testthat/test_sparkSQL.R | 14 ++++++------ 2 files changed, 11 insertions(+), 31 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index ae81e7fd4db5c..eacbf3143404c 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2250,10 +2250,6 @@ setMethod("str", names <- names(object) types <- coltypes(object) - # Get the number of rows. - # TODO: Ideally, this should be cached - cachedCount <- nrow(object) - # Get the first elements of the dataset. Limit number of columns accordingly localDF <- if (ncol(object) > MAX_COLS) { head(object[, c(1:MAX_COLS)]) @@ -2261,22 +2257,9 @@ setMethod("str", head(object) } - # The number of observations will be displayed only if the number - # of rows of the dataset has already been cached. - if (!is.null(cachedCount)) { - cat(paste0("'", class(object), "': ", cachedCount, " obs. of ", - length(names), " variables:\n")) - } else { - cat(paste0("'", class(object), "': ", length(names), " variables:\n")) - } - - # Whether the ... 
should be printed at the end of each row - ellipsis <- FALSE - - # Add ellipsis (i.e., "...") if there are more rows than shown - if (!is.null(cachedCount) && (cachedCount > 6)) { - ellipsis <- TRUE - } + # The number of observations will not be displayed as computing the + # number of rows is a very expensive operation + cat(paste0("'", class(object), "': ", length(names), " variables:\n")) if (nrow(localDF) > 0) { for (i in 1 : ncol(localDF)) { @@ -2306,9 +2289,6 @@ setMethod("str", # Chop off extra characters if this is too long cat(substr(line, 1, MAX_CHAR_PER_ROW)) - if (ellipsis) { - cat(" ...") - } cat("\n") } @@ -2316,4 +2296,4 @@ setMethod("str", cat(paste0("\nDisplaying first ", ncol(localDF), " columns only.")) } } - }) + }) \ No newline at end of file diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 1137367d26e36..0984b737e0aba 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1732,14 +1732,14 @@ test_that("Method str()", { irisDF2 <- createDataFrame(sqlContext, iris2) out <- capture.output(str(irisDF2)) expect_equal(length(out), 7) - expect_equal(out[1], "'DataFrame': 150 obs. 
of 6 variables:") - expect_equal(out[2], " $ Sepal_Length: num 5.1 4.9 4.7 4.6 5 5.4 ...") - expect_equal(out[3], " $ Sepal_Width : num 3.5 3 3.2 3.1 3.6 3.9 ...") - expect_equal(out[4], " $ Petal_Length: num 1.4 1.4 1.3 1.5 1.4 1.7 ...") - expect_equal(out[5], " $ Petal_Width : num 0.2 0.2 0.2 0.2 0.2 0.4 ...") + expect_equal(out[1], "'DataFrame': 6 variables:") + expect_equal(out[2], " $ Sepal_Length: num 5.1 4.9 4.7 4.6 5 5.4") + expect_equal(out[3], " $ Sepal_Width : num 3.5 3 3.2 3.1 3.6 3.9") + expect_equal(out[4], " $ Petal_Length: num 1.4 1.4 1.3 1.5 1.4 1.7") + expect_equal(out[5], " $ Petal_Width : num 0.2 0.2 0.2 0.2 0.2 0.4") expect_equal(out[6], paste0(" $ Species : chr \"setosa\" \"setosa\" \"", - "setosa\" \"setosa\" \"setosa\" \"setosa\" ...")) - expect_equal(out[7], " $ col : logi TRUE TRUE TRUE TRUE TRUE TRUE ...") + "setosa\" \"setosa\" \"setosa\" \"setosa\"")) + expect_equal(out[7], " $ col : logi TRUE TRUE TRUE TRUE TRUE TRUE") # A random dataset with many columns. This test is to check str limits # the number of columns. Therefore, it will suffice to check for the From cfb85e4e850c6d76914f53829f4b63066967628a Mon Sep 17 00:00:00 2001 From: "Oscar D. 
Lara Yejas" Date: Thu, 17 Dec 2015 12:54:47 -0800 Subject: [PATCH 19/29] Updated str() tests --- R/pkg/inst/tests/testthat/test_sparkSQL.R | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 0984b737e0aba..555f088b7b84e 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1725,11 +1725,13 @@ test_that("Method coltypes() to get and set R's data types of a DataFrame", { "Only atomic type is supported for column types") }) +# Structure of Iris +iris2 <- iris +colnames(iris2) <- c("Sepal_Length", "Sepal_Width", "Petal_Length", "Petal_Width") +iris2$col <- TRUE +irisDF2 <- createDataFrame(sqlContext, iris2) + test_that("Method str()", { - # Structure of Iris - iris2 <- iris - iris2$col <- TRUE - irisDF2 <- createDataFrame(sqlContext, iris2) out <- capture.output(str(irisDF2)) expect_equal(length(out), 7) expect_equal(out[1], "'DataFrame': 6 variables:") From 0ac73841110e65881c2b492a140de3d42991f7c2 Mon Sep 17 00:00:00 2001 From: "Oscar D. 
Lara Yejas" Date: Thu, 17 Dec 2015 14:05:47 -0800 Subject: [PATCH 20/29] Fixed tests --- R/pkg/inst/tests/testthat/test_sparkSQL.R | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 555f088b7b84e..95b870994a0e2 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1726,12 +1726,13 @@ test_that("Method coltypes() to get and set R's data types of a DataFrame", { }) # Structure of Iris -iris2 <- iris -colnames(iris2) <- c("Sepal_Length", "Sepal_Width", "Petal_Length", "Petal_Width") -iris2$col <- TRUE -irisDF2 <- createDataFrame(sqlContext, iris2) test_that("Method str()", { + iris2 <- iris + colnames(iris2) <- c("Sepal_Length", "Sepal_Width", "Petal_Length", "Petal_Width", "Species") + iris2$col <- TRUE + irisDF2 <- createDataFrame(sqlContext, iris2) + out <- capture.output(str(irisDF2)) expect_equal(length(out), 7) expect_equal(out[1], "'DataFrame': 6 variables:") From a7141ccd3f8b0dae684e6c503fee6464130cbaf0 Mon Sep 17 00:00:00 2001 From: "Oscar D. 
Lara Yejas" Date: Thu, 17 Dec 2015 14:06:35 -0800 Subject: [PATCH 21/29] Fixed tests --- R/pkg/inst/tests/testthat/test_sparkSQL.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 95b870994a0e2..8d282e537de90 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1725,14 +1725,13 @@ test_that("Method coltypes() to get and set R's data types of a DataFrame", { "Only atomic type is supported for column types") }) -# Structure of Iris - test_that("Method str()", { + # Structure of Iris iris2 <- iris colnames(iris2) <- c("Sepal_Length", "Sepal_Width", "Petal_Length", "Petal_Width", "Species") iris2$col <- TRUE irisDF2 <- createDataFrame(sqlContext, iris2) - + out <- capture.output(str(irisDF2)) expect_equal(length(out), 7) expect_equal(out[1], "'DataFrame': 6 variables:") From 74c9651b22527385590f9940db83f60d55ae89b0 Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Thu, 17 Dec 2015 17:08:16 -0800 Subject: [PATCH 22/29] Removed unnecessary initialization --- R/pkg/R/DataFrame.R | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index eacbf3143404c..71afec2f069d5 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2263,13 +2263,12 @@ setMethod("str", if (nrow(localDF) > 0) { for (i in 1 : ncol(localDF)) { - firstElements <- "" - # Get the first elements for each column - if (types[i] == "character") { - firstElements <- paste(paste0("\"", localDF[,i], "\""), collapse = " ") + + firstElements <- if (types[i] == "character") { + paste(paste0("\"", localDF[,i], "\""), collapse = " ") } else { - firstElements <- paste(localDF[,i], collapse = " ") + paste(localDF[,i], collapse = " ") } # Add the corresponding number of spaces for alignment From 14289250da08190f2296cc9e9b0d33b65eb2ae4f Mon Sep 17 00:00:00 2001 From: "Oscar D. 
Lara Yejas" Date: Fri, 18 Dec 2015 11:07:18 -0800 Subject: [PATCH 23/29] Added back transform() --- R/pkg/R/generics.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 82159040220a3..dc43d84d0a4a5 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -531,6 +531,10 @@ setGeneric("saveAsTable", function(df, tableName, source, mode, ...) { #' @export setGeneric("str") +#' @rdname withColumn +#' @export +setGeneric("transform", function(`_data`, ...) {standardGeneric("transform") }) + #' @rdname write.df #' @export setGeneric("saveDF", function(df, path, ...) { standardGeneric("saveDF") }) From 40a52025f378cca577e638dc1bbc4cf4a56438df Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Fri, 18 Dec 2015 12:11:03 -0800 Subject: [PATCH 24/29] Removed space --- R/pkg/R/DataFrame.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 71afec2f069d5..2382159cc6520 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2264,11 +2264,11 @@ setMethod("str", if (nrow(localDF) > 0) { for (i in 1 : ncol(localDF)) { # Get the first elements for each column - + firstElements <- if (types[i] == "character") { paste(paste0("\"", localDF[,i], "\""), collapse = " ") } else { - paste(localDF[,i], collapse = " ") + firstElements <- paste(localDF[,i], collapse = " ") } # Add the corresponding number of spaces for alignment From 5bdf3f9b3586cf1edabc19a51e49f1b04421b749 Mon Sep 17 00:00:00 2001 From: "Oscar D. 
Lara Yejas" Date: Fri, 18 Dec 2015 12:13:48 -0800 Subject: [PATCH 25/29] Removed space --- R/pkg/R/DataFrame.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 2382159cc6520..765196ca215c9 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2260,7 +2260,7 @@ setMethod("str", # The number of observations will not be displayed as computing the # number of rows is a very expensive operation cat(paste0("'", class(object), "': ", length(names), " variables:\n")) - + if (nrow(localDF) > 0) { for (i in 1 : ncol(localDF)) { # Get the first elements for each column @@ -2268,7 +2268,7 @@ setMethod("str", firstElements <- if (types[i] == "character") { paste(paste0("\"", localDF[,i], "\""), collapse = " ") } else { - firstElements <- paste(localDF[,i], collapse = " ") + paste(localDF[,i], collapse = " ") } # Add the corresponding number of spaces for alignment From 38c21f34f110cebda937a2923cfd21e7715c1d39 Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Fri, 18 Dec 2015 14:10:48 -0800 Subject: [PATCH 26/29] Removed space --- R/pkg/R/DataFrame.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 765196ca215c9..ecd3bb79300a9 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2260,7 +2260,7 @@ setMethod("str", # The number of observations will not be displayed as computing the # number of rows is a very expensive operation cat(paste0("'", class(object), "': ", length(names), " variables:\n")) - + if (nrow(localDF) > 0) { for (i in 1 : ncol(localDF)) { # Get the first elements for each column From 2701898aa73e790ebb4122f2a18939baa9244e75 Mon Sep 17 00:00:00 2001 From: "Oscar D. 
Lara Yejas" Date: Tue, 12 Jan 2016 10:30:13 -0800 Subject: [PATCH 27/29] Removed duplicate with() declaration and re-ordered generics for DataFrame --- R/pkg/R/generics.R | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index dc43d84d0a4a5..4806558e658a5 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -388,6 +388,14 @@ setGeneric("agg", function (x, ...) { standardGeneric("agg") }) #' @export setGeneric("arrange", function(x, col, ...) { standardGeneric("arrange") }) +#' @rdname as.data.frame +#' @export +setGeneric("as.data.frame") + +#' @rdname attach +#' @export +setGeneric("attach") + #' @rdname columns #' @export setGeneric("colnames", function(x, do.NULL = TRUE, prefix = "col") { standardGeneric("colnames") }) @@ -531,7 +539,7 @@ setGeneric("saveAsTable", function(df, tableName, source, mode, ...) { #' @export setGeneric("str") -#' @rdname withColumn +#' @rdname transform #' @export setGeneric("transform", function(`_data`, ...) {standardGeneric("transform") }) @@ -1109,15 +1117,3 @@ setGeneric("predict", function(object, ...) { standardGeneric("predict") }) #' @rdname rbind #' @export setGeneric("rbind", signature = "...") - -#' @rdname as.data.frame -#' @export -setGeneric("as.data.frame") - -#' @rdname attach -#' @export -setGeneric("attach") - -#' @rdname with -#' @export -setGeneric("with") From 2a8115d2ea22ce1bfa69e234dae575cc3d6cf94f Mon Sep 17 00:00:00 2001 From: "Oscar D. 
Lara Yejas" Date: Tue, 12 Jan 2016 10:36:39 -0800 Subject: [PATCH 28/29] Made spacing uniform for types.R --- R/pkg/R/types.R | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/R/pkg/R/types.R b/R/pkg/R/types.R index 537b1bd277e67..ad048b1cd1795 100644 --- a/R/pkg/R/types.R +++ b/R/pkg/R/types.R @@ -48,22 +48,22 @@ COMPLEX_TYPES <- list( DATA_TYPES <- as.environment(c(as.list(PRIMITIVE_TYPES), COMPLEX_TYPES)) SHORT_TYPES <- as.environment(list( - "character"="chr", - "logical"="logi", - "POSIXct"="POSIXct", - "integer"="int", - "numeric"="num", - "raw"="raw", - "Date"="Date", - "map"="map", - "array"="array", - "struct"="struct" + "character" = "chr", + "logical" = "logi", + "POSIXct" = "POSIXct", + "integer" = "int", + "numeric" = "num", + "raw" = "raw", + "Date" = "Date", + "map" = "map", + "array" = "array", + "struct" = "struct" )) # An environment for mapping R to Scala, names are R types and values are Scala types. rToSQLTypes <- as.environment(list( - "integer" = "integer", # in R, integer is 32bit - "numeric" = "double", # in R, numeric == double which is 64bit - "double" = "double", + "integer" = "integer", # in R, integer is 32bit + "numeric" = "double", # in R, numeric == double which is 64bit + "double" = "double", "character" = "string", - "logical" = "boolean")) + "logical" = "boolean")) From 0ffcb4f1f05c1ae8ba5150ebeb250310409b88e8 Mon Sep 17 00:00:00 2001 From: "Oscar D. Lara Yejas" Date: Wed, 13 Jan 2016 11:24:23 -0800 Subject: [PATCH 29/29] Changed rdname of transform for mutate --- R/pkg/R/generics.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 4806558e658a5..98e32f2ebcd61 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -539,7 +539,7 @@ setGeneric("saveAsTable", function(df, tableName, source, mode, ...) 
{ #' @export setGeneric("str") -#' @rdname transform +#' @rdname mutate #' @export setGeneric("transform", function(`_data`, ...) {standardGeneric("transform") })