diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 05b94adeeb93..000000000000 --- a/.travis.yml +++ /dev/null @@ -1,50 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Spark provides this Travis CI configuration file to help contributors -# check Scala/Java style conformance and JDK7/8 compilation easily -# during their preparing pull requests. -# - Scalastyle is executed during `maven install` implicitly. -# - Java Checkstyle is executed by `lint-java`. -# See the related discussion here. -# https://github.com/apache/spark/pull/12980 - -# 1. Choose OS (Ubuntu 14.04.3 LTS Server Edition 64bit, ~2 CORE, 7.5GB RAM) -sudo: required -dist: trusty - -# 2. Choose language and target JDKs for parallel builds. -language: java -jdk: - - oraclejdk8 - -# 3. Setup cache directory for SBT and Maven. -cache: - directories: - - $HOME/.sbt - - $HOME/.m2 - -# 4. Turn off notifications. -notifications: - email: false - -# 5. Run maven install before running lint-java. -install: - - export MAVEN_SKIP_RC=1 - - build/mvn -T 4 -q -DskipTests -Pkubernetes -Pmesos -Pyarn -Pkinesis-asl -Phive -Phive-thriftserver install - -# 6. Run lint-java. -script: - - dev/lint-java diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index f52d785e05cd..cdaaa6104e6a 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,6 +1,6 @@ Package: SparkR Type: Package -Version: 2.4.0 +Version: 3.0.0 Title: R Frontend for Apache Spark Description: Provides an R Frontend for Apache Spark. Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 4f2d4c7c002d..34691883bc5a 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -503,7 +503,6 @@ setMethod("createOrReplaceTempView", #' @param x A SparkDataFrame #' @param tableName A character vector containing the name of the table #' -#' @family SparkDataFrame functions #' @seealso \link{createOrReplaceTempView} #' @rdname registerTempTable-deprecated #' @name registerTempTable @@ -2955,6 +2954,9 @@ setMethod("exceptAll", #' @param source a name for external data source. #' @param mode one of 'append', 'overwrite', 'error', 'errorifexists', 'ignore' #' save mode (it is 'error' by default) +#' @param partitionBy a name or a list of names of columns to partition the output by on the file +#' system. If specified, the output is laid out on the file system similar +#' to Hive's partitioning scheme. #' @param ... additional argument(s) passed to the method. 
#' #' @family SparkDataFrame functions @@ -2966,13 +2968,13 @@ setMethod("exceptAll", #' sparkR.session() #' path <- "path/to/file.json" #' df <- read.json(path) -#' write.df(df, "myfile", "parquet", "overwrite") +#' write.df(df, "myfile", "parquet", "overwrite", partitionBy = c("col1", "col2")) #' saveDF(df, parquetPath2, "parquet", mode = "append", mergeSchema = TRUE) #' } #' @note write.df since 1.4.0 setMethod("write.df", signature(df = "SparkDataFrame"), - function(df, path = NULL, source = NULL, mode = "error", ...) { + function(df, path = NULL, source = NULL, mode = "error", partitionBy = NULL, ...) { if (!is.null(path) && !is.character(path)) { stop("path should be character, NULL or omitted.") } @@ -2986,8 +2988,18 @@ setMethod("write.df", if (is.null(source)) { source <- getDefaultSqlSource() } + cols <- NULL + if (!is.null(partitionBy)) { + if (!all(sapply(partitionBy, function(c) is.character(c)))) { + stop("All partitionBy column names should be characters.") + } + cols <- as.list(partitionBy) + } write <- callJMethod(df@sdf, "write") write <- callJMethod(write, "format", source) + if (!is.null(cols)) { + write <- callJMethod(write, "partitionBy", cols) + } write <- setWriteOptions(write, path = path, mode = mode, ...) write <- handledCallJMethod(write, "save") }) @@ -3986,7 +3998,17 @@ setMethod("hint", signature(x = "SparkDataFrame", name = "character"), function(x, name, ...) { parameters <- list(...) - stopifnot(all(sapply(parameters, is.character))) + if (!all(sapply(parameters, function(y) { + if (is.character(y) || is.numeric(y)) { + TRUE + } else if (is.list(y)) { + all(sapply(y, function(z) { is.character(z) || is.numeric(z) })) + } else { + FALSE + } + }))) { + stop("sql hint should be character, numeric, or list with character or numeric.") + } jdf <- callJMethod(x@sdf, "hint", name, parameters) dataFrame(jdf) }) diff --git a/R/pkg/R/catalog.R b/R/pkg/R/catalog.R index baf4d861fcf8..c2d0fc38786b 100644 --- a/R/pkg/R/catalog.R +++ b/R/pkg/R/catalog.R @@ -69,7 +69,6 @@ createExternalTable <- function(x, ...) { #' @param ... additional named parameters as options for the data source. #' @return A SparkDataFrame. #' @rdname createTable -#' @seealso \link{createExternalTable} #' @examples #'\dontrun{ #' sparkR.session() diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R index f168ca76b600..e99136723f65 100644 --- a/R/pkg/R/context.R +++ b/R/pkg/R/context.R @@ -167,18 +167,30 @@ parallelize <- function(sc, coll, numSlices = 1) { # 2-tuples of raws serializedSlices <- lapply(slices, serialize, connection = NULL) - # The PRC backend cannot handle arguments larger than 2GB (INT_MAX) + # The RPC backend cannot handle arguments larger than 2GB (INT_MAX) # If serialized data is safely less than that threshold we send it over the PRC channel. 
# Otherwise, we write it to a file and send the file name if (objectSize < sizeLimit) { jrdd <- callJStatic("org.apache.spark.api.r.RRDD", "createRDDFromArray", sc, serializedSlices) } else { - fileName <- writeToTempFile(serializedSlices) - jrdd <- tryCatch(callJStatic( - "org.apache.spark.api.r.RRDD", "createRDDFromFile", sc, fileName, as.integer(numSlices)), - finally = { - file.remove(fileName) - }) + if (callJStatic("org.apache.spark.api.r.RUtils", "getEncryptionEnabled", sc)) { + # the length of slices here is the parallelism to use in the jvm's sc.parallelize() + parallelism <- as.integer(numSlices) + jserver <- newJObject("org.apache.spark.api.r.RParallelizeServer", sc, parallelism) + authSecret <- callJMethod(jserver, "secret") + port <- callJMethod(jserver, "port") + conn <- socketConnection(port = port, blocking = TRUE, open = "wb", timeout = 1500) + doServerAuth(conn, authSecret) + writeToConnection(serializedSlices, conn) + jrdd <- callJMethod(jserver, "getResult") + } else { + fileName <- writeToTempFile(serializedSlices) + jrdd <- tryCatch(callJStatic( + "org.apache.spark.api.r.RRDD", "createRDDFromFile", sc, fileName, as.integer(numSlices)), + finally = { + file.remove(fileName) + }) + } } RDD(jrdd, "byte") @@ -194,14 +206,21 @@ getMaxAllocationLimit <- function(sc) { )) } +writeToConnection <- function(serializedSlices, conn) { + tryCatch({ + for (slice in serializedSlices) { + writeBin(as.integer(length(slice)), conn, endian = "big") + writeBin(slice, conn, endian = "big") + } + }, finally = { + close(conn) + }) +} + writeToTempFile <- function(serializedSlices) { fileName <- tempfile() conn <- file(fileName, "wb") - for (slice in serializedSlices) { - writeBin(as.integer(length(slice)), conn, endian = "big") - writeBin(slice, conn, endian = "big") - } - close(conn) + writeToConnection(serializedSlices, conn) fileName } diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 572dee50127b..6a8fef5aa7b2 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -198,8 +198,9 @@ NULL #' } #' @param ... additional argument(s). In \code{to_json} and \code{from_json}, this contains #' additional named properties to control how it is converted, accepts the same -#' options as the JSON data source. In \code{arrays_zip}, this contains additional -#' Columns of arrays to be merged. +#' options as the JSON data source. Additionally \code{to_json} supports the "pretty" +#' option which enables pretty JSON generation. In \code{arrays_zip}, this contains +#' additional Columns of arrays to be merged. #' @name column_collection_functions #' @rdname column_collection_functions #' @family collection functions @@ -2203,9 +2204,16 @@ setMethod("from_json", signature(x = "Column", schema = "characterOrstructType") }) #' @details -#' \code{from_utc_timestamp}: Given a timestamp like '2017-07-14 02:40:00.0', interprets it as a -#' time in UTC, and renders that time as a timestamp in the given time zone. For example, 'GMT+1' -#' would yield '2017-07-14 03:40:00.0'. +#' \code{from_utc_timestamp}: This is a common function for databases supporting TIMESTAMP WITHOUT +#' TIMEZONE. This function takes a timestamp which is timezone-agnostic, and interprets it as a +#' timestamp in UTC, and renders that timestamp as a timestamp in the given time zone. +#' However, a timestamp in Spark represents the number of microseconds from the Unix epoch, which is not +#' timezone-agnostic. So in Spark this function just shifts the timestamp value from the UTC timezone to +#' the given timezone.
+#' This function may return a confusing result if the input is a string with a timezone, e.g. +#' (\code{2018-03-13T06:18:23+00:00}). The reason is that Spark first casts the string to a +#' timestamp according to the timezone in the string, and then displays the result by converting +#' the timestamp to a string according to the session local timezone. #' #' @rdname column_datetime_diff_functions #' @@ -2261,9 +2269,16 @@ setMethod("next_day", signature(y = "Column", x = "character"), }) #' @details -#' \code{to_utc_timestamp}: Given a timestamp like '2017-07-14 02:40:00.0', interprets it as a -#' time in the given time zone, and renders that time as a timestamp in UTC. For example, 'GMT+1' -#' would yield '2017-07-14 01:40:00.0'. +#' \code{to_utc_timestamp}: This is a common function for databases supporting TIMESTAMP WITHOUT +#' TIMEZONE. This function takes a timestamp which is timezone-agnostic, and interprets it as a +#' timestamp in the given timezone, and renders that timestamp as a timestamp in UTC. +#' However, a timestamp in Spark represents the number of microseconds from the Unix epoch, which is not +#' timezone-agnostic. So in Spark this function just shifts the timestamp value from the given +#' timezone to the UTC timezone. +#' This function may return a confusing result if the input is a string with a timezone, e.g. +#' (\code{2018-03-13T06:18:23+00:00}). The reason is that Spark first casts the string to a +#' timestamp according to the timezone in the string, and then displays the result by converting +#' the timestamp to a string according to the session local timezone. #' #' @rdname column_datetime_diff_functions #' @aliases to_utc_timestamp to_utc_timestamp,Column,character-method @@ -3458,13 +3473,21 @@ setMethod("collect_set", #' @details #' \code{split_string}: Splits string on regular expression. -#' Equivalent to \code{split} SQL function. +#' Equivalent to \code{split} SQL function. Optionally a +#' \code{limit} can be specified. #' #' @rdname column_string_functions +#' @param limit determines the length of the returned array. +#' \itemize{ +#' \item \code{limit > 0}: length of the array will be at most \code{limit} +#' \item \code{limit <= 0}: the returned array can have any length +#' } +#' #' @aliases split_string split_string,Column-method #' @examples #' #' \dontrun{ +#' head(select(df, split_string(df$Class, "\\d", 2))) #' head(select(df, split_string(df$Sex, "a"))) #' head(select(df, split_string(df$Class, "\\d"))) #' # This is equivalent to the following SQL expression @@ -3472,8 +3495,9 @@ setMethod("collect_set", #' @note split_string 2.3.0 setMethod("split_string", signature(x = "Column", pattern = "character"), - function(x, pattern) { - jc <- callJStatic("org.apache.spark.sql.functions", "split", x@jc, pattern) + function(x, pattern, limit = -1) { + jc <- callJStatic("org.apache.spark.sql.functions", + "split", x@jc, pattern, as.integer(limit)) column(jc) }) diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 27c1b312d645..697d124095a7 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -1258,7 +1258,7 @@ setGeneric("sort_array", function(x, asc = TRUE) { standardGeneric("sort_array") #' @rdname column_string_functions #' @name NULL -setGeneric("split_string", function(x, pattern) { standardGeneric("split_string") }) +setGeneric("split_string", function(x, pattern, ...)
{ standardGeneric("split_string") }) #' @rdname column_string_functions #' @name NULL diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index d3a9cbae7d80..038fefadaaef 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -626,6 +626,8 @@ sparkConfToSubmitOps[["spark.driver.extraLibraryPath"]] <- "--driver-library-pat sparkConfToSubmitOps[["spark.master"]] <- "--master" sparkConfToSubmitOps[["spark.yarn.keytab"]] <- "--keytab" sparkConfToSubmitOps[["spark.yarn.principal"]] <- "--principal" +sparkConfToSubmitOps[["spark.kerberos.keytab"]] <- "--keytab" +sparkConfToSubmitOps[["spark.kerberos.principal"]] <- "--principal" # Utility function that returns Spark Submit arguments as a string diff --git a/R/pkg/tests/fulltests/test_Serde.R b/R/pkg/tests/fulltests/test_Serde.R index 3577929323b8..1525bdb2f5c8 100644 --- a/R/pkg/tests/fulltests/test_Serde.R +++ b/R/pkg/tests/fulltests/test_Serde.R @@ -124,3 +124,35 @@ test_that("SerDe of list of lists", { }) sparkR.session.stop() + +# Note that this test should be at the end of tests since the configurations used here are not +# specific to sessions, and the Spark context is restarted. +test_that("createDataFrame large objects", { + for (encryptionEnabled in list("true", "false")) { + # To simulate a large object scenario, we set spark.r.maxAllocationLimit to a smaller value + conf <- list(spark.r.maxAllocationLimit = "100", + spark.io.encryption.enabled = encryptionEnabled) + + suppressWarnings(sparkR.session(master = sparkRTestMaster, + sparkConfig = conf, + enableHiveSupport = FALSE)) + + sc <- getSparkContext() + actual <- callJStatic("org.apache.spark.api.r.RUtils", "getEncryptionEnabled", sc) + expected <- as.logical(encryptionEnabled) + expect_equal(actual, expected) + + tryCatch({ + # suppress warnings from dot in the field names. See also SPARK-21536.
+ df <- suppressWarnings(createDataFrame(iris, numPartitions = 3)) + expect_equal(getNumPartitions(df), 3) + expect_equal(dim(df), dim(iris)) + + df <- createDataFrame(cars, numPartitions = 3) + expect_equal(collect(df), cars) + }, + finally = { + sparkR.stop() + }) + } +}) diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index 0c4bdb31b027..5cc75aa3f367 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -316,18 +316,6 @@ test_that("create DataFrame from RDD", { unsetHiveContext() }) -test_that("createDataFrame uses files for large objects", { - # To simulate a large file scenario, we set spark.r.maxAllocationLimit to a smaller value - conf <- callJMethod(sparkSession, "conf") - callJMethod(conf, "set", "spark.r.maxAllocationLimit", "100") - df <- suppressWarnings(createDataFrame(iris, numPartitions = 3)) - expect_equal(getNumPartitions(df), 3) - - # Resetting the conf back to default value - callJMethod(conf, "set", "spark.r.maxAllocationLimit", toString(.Machine$integer.max / 10)) - expect_equal(dim(df), dim(iris)) -}) - test_that("read/write csv as DataFrame", { if (windows_with_hadoop()) { csvPath <- tempfile(pattern = "sparkr-test", fileext = ".csv") @@ -1831,6 +1819,14 @@ test_that("string operators", { collect(select(df4, split_string(df4$a, "\\\\")))[1, 1], list(list("a.b@c.d 1", "b")) ) + expect_equal( + collect(select(df4, split_string(df4$a, "\\.", 2)))[1, 1], + list(list("a", "b@c.d 1\\b")) + ) + expect_equal( + collect(select(df4, split_string(df4$a, "b", 0)))[1, 1], + list(list("a.", "@c.d 1\\", "")) + ) l5 <- list(list(a = "abc")) df5 <- createDataFrame(l5) @@ -2419,6 +2415,15 @@ test_that("join(), crossJoin() and merge() on a DataFrame", { expect_true(any(grepl("BroadcastHashJoin", execution_plan_broadcast))) }) +test_that("test hint", { + df <- sql("SELECT * FROM range(10e10)") + hintList <- list("hint2", "hint3", "hint4") + execution_plan_hint <- capture.output( + explain(hint(df, "hint1", 1.23456, "aaaaaaaaaa", hintList), TRUE) + ) + expect_true(any(grepl("1.23456, aaaaaaaaaa", execution_plan_hint))) +}) + test_that("toJSON() on DataFrame", { df <- as.DataFrame(cars) df_json <- toJSON(df) @@ -2704,8 +2709,16 @@ test_that("read/write text files", { expect_equal(colnames(df2), c("value")) expect_equal(count(df2), count(df) * 2) + df3 <- createDataFrame(list(list(1L, "1"), list(2L, "2"), list(1L, "1"), list(2L, "2")), + schema = c("key", "value")) + textPath3 <- tempfile(pattern = "textPath3", fileext = ".txt") + write.df(df3, textPath3, "text", mode = "overwrite", partitionBy = "key") + df4 <- read.df(textPath3, "text") + expect_equal(count(df3), count(df4)) + unlink(textPath) unlink(textPath2) + unlink(textPath3) }) test_that("read/write text files - compression option", { diff --git a/R/pkg/tests/run-all.R b/R/pkg/tests/run-all.R index 94d75188fb94..1e9641855888 100644 --- a/R/pkg/tests/run-all.R +++ b/R/pkg/tests/run-all.R @@ -18,50 +18,55 @@ library(testthat) library(SparkR) -# Turn all warnings into errors -options("warn" = 2) +# SPARK-25572 +if (identical(Sys.getenv("NOT_CRAN"), "true")) { -if (.Platform$OS.type == "windows") { - Sys.setenv(TZ = "GMT") -} + # Turn all warnings into errors + options("warn" = 2) -# Setup global test environment -# Install Spark first to set SPARK_HOME + if (.Platform$OS.type == "windows") { + Sys.setenv(TZ = "GMT") + } -# NOTE(shivaram): We set overwrite to handle any old tar.gz files or directories left behind on -# CRAN machines. 
For Jenkins we should already have SPARK_HOME set. -install.spark(overwrite = TRUE) + # Setup global test environment + # Install Spark first to set SPARK_HOME -sparkRDir <- file.path(Sys.getenv("SPARK_HOME"), "R") -sparkRWhitelistSQLDirs <- c("spark-warehouse", "metastore_db") -invisible(lapply(sparkRWhitelistSQLDirs, - function(x) { unlink(file.path(sparkRDir, x), recursive = TRUE, force = TRUE)})) -sparkRFilesBefore <- list.files(path = sparkRDir, all.files = TRUE) + # NOTE(shivaram): We set overwrite to handle any old tar.gz files or directories left behind on + # CRAN machines. For Jenkins we should already have SPARK_HOME set. + install.spark(overwrite = TRUE) -sparkRTestMaster <- "local[1]" -sparkRTestConfig <- list() -if (identical(Sys.getenv("NOT_CRAN"), "true")) { - sparkRTestMaster <- "" -} else { - # Disable hsperfdata on CRAN - old_java_opt <- Sys.getenv("_JAVA_OPTIONS") - Sys.setenv("_JAVA_OPTIONS" = paste("-XX:-UsePerfData", old_java_opt)) - tmpDir <- tempdir() - tmpArg <- paste0("-Djava.io.tmpdir=", tmpDir) - sparkRTestConfig <- list(spark.driver.extraJavaOptions = tmpArg, - spark.executor.extraJavaOptions = tmpArg) -} + sparkRDir <- file.path(Sys.getenv("SPARK_HOME"), "R") + sparkRWhitelistSQLDirs <- c("spark-warehouse", "metastore_db") + invisible(lapply(sparkRWhitelistSQLDirs, + function(x) { unlink(file.path(sparkRDir, x), recursive = TRUE, force = TRUE)})) + sparkRFilesBefore <- list.files(path = sparkRDir, all.files = TRUE) -test_package("SparkR") + sparkRTestMaster <- "local[1]" + sparkRTestConfig <- list() + if (identical(Sys.getenv("NOT_CRAN"), "true")) { + sparkRTestMaster <- "" + } else { + # Disable hsperfdata on CRAN + old_java_opt <- Sys.getenv("_JAVA_OPTIONS") + Sys.setenv("_JAVA_OPTIONS" = paste("-XX:-UsePerfData", old_java_opt)) + tmpDir <- tempdir() + tmpArg <- paste0("-Djava.io.tmpdir=", tmpDir) + sparkRTestConfig <- list(spark.driver.extraJavaOptions = tmpArg, + spark.executor.extraJavaOptions = tmpArg) + } -if (identical(Sys.getenv("NOT_CRAN"), "true")) { - # set random seed for predictable results. mostly for base's sample() in tree and classification - set.seed(42) - # for testthat 1.0.2 later, change reporter from "summary" to default_reporter() - testthat:::run_tests("SparkR", - file.path(sparkRDir, "pkg", "tests", "fulltests"), - NULL, - "summary") -} + test_package("SparkR") + + if (identical(Sys.getenv("NOT_CRAN"), "true")) { + # set random seed for predictable results. 
mostly for base's sample() in tree and classification + set.seed(42) + # for testthat 1.0.2 later, change reporter from "summary" to default_reporter() + testthat:::run_tests("SparkR", + file.path(sparkRDir, "pkg", "tests", "fulltests"), + NULL, + "summary") + } -SparkR:::uninstallDownloadedSpark() + SparkR:::uninstallDownloadedSpark() + +} diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index 090363c5f8a3..ad934947437b 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -157,8 +157,8 @@ Property Name | Property group | spark-submit equivalent `spark.driver.extraClassPath` | Runtime Environment | `--driver-class-path` `spark.driver.extraJavaOptions` | Runtime Environment | `--driver-java-options` `spark.driver.extraLibraryPath` | Runtime Environment | `--driver-library-path` -`spark.yarn.keytab` | Application Properties | `--keytab` -`spark.yarn.principal` | Application Properties | `--principal` +`spark.kerberos.keytab` | Application Properties | `--keytab` +`spark.kerberos.principal` | Application Properties | `--principal` **For Windows users**: Due to different file prefixes across operating systems, to avoid the issue of potential wrong prefix, a current workaround is to specify `spark.sql.warehouse.dir` when starting the `SparkSession`. diff --git a/assembly/README b/assembly/README index affd281a1385..d5dafab47741 100644 --- a/assembly/README +++ b/assembly/README @@ -9,4 +9,4 @@ This module is off by default. To activate it specify the profile in the command If you need to build an assembly for a different version of Hadoop the hadoop-version system property needs to be set as in this example: - -Dhadoop.version=2.7.7 + -Dhadoop.version=2.7.3 diff --git a/assembly/pom.xml b/assembly/pom.xml index 9608c96fd536..b0337e58cca7 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../pom.xml diff --git a/bin/docker-image-tool.sh b/bin/docker-image-tool.sh index d6371051ef7f..228494de6d5a 100755 --- a/bin/docker-image-tool.sh +++ b/bin/docker-image-tool.sh @@ -54,6 +54,8 @@ function build { img_path=$IMG_PATH --build-arg spark_jars=assembly/target/scala-$SPARK_SCALA_VERSION/jars + --build-arg + k8s_tests=resource-managers/kubernetes/integration-tests/tests ) else # Not passed as an argument to docker, but used to validate the Spark directory. 
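(Aside on the vignette note above about Windows users: a minimal sketch of the workaround it describes, setting spark.sql.warehouse.dir when starting the SparkSession. The warehouse path below is hypothetical and not part of this patch.)

library(SparkR)
# Point the SQL warehouse at an explicit local directory to avoid the
# potential wrong-prefix issue on Windows; the path is only an example.
sparkR.session(
  master = "local[1]",
  sparkConfig = list(spark.sql.warehouse.dir = "C:/tmp/spark-warehouse"))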
diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index 8c148359c302..23a0f4920690 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../../pom.xml diff --git a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/KVStoreSerializer.java b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/KVStoreSerializer.java index bd8d9486acde..771a9541bb34 100644 --- a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/KVStoreSerializer.java +++ b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/KVStoreSerializer.java @@ -54,11 +54,8 @@ public final byte[] serialize(Object o) throws Exception { return ((String) o).getBytes(UTF_8); } else { ByteArrayOutputStream bytes = new ByteArrayOutputStream(); - GZIPOutputStream out = new GZIPOutputStream(bytes); - try { + try (GZIPOutputStream out = new GZIPOutputStream(bytes)) { mapper.writeValue(out, o); - } finally { - out.close(); } return bytes.toByteArray(); } @@ -69,11 +66,8 @@ public final T deserialize(byte[] data, Class klass) throws Exception { if (klass.equals(String.class)) { return (T) new String(data, UTF_8); } else { - GZIPInputStream in = new GZIPInputStream(new ByteArrayInputStream(data)); - try { + try (GZIPInputStream in = new GZIPInputStream(new ByteArrayInputStream(data))) { return mapper.readValue(in, klass); - } finally { - in.close(); } } } diff --git a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/LevelDBSuite.java b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/LevelDBSuite.java index 205f7df87c5b..39a952f2b0df 100644 --- a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/LevelDBSuite.java +++ b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/LevelDBSuite.java @@ -217,7 +217,7 @@ public void testSkip() throws Exception { public void testNegativeIndexValues() throws Exception { List expected = Arrays.asList(-100, -50, 0, 50, 100); - expected.stream().forEach(i -> { + expected.forEach(i -> { try { db.write(createCustomType1(i)); } catch (Exception e) { diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index 8ca7733507f1..41fcbf058949 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../../pom.xml diff --git a/common/network-common/src/main/java/org/apache/spark/network/TransportContext.java b/common/network-common/src/main/java/org/apache/spark/network/TransportContext.java index ae91bc9cfdd0..480b52652de5 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/TransportContext.java +++ b/common/network-common/src/main/java/org/apache/spark/network/TransportContext.java @@ -21,6 +21,8 @@ import java.util.List; import io.netty.channel.Channel; +import io.netty.channel.ChannelPipeline; +import io.netty.channel.EventLoopGroup; import io.netty.channel.socket.SocketChannel; import io.netty.handler.timeout.IdleStateHandler; import org.slf4j.Logger; @@ -32,11 +34,13 @@ import org.apache.spark.network.client.TransportResponseHandler; import org.apache.spark.network.protocol.MessageDecoder; import org.apache.spark.network.protocol.MessageEncoder; +import org.apache.spark.network.server.ChunkFetchRequestHandler; import org.apache.spark.network.server.RpcHandler; import org.apache.spark.network.server.TransportChannelHandler; import org.apache.spark.network.server.TransportRequestHandler; import 
org.apache.spark.network.server.TransportServer; import org.apache.spark.network.server.TransportServerBootstrap; +import org.apache.spark.network.util.IOMode; import org.apache.spark.network.util.NettyUtils; import org.apache.spark.network.util.TransportConf; import org.apache.spark.network.util.TransportFrameDecoder; @@ -61,6 +65,7 @@ public class TransportContext { private final TransportConf conf; private final RpcHandler rpcHandler; private final boolean closeIdleConnections; + private final boolean isClientOnly; /** * Force to create MessageEncoder and MessageDecoder so that we can make sure they will be created @@ -77,17 +82,54 @@ public class TransportContext { private static final MessageEncoder ENCODER = MessageEncoder.INSTANCE; private static final MessageDecoder DECODER = MessageDecoder.INSTANCE; + // Separate thread pool for handling ChunkFetchRequest. This helps to enable throttling + // max number of TransportServer worker threads that are blocked on writing response + // of ChunkFetchRequest message back to the client via the underlying channel. + private static EventLoopGroup chunkFetchWorkers; + public TransportContext(TransportConf conf, RpcHandler rpcHandler) { - this(conf, rpcHandler, false); + this(conf, rpcHandler, false, false); } public TransportContext( TransportConf conf, RpcHandler rpcHandler, boolean closeIdleConnections) { + this(conf, rpcHandler, closeIdleConnections, false); + } + + /** + * Enables TransportContext initialization for underlying client and server. + * + * @param conf TransportConf + * @param rpcHandler RpcHandler responsible for handling requests and responses. + * @param closeIdleConnections Close idle connections if it is set to true. + * @param isClientOnly This config indicates the TransportContext is only used by a client. + * This config is more important when external shuffle is enabled. + * It stops creating extra event loop and subsequent thread pool + * for shuffle clients to handle chunked fetch requests. + */ + public TransportContext( + TransportConf conf, + RpcHandler rpcHandler, + boolean closeIdleConnections, + boolean isClientOnly) { this.conf = conf; this.rpcHandler = rpcHandler; this.closeIdleConnections = closeIdleConnections; + this.isClientOnly = isClientOnly; + + synchronized(TransportContext.class) { + if (chunkFetchWorkers == null && + conf.getModuleName() != null && + conf.getModuleName().equalsIgnoreCase("shuffle") && + !isClientOnly) { + chunkFetchWorkers = NettyUtils.createEventLoop( + IOMode.valueOf(conf.ioMode()), + conf.chunkFetchHandlerThreads(), + "shuffle-chunk-fetch-handler"); + } + } } /** @@ -144,14 +186,23 @@ public TransportChannelHandler initializePipeline( RpcHandler channelRpcHandler) { try { TransportChannelHandler channelHandler = createChannelHandler(channel, channelRpcHandler); - channel.pipeline() + ChunkFetchRequestHandler chunkFetchHandler = + createChunkFetchHandler(channelHandler, channelRpcHandler); + ChannelPipeline pipeline = channel.pipeline() .addLast("encoder", ENCODER) .addLast(TransportFrameDecoder.HANDLER_NAME, NettyUtils.createFrameDecoder()) .addLast("decoder", DECODER) - .addLast("idleStateHandler", new IdleStateHandler(0, 0, conf.connectionTimeoutMs() / 1000)) + .addLast("idleStateHandler", + new IdleStateHandler(0, 0, conf.connectionTimeoutMs() / 1000)) // NOTE: Chunks are currently guaranteed to be returned in the order of request, but this // would require more logic to guarantee if this were not part of the same event loop. 
.addLast("handler", channelHandler); + // Use a separate EventLoopGroup to handle ChunkFetchRequest messages for shuffle rpcs. + if (conf.getModuleName() != null && + conf.getModuleName().equalsIgnoreCase("shuffle") + && !isClientOnly) { + pipeline.addLast(chunkFetchWorkers, "chunkFetchHandler", chunkFetchHandler); + } return channelHandler; } catch (RuntimeException e) { logger.error("Error while initializing Netty pipeline", e); @@ -173,5 +224,14 @@ private TransportChannelHandler createChannelHandler(Channel channel, RpcHandler conf.connectionTimeoutMs(), closeIdleConnections); } + /** + * Creates the dedicated ChannelHandler for ChunkFetchRequest messages. + */ + private ChunkFetchRequestHandler createChunkFetchHandler(TransportChannelHandler channelHandler, + RpcHandler rpcHandler) { + return new ChunkFetchRequestHandler(channelHandler.getClient(), + rpcHandler.getStreamManager(), conf.maxChunksBeingTransferred()); + } + public TransportConf getConf() { return conf; } } diff --git a/common/network-common/src/main/java/org/apache/spark/network/buffer/ManagedBuffer.java b/common/network-common/src/main/java/org/apache/spark/network/buffer/ManagedBuffer.java index 1861f8d7fd8f..2d573f512437 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/buffer/ManagedBuffer.java +++ b/common/network-common/src/main/java/org/apache/spark/network/buffer/ManagedBuffer.java @@ -36,7 +36,10 @@ */ public abstract class ManagedBuffer { - /** Number of bytes of the data. */ + /** + * Number of bytes of the data. If this buffer will decrypt for all of the views into the data, + * this is the size of the decrypted data. + */ public abstract long size(); /** diff --git a/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthEngine.java b/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthEngine.java index 056505ef5335..64fdb32a67ad 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthEngine.java +++ b/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthEngine.java @@ -159,15 +159,21 @@ public void close() throws IOException { // accurately report the errors when they happen. 
RuntimeException error = null; byte[] dummy = new byte[8]; - try { - doCipherOp(encryptor, dummy, true); - } catch (Exception e) { - error = new RuntimeException(e); + if (encryptor != null) { + try { + doCipherOp(Cipher.ENCRYPT_MODE, dummy, true); + } catch (Exception e) { + error = new RuntimeException(e); + } + encryptor = null; } - try { - doCipherOp(decryptor, dummy, true); - } catch (Exception e) { - error = new RuntimeException(e); + if (decryptor != null) { + try { + doCipherOp(Cipher.DECRYPT_MODE, dummy, true); + } catch (Exception e) { + error = new RuntimeException(e); + } + decryptor = null; } random.close(); @@ -189,11 +195,11 @@ byte[] rawResponse(byte[] challenge) { } private byte[] decrypt(byte[] in) throws GeneralSecurityException { - return doCipherOp(decryptor, in, false); + return doCipherOp(Cipher.DECRYPT_MODE, in, false); } private byte[] encrypt(byte[] in) throws GeneralSecurityException { - return doCipherOp(encryptor, in, false); + return doCipherOp(Cipher.ENCRYPT_MODE, in, false); } private void initializeForAuth(String cipher, byte[] nonce, SecretKeySpec key) @@ -205,11 +211,13 @@ private void initializeForAuth(String cipher, byte[] nonce, SecretKeySpec key) byte[] iv = new byte[conf.ivLength()]; System.arraycopy(nonce, 0, iv, 0, Math.min(nonce.length, iv.length)); - encryptor = CryptoCipherFactory.getCryptoCipher(cipher, cryptoConf); - encryptor.init(Cipher.ENCRYPT_MODE, key, new IvParameterSpec(iv)); + CryptoCipher _encryptor = CryptoCipherFactory.getCryptoCipher(cipher, cryptoConf); + _encryptor.init(Cipher.ENCRYPT_MODE, key, new IvParameterSpec(iv)); + this.encryptor = _encryptor; - decryptor = CryptoCipherFactory.getCryptoCipher(cipher, cryptoConf); - decryptor.init(Cipher.DECRYPT_MODE, key, new IvParameterSpec(iv)); + CryptoCipher _decryptor = CryptoCipherFactory.getCryptoCipher(cipher, cryptoConf); + _decryptor.init(Cipher.DECRYPT_MODE, key, new IvParameterSpec(iv)); + this.decryptor = _decryptor; } /** @@ -241,29 +249,52 @@ private SecretKeySpec generateKey(String kdf, int iterations, byte[] salt, int k return new SecretKeySpec(key.getEncoded(), conf.keyAlgorithm()); } - private byte[] doCipherOp(CryptoCipher cipher, byte[] in, boolean isFinal) + private byte[] doCipherOp(int mode, byte[] in, boolean isFinal) throws GeneralSecurityException { - Preconditions.checkState(cipher != null); + CryptoCipher cipher; + switch (mode) { + case Cipher.ENCRYPT_MODE: + cipher = encryptor; + break; + case Cipher.DECRYPT_MODE: + cipher = decryptor; + break; + default: + throw new IllegalArgumentException(String.valueOf(mode)); + } - int scale = 1; - while (true) { - int size = in.length * scale; - byte[] buffer = new byte[size]; - try { - int outSize = isFinal ? cipher.doFinal(in, 0, in.length, buffer, 0) - : cipher.update(in, 0, in.length, buffer, 0); - if (outSize != buffer.length) { - byte[] output = new byte[outSize]; - System.arraycopy(buffer, 0, output, 0, output.length); - return output; - } else { - return buffer; + Preconditions.checkState(cipher != null, "Cipher is invalid because of previous error."); + + try { + int scale = 1; + while (true) { + int size = in.length * scale; + byte[] buffer = new byte[size]; + try { + int outSize = isFinal ? 
cipher.doFinal(in, 0, in.length, buffer, 0) + : cipher.update(in, 0, in.length, buffer, 0); + if (outSize != buffer.length) { + byte[] output = new byte[outSize]; + System.arraycopy(buffer, 0, output, 0, output.length); + return output; + } else { + return buffer; + } + } catch (ShortBufferException e) { + // Try again with a bigger buffer. + scale *= 2; } - } catch (ShortBufferException e) { - // Try again with a bigger buffer. - scale *= 2; } + } catch (InternalError ie) { + // SPARK-25535. The commons-cryto library will throw InternalError if something goes wrong, + // and leave bad state behind in the Java wrappers, so it's not safe to use them afterwards. + if (mode == Cipher.ENCRYPT_MODE) { + this.encryptor = null; + } else { + this.decryptor = null; + } + throw ie; } } diff --git a/common/network-common/src/main/java/org/apache/spark/network/crypto/TransportCipher.java b/common/network-common/src/main/java/org/apache/spark/network/crypto/TransportCipher.java index b64e4b7a970b..2745052265f7 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/crypto/TransportCipher.java +++ b/common/network-common/src/main/java/org/apache/spark/network/crypto/TransportCipher.java @@ -107,45 +107,72 @@ public void addToChannel(Channel ch) throws IOException { private static class EncryptionHandler extends ChannelOutboundHandlerAdapter { private final ByteArrayWritableChannel byteChannel; private final CryptoOutputStream cos; + private boolean isCipherValid; EncryptionHandler(TransportCipher cipher) throws IOException { byteChannel = new ByteArrayWritableChannel(STREAM_BUFFER_SIZE); cos = cipher.createOutputStream(byteChannel); + isCipherValid = true; } @Override public void write(ChannelHandlerContext ctx, Object msg, ChannelPromise promise) throws Exception { - ctx.write(new EncryptedMessage(cos, msg, byteChannel), promise); + ctx.write(new EncryptedMessage(this, cos, msg, byteChannel), promise); } @Override public void close(ChannelHandlerContext ctx, ChannelPromise promise) throws Exception { try { - cos.close(); + if (isCipherValid) { + cos.close(); + } } finally { super.close(ctx, promise); } } + + /** + * SPARK-25535. Workaround for CRYPTO-141. Avoid further interaction with the underlying cipher + * after an error occurs. + */ + void reportError() { + this.isCipherValid = false; + } + + boolean isCipherValid() { + return isCipherValid; + } } private static class DecryptionHandler extends ChannelInboundHandlerAdapter { private final CryptoInputStream cis; private final ByteArrayReadableChannel byteChannel; + private boolean isCipherValid; DecryptionHandler(TransportCipher cipher) throws IOException { byteChannel = new ByteArrayReadableChannel(); cis = cipher.createInputStream(byteChannel); + isCipherValid = true; } @Override public void channelRead(ChannelHandlerContext ctx, Object data) throws Exception { + if (!isCipherValid) { + throw new IOException("Cipher is in invalid state."); + } byteChannel.feedData((ByteBuf) data); byte[] decryptedData = new byte[byteChannel.readableBytes()]; int offset = 0; while (offset < decryptedData.length) { - offset += cis.read(decryptedData, offset, decryptedData.length - offset); + // SPARK-25535: workaround for CRYPTO-141. 
+ try { + offset += cis.read(decryptedData, offset, decryptedData.length - offset); + } catch (InternalError ie) { + isCipherValid = false; + throw ie; + } } ctx.fireChannelRead(Unpooled.wrappedBuffer(decryptedData, 0, decryptedData.length)); @@ -154,7 +181,9 @@ public void channelRead(ChannelHandlerContext ctx, Object data) throws Exception @Override public void channelInactive(ChannelHandlerContext ctx) throws Exception { try { - cis.close(); + if (isCipherValid) { + cis.close(); + } } finally { super.channelInactive(ctx); } @@ -165,8 +194,9 @@ private static class EncryptedMessage extends AbstractFileRegion { private final boolean isByteBuf; private final ByteBuf buf; private final FileRegion region; + private final CryptoOutputStream cos; + private final EncryptionHandler handler; private long transferred; - private CryptoOutputStream cos; // Due to streaming issue CRYPTO-125: https://issues.apache.org/jira/browse/CRYPTO-125, it has // to utilize two helper ByteArrayWritableChannel for streaming. One is used to receive raw data @@ -176,9 +206,14 @@ private static class EncryptedMessage extends AbstractFileRegion { private ByteBuffer currentEncrypted; - EncryptedMessage(CryptoOutputStream cos, Object msg, ByteArrayWritableChannel ch) { + EncryptedMessage( + EncryptionHandler handler, + CryptoOutputStream cos, + Object msg, + ByteArrayWritableChannel ch) { Preconditions.checkArgument(msg instanceof ByteBuf || msg instanceof FileRegion, "Unrecognized message type: %s", msg.getClass().getName()); + this.handler = handler; this.isByteBuf = msg instanceof ByteBuf; this.buf = isByteBuf ? (ByteBuf) msg : null; this.region = isByteBuf ? null : (FileRegion) msg; @@ -261,6 +296,9 @@ public long transferTo(WritableByteChannel target, long position) throws IOExcep } private void encryptMore() throws IOException { + if (!handler.isCipherValid()) { + throw new IOException("Cipher is in invalid state."); + } byteRawChannel.reset(); if (isByteBuf) { @@ -269,8 +307,14 @@ private void encryptMore() throws IOException { } else { region.transferTo(byteRawChannel, region.transferred()); } - cos.write(byteRawChannel.getData(), 0, byteRawChannel.length()); - cos.flush(); + + try { + cos.write(byteRawChannel.getData(), 0, byteRawChannel.length()); + cos.flush(); + } catch (InternalError ie) { + handler.reportError(); + throw ie; + } currentEncrypted = ByteBuffer.wrap(byteEncChannel.getData(), 0, byteEncChannel.length()); diff --git a/common/network-common/src/main/java/org/apache/spark/network/server/ChunkFetchRequestHandler.java b/common/network-common/src/main/java/org/apache/spark/network/server/ChunkFetchRequestHandler.java new file mode 100644 index 000000000000..f08d8b0f984c --- /dev/null +++ b/common/network-common/src/main/java/org/apache/spark/network/server/ChunkFetchRequestHandler.java @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.network.server; + +import java.net.SocketAddress; + +import com.google.common.base.Throwables; +import io.netty.channel.Channel; +import io.netty.channel.ChannelFuture; +import io.netty.channel.ChannelFutureListener; +import io.netty.channel.ChannelHandlerContext; +import io.netty.channel.SimpleChannelInboundHandler; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.spark.network.buffer.ManagedBuffer; +import org.apache.spark.network.client.TransportClient; +import org.apache.spark.network.protocol.ChunkFetchFailure; +import org.apache.spark.network.protocol.ChunkFetchRequest; +import org.apache.spark.network.protocol.ChunkFetchSuccess; +import org.apache.spark.network.protocol.Encodable; + +import static org.apache.spark.network.util.NettyUtils.*; + +/** + * A dedicated ChannelHandler for processing ChunkFetchRequest messages. When sending the response + * to a ChunkFetchRequest message back to the client, the thread performing the I/O on the underlying + * channel could potentially be blocked due to disk contention. If several hundred clients + * send ChunkFetchRequest to the server at the same time, they could potentially occupy all + * threads of TransportServer's default EventLoopGroup waiting for disk reads before the block + * data can be sent back to the client as part of the ChunkFetchSuccess messages. As a + * result, no threads would be left to process other RPC messages, which take much less + * time to process, and this could lead to clients timing out on either performing SASL authentication, + * registering executors, or waiting for the response to an OpenBlocks message. + */ +public class ChunkFetchRequestHandler extends SimpleChannelInboundHandler { + private static final Logger logger = LoggerFactory.getLogger(ChunkFetchRequestHandler.class); + + private final TransportClient client; + private final StreamManager streamManager; + /** The max number of chunks being transferred and not finished yet.
*/ + private final long maxChunksBeingTransferred; + + public ChunkFetchRequestHandler( + TransportClient client, + StreamManager streamManager, + Long maxChunksBeingTransferred) { + this.client = client; + this.streamManager = streamManager; + this.maxChunksBeingTransferred = maxChunksBeingTransferred; + } + + @Override + public void exceptionCaught(ChannelHandlerContext ctx, Throwable cause) throws Exception { + logger.warn("Exception in connection from " + getRemoteAddress(ctx.channel()), cause); + ctx.close(); + } + + @Override + protected void channelRead0( + ChannelHandlerContext ctx, + final ChunkFetchRequest msg) throws Exception { + Channel channel = ctx.channel(); + if (logger.isTraceEnabled()) { + logger.trace("Received req from {} to fetch block {}", getRemoteAddress(channel), + msg.streamChunkId); + } + long chunksBeingTransferred = streamManager.chunksBeingTransferred(); + if (chunksBeingTransferred >= maxChunksBeingTransferred) { + logger.warn("The number of chunks being transferred {} is above {}, close the connection.", + chunksBeingTransferred, maxChunksBeingTransferred); + channel.close(); + return; + } + ManagedBuffer buf; + try { + streamManager.checkAuthorization(client, msg.streamChunkId.streamId); + streamManager.registerChannel(channel, msg.streamChunkId.streamId); + buf = streamManager.getChunk(msg.streamChunkId.streamId, msg.streamChunkId.chunkIndex); + } catch (Exception e) { + logger.error(String.format("Error opening block %s for request from %s", + msg.streamChunkId, getRemoteAddress(channel)), e); + respond(channel, new ChunkFetchFailure(msg.streamChunkId, + Throwables.getStackTraceAsString(e))); + return; + } + + streamManager.chunkBeingSent(msg.streamChunkId.streamId); + respond(channel, new ChunkFetchSuccess(msg.streamChunkId, buf)).addListener( + (ChannelFutureListener) future -> streamManager.chunkSent(msg.streamChunkId.streamId)); + } + + /** + * The invocation to channel.writeAndFlush is async, and the actual I/O on the + * channel will be handled by the EventLoop the channel is registered to. So even + * though we are processing the ChunkFetchRequest in a separate thread pool, the actual I/O, + * which is the potentially blocking call that could deplete server handler threads, is still + * being processed by TransportServer's default EventLoopGroup. In order to throttle the max + * number of threads performing channel I/O for sending responses to ChunkFetchRequest, the thread + * calling channel.writeAndFlush will wait for the completion of sending the response back to the + * client by invoking await(). This will throttle the rate at which threads from the + * ChunkFetchRequest dedicated EventLoopGroup submit channel I/O requests to TransportServer's + * default EventLoopGroup, thus making sure that we can reserve some threads in + * TransportServer's default EventLoopGroup for handling other RPC messages.
+ */ + private ChannelFuture respond( + final Channel channel, + final Encodable result) throws InterruptedException { + final SocketAddress remoteAddress = channel.remoteAddress(); + return channel.writeAndFlush(result).await().addListener((ChannelFutureListener) future -> { + if (future.isSuccess()) { + logger.trace("Sent result {} to client {}", result, remoteAddress); + } else { + logger.error(String.format("Error sending result %s to %s; closing connection", + result, remoteAddress), future.cause()); + channel.close(); + } + }); + } +} diff --git a/common/network-common/src/main/java/org/apache/spark/network/server/TransportChannelHandler.java b/common/network-common/src/main/java/org/apache/spark/network/server/TransportChannelHandler.java index 56782a832787..c824a7b0d474 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/server/TransportChannelHandler.java +++ b/common/network-common/src/main/java/org/apache/spark/network/server/TransportChannelHandler.java @@ -18,7 +18,7 @@ package org.apache.spark.network.server; import io.netty.channel.ChannelHandlerContext; -import io.netty.channel.ChannelInboundHandlerAdapter; +import io.netty.channel.SimpleChannelInboundHandler; import io.netty.handler.timeout.IdleState; import io.netty.handler.timeout.IdleStateEvent; import org.slf4j.Logger; @@ -26,6 +26,8 @@ import org.apache.spark.network.client.TransportClient; import org.apache.spark.network.client.TransportResponseHandler; +import org.apache.spark.network.protocol.ChunkFetchRequest; +import org.apache.spark.network.protocol.Message; import org.apache.spark.network.protocol.RequestMessage; import org.apache.spark.network.protocol.ResponseMessage; import static org.apache.spark.network.util.NettyUtils.getRemoteAddress; @@ -47,7 +49,7 @@ * on the channel for at least `requestTimeoutMs`. Note that this is duplex traffic; we will not * timeout if the client is continuously sending but getting no responses, for simplicity. */ -public class TransportChannelHandler extends ChannelInboundHandlerAdapter { +public class TransportChannelHandler extends SimpleChannelInboundHandler { private static final Logger logger = LoggerFactory.getLogger(TransportChannelHandler.class); private final TransportClient client; @@ -112,8 +114,21 @@ public void channelInactive(ChannelHandlerContext ctx) throws Exception { super.channelInactive(ctx); } + /** + * Overwrite acceptInboundMessage to properly delegate ChunkFetchRequest messages + * to ChunkFetchRequestHandler. 
+ */ @Override - public void channelRead(ChannelHandlerContext ctx, Object request) throws Exception { + public boolean acceptInboundMessage(Object msg) throws Exception { + if (msg instanceof ChunkFetchRequest) { + return false; + } else { + return super.acceptInboundMessage(msg); + } + } + + @Override + public void channelRead0(ChannelHandlerContext ctx, Message request) throws Exception { if (request instanceof RequestMessage) { requestHandler.handle((RequestMessage) request); } else if (request instanceof ResponseMessage) { diff --git a/common/network-common/src/main/java/org/apache/spark/network/server/TransportRequestHandler.java b/common/network-common/src/main/java/org/apache/spark/network/server/TransportRequestHandler.java index 9fac96dbe450..3e089b4cae27 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/server/TransportRequestHandler.java +++ b/common/network-common/src/main/java/org/apache/spark/network/server/TransportRequestHandler.java @@ -24,6 +24,7 @@ import com.google.common.base.Throwables; import io.netty.channel.Channel; import io.netty.channel.ChannelFuture; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -97,9 +98,7 @@ public void channelInactive() { @Override public void handle(RequestMessage request) { - if (request instanceof ChunkFetchRequest) { - processFetchRequest((ChunkFetchRequest) request); - } else if (request instanceof RpcRequest) { + if (request instanceof RpcRequest) { processRpcRequest((RpcRequest) request); } else if (request instanceof OneWayMessage) { processOneWayMessage((OneWayMessage) request); @@ -112,36 +111,6 @@ public void handle(RequestMessage request) { } } - private void processFetchRequest(final ChunkFetchRequest req) { - if (logger.isTraceEnabled()) { - logger.trace("Received req from {} to fetch block {}", getRemoteAddress(channel), - req.streamChunkId); - } - long chunksBeingTransferred = streamManager.chunksBeingTransferred(); - if (chunksBeingTransferred >= maxChunksBeingTransferred) { - logger.warn("The number of chunks being transferred {} is above {}, close the connection.", - chunksBeingTransferred, maxChunksBeingTransferred); - channel.close(); - return; - } - ManagedBuffer buf; - try { - streamManager.checkAuthorization(reverseClient, req.streamChunkId.streamId); - streamManager.registerChannel(channel, req.streamChunkId.streamId); - buf = streamManager.getChunk(req.streamChunkId.streamId, req.streamChunkId.chunkIndex); - } catch (Exception e) { - logger.error(String.format("Error opening block %s for request from %s", - req.streamChunkId, getRemoteAddress(channel)), e); - respond(new ChunkFetchFailure(req.streamChunkId, Throwables.getStackTraceAsString(e))); - return; - } - - streamManager.chunkBeingSent(req.streamChunkId.streamId); - respond(new ChunkFetchSuccess(req.streamChunkId, buf)).addListener(future -> { - streamManager.chunkSent(req.streamChunkId.streamId); - }); - } - private void processStreamRequest(final StreamRequest req) { if (logger.isTraceEnabled()) { logger.trace("Received req from {} to fetch stream {}", getRemoteAddress(channel), diff --git a/common/network-common/src/main/java/org/apache/spark/network/util/TransportConf.java b/common/network-common/src/main/java/org/apache/spark/network/util/TransportConf.java index 34e4bb5912dc..43a6bc7dc3d0 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/util/TransportConf.java +++ b/common/network-common/src/main/java/org/apache/spark/network/util/TransportConf.java @@ -21,6 +21,7 @@ import 
java.util.Properties; import com.google.common.primitives.Ints; +import io.netty.util.NettyRuntime; /** * A central location that tracks all the settings we expose to users. @@ -281,4 +282,35 @@ public Properties cryptoConf() { public long maxChunksBeingTransferred() { return conf.getLong("spark.shuffle.maxChunksBeingTransferred", Long.MAX_VALUE); } + + /** + * Percentage of io.serverThreads used by netty to process ChunkFetchRequest. + * The shuffle server will use a separate EventLoopGroup to process ChunkFetchRequest messages. + * Although when calling the async writeAndFlush on the underlying channel to send + * the response back to the client, the I/O on the channel is still being handled by + * {@link org.apache.spark.network.server.TransportServer}'s default EventLoopGroup + * that's registered with the Channel, by waiting inside the ChunkFetchRequest handler + * threads for the completion of sending back responses, we are able to put a limit on + * the max number of threads from TransportServer's default EventLoopGroup that are + * going to be consumed by writing responses to ChunkFetchRequest, which are I/O intensive + * and could take a long time to process due to disk contention. By configuring a slightly + * higher number of shuffle server threads, we are able to reserve some threads for + * handling other RPC messages, thus making the client less likely to experience timeouts + * when sending RPC messages to the shuffle server. The number of threads used for handling + * chunked fetch requests is a percentage of io.serverThreads (if defined), otherwise a percentage + * of 2 * #cores. However, a percentage of 0 means the netty default number of threads, which + * is 2 * #cores, ignoring io.serverThreads. The percentage here is configured via + * spark.shuffle.server.chunkFetchHandlerThreadsPercent. The returned value is rounded up to + * the nearest integer. + */ + public int chunkFetchHandlerThreads() { + if (!this.getModuleName().equalsIgnoreCase("shuffle")) { + return 0; + } + int chunkFetchHandlerThreadsPercent = + conf.getInt("spark.shuffle.server.chunkFetchHandlerThreadsPercent", 100); + return (int)Math.ceil( + (this.serverThreads() > 0 ?
this.serverThreads() : 2 * NettyRuntime.availableProcessors()) * + chunkFetchHandlerThreadsPercent/(double)100); + } } diff --git a/common/network-common/src/test/java/org/apache/spark/network/ChunkFetchIntegrationSuite.java b/common/network-common/src/test/java/org/apache/spark/network/ChunkFetchIntegrationSuite.java index 824482af08dd..37a8664a5266 100644 --- a/common/network-common/src/test/java/org/apache/spark/network/ChunkFetchIntegrationSuite.java +++ b/common/network-common/src/test/java/org/apache/spark/network/ChunkFetchIntegrationSuite.java @@ -143,37 +143,39 @@ public void releaseBuffers() { } private FetchResult fetchChunks(List chunkIndices) throws Exception { - TransportClient client = clientFactory.createClient(TestUtils.getLocalHost(), server.getPort()); - final Semaphore sem = new Semaphore(0); - final FetchResult res = new FetchResult(); - res.successChunks = Collections.synchronizedSet(new HashSet()); - res.failedChunks = Collections.synchronizedSet(new HashSet()); - res.buffers = Collections.synchronizedList(new LinkedList()); - ChunkReceivedCallback callback = new ChunkReceivedCallback() { - @Override - public void onSuccess(int chunkIndex, ManagedBuffer buffer) { - buffer.retain(); - res.successChunks.add(chunkIndex); - res.buffers.add(buffer); - sem.release(); - } + try (TransportClient client = + clientFactory.createClient(TestUtils.getLocalHost(), server.getPort())) { + final Semaphore sem = new Semaphore(0); + + res.successChunks = Collections.synchronizedSet(new HashSet()); + res.failedChunks = Collections.synchronizedSet(new HashSet()); + res.buffers = Collections.synchronizedList(new LinkedList()); + + ChunkReceivedCallback callback = new ChunkReceivedCallback() { + @Override + public void onSuccess(int chunkIndex, ManagedBuffer buffer) { + buffer.retain(); + res.successChunks.add(chunkIndex); + res.buffers.add(buffer); + sem.release(); + } - @Override - public void onFailure(int chunkIndex, Throwable e) { - res.failedChunks.add(chunkIndex); - sem.release(); - } - }; + @Override + public void onFailure(int chunkIndex, Throwable e) { + res.failedChunks.add(chunkIndex); + sem.release(); + } + }; - for (int chunkIndex : chunkIndices) { - client.fetchChunk(STREAM_ID, chunkIndex, callback); - } - if (!sem.tryAcquire(chunkIndices.size(), 5, TimeUnit.SECONDS)) { - fail("Timeout getting response from the server"); + for (int chunkIndex : chunkIndices) { + client.fetchChunk(STREAM_ID, chunkIndex, callback); + } + if (!sem.tryAcquire(chunkIndices.size(), 5, TimeUnit.SECONDS)) { + fail("Timeout getting response from the server"); + } } - client.close(); return res; } diff --git a/common/network-common/src/test/java/org/apache/spark/network/ChunkFetchRequestHandlerSuite.java b/common/network-common/src/test/java/org/apache/spark/network/ChunkFetchRequestHandlerSuite.java new file mode 100644 index 000000000000..2c72c53a33ae --- /dev/null +++ b/common/network-common/src/test/java/org/apache/spark/network/ChunkFetchRequestHandlerSuite.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.network; + +import io.netty.channel.ChannelHandlerContext; +import java.util.ArrayList; +import java.util.List; + +import io.netty.channel.Channel; +import org.apache.spark.network.server.ChunkFetchRequestHandler; +import org.junit.Test; + +import static org.mockito.Mockito.*; + +import org.apache.commons.lang3.tuple.ImmutablePair; +import org.apache.commons.lang3.tuple.Pair; +import org.apache.spark.network.buffer.ManagedBuffer; +import org.apache.spark.network.client.TransportClient; +import org.apache.spark.network.protocol.*; +import org.apache.spark.network.server.NoOpRpcHandler; +import org.apache.spark.network.server.OneForOneStreamManager; +import org.apache.spark.network.server.RpcHandler; + +public class ChunkFetchRequestHandlerSuite { + + @Test + public void handleChunkFetchRequest() throws Exception { + RpcHandler rpcHandler = new NoOpRpcHandler(); + OneForOneStreamManager streamManager = (OneForOneStreamManager) (rpcHandler.getStreamManager()); + Channel channel = mock(Channel.class); + ChannelHandlerContext context = mock(ChannelHandlerContext.class); + when(context.channel()) + .thenAnswer(invocationOnMock0 -> { + return channel; + }); + List> responseAndPromisePairs = + new ArrayList<>(); + when(channel.writeAndFlush(any())) + .thenAnswer(invocationOnMock0 -> { + Object response = invocationOnMock0.getArguments()[0]; + ExtendedChannelPromise channelFuture = new ExtendedChannelPromise(channel); + responseAndPromisePairs.add(ImmutablePair.of(response, channelFuture)); + return channelFuture; + }); + + // Prepare the stream. + List managedBuffers = new ArrayList<>(); + managedBuffers.add(new TestManagedBuffer(10)); + managedBuffers.add(new TestManagedBuffer(20)); + managedBuffers.add(new TestManagedBuffer(30)); + managedBuffers.add(new TestManagedBuffer(40)); + long streamId = streamManager.registerStream("test-app", managedBuffers.iterator()); + streamManager.registerChannel(channel, streamId); + TransportClient reverseClient = mock(TransportClient.class); + ChunkFetchRequestHandler requestHandler = new ChunkFetchRequestHandler(reverseClient, + rpcHandler.getStreamManager(), 2L); + + RequestMessage request0 = new ChunkFetchRequest(new StreamChunkId(streamId, 0)); + requestHandler.channelRead(context, request0); + assert responseAndPromisePairs.size() == 1; + assert responseAndPromisePairs.get(0).getLeft() instanceof ChunkFetchSuccess; + assert ((ChunkFetchSuccess) (responseAndPromisePairs.get(0).getLeft())).body() == + managedBuffers.get(0); + + RequestMessage request1 = new ChunkFetchRequest(new StreamChunkId(streamId, 1)); + requestHandler.channelRead(context, request1); + assert responseAndPromisePairs.size() == 2; + assert responseAndPromisePairs.get(1).getLeft() instanceof ChunkFetchSuccess; + assert ((ChunkFetchSuccess) (responseAndPromisePairs.get(1).getLeft())).body() == + managedBuffers.get(1); + + // Finish flushing the response for request0. 
+ responseAndPromisePairs.get(0).getRight().finish(true); + + RequestMessage request2 = new ChunkFetchRequest(new StreamChunkId(streamId, 2)); + requestHandler.channelRead(context, request2); + assert responseAndPromisePairs.size() == 3; + assert responseAndPromisePairs.get(2).getLeft() instanceof ChunkFetchSuccess; + assert ((ChunkFetchSuccess) (responseAndPromisePairs.get(2).getLeft())).body() == + managedBuffers.get(2); + + RequestMessage request3 = new ChunkFetchRequest(new StreamChunkId(streamId, 3)); + requestHandler.channelRead(context, request3); + verify(channel, times(1)).close(); + assert responseAndPromisePairs.size() == 3; + } +} diff --git a/common/network-common/src/test/java/org/apache/spark/network/ExtendedChannelPromise.java b/common/network-common/src/test/java/org/apache/spark/network/ExtendedChannelPromise.java new file mode 100644 index 000000000000..573ffd627a2e --- /dev/null +++ b/common/network-common/src/test/java/org/apache/spark/network/ExtendedChannelPromise.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.network; + +import java.util.ArrayList; +import java.util.List; + +import io.netty.channel.Channel; +import io.netty.channel.ChannelPromise; +import io.netty.channel.DefaultChannelPromise; +import io.netty.util.concurrent.Future; +import io.netty.util.concurrent.GenericFutureListener; + +class ExtendedChannelPromise extends DefaultChannelPromise { + + private List>> listeners = new ArrayList<>(); + private boolean success; + + ExtendedChannelPromise(Channel channel) { + super(channel); + success = false; + } + + @Override + public ChannelPromise addListener( + GenericFutureListener> listener) { + @SuppressWarnings("unchecked") + GenericFutureListener> gfListener = + (GenericFutureListener>) listener; + listeners.add(gfListener); + return super.addListener(listener); + } + + @Override + public boolean isSuccess() { + return success; + } + + @Override + public ChannelPromise await() throws InterruptedException { + return this; + } + + public void finish(boolean success) { + this.success = success; + listeners.forEach(listener -> { + try { + listener.operationComplete(this); + } catch (Exception e) { + // do nothing + } + }); + } +} diff --git a/common/network-common/src/test/java/org/apache/spark/network/TransportRequestHandlerSuite.java b/common/network-common/src/test/java/org/apache/spark/network/TransportRequestHandlerSuite.java index 2656cbee95a2..ad640415a8e6 100644 --- a/common/network-common/src/test/java/org/apache/spark/network/TransportRequestHandlerSuite.java +++ b/common/network-common/src/test/java/org/apache/spark/network/TransportRequestHandlerSuite.java @@ -21,10 +21,6 @@ import java.util.List; import io.netty.channel.Channel; -import io.netty.channel.ChannelPromise; -import io.netty.channel.DefaultChannelPromise; -import io.netty.util.concurrent.Future; -import io.netty.util.concurrent.GenericFutureListener; import org.junit.Test; import static org.mockito.Mockito.*; @@ -42,7 +38,7 @@ public class TransportRequestHandlerSuite { @Test - public void handleFetchRequestAndStreamRequest() throws Exception { + public void handleStreamRequest() throws Exception { RpcHandler rpcHandler = new NoOpRpcHandler(); OneForOneStreamManager streamManager = (OneForOneStreamManager) (rpcHandler.getStreamManager()); Channel channel = mock(Channel.class); @@ -68,18 +64,18 @@ public void handleFetchRequestAndStreamRequest() throws Exception { TransportRequestHandler requestHandler = new TransportRequestHandler(channel, reverseClient, rpcHandler, 2L); - RequestMessage request0 = new ChunkFetchRequest(new StreamChunkId(streamId, 0)); + RequestMessage request0 = new StreamRequest(String.format("%d_%d", streamId, 0)); requestHandler.handle(request0); assert responseAndPromisePairs.size() == 1; - assert responseAndPromisePairs.get(0).getLeft() instanceof ChunkFetchSuccess; - assert ((ChunkFetchSuccess) (responseAndPromisePairs.get(0).getLeft())).body() == + assert responseAndPromisePairs.get(0).getLeft() instanceof StreamResponse; + assert ((StreamResponse) (responseAndPromisePairs.get(0).getLeft())).body() == managedBuffers.get(0); - RequestMessage request1 = new ChunkFetchRequest(new StreamChunkId(streamId, 1)); + RequestMessage request1 = new StreamRequest(String.format("%d_%d", streamId, 1)); requestHandler.handle(request1); assert responseAndPromisePairs.size() == 2; - assert responseAndPromisePairs.get(1).getLeft() instanceof ChunkFetchSuccess; - assert ((ChunkFetchSuccess) (responseAndPromisePairs.get(1).getLeft())).body() == + assert 
responseAndPromisePairs.get(1).getLeft() instanceof StreamResponse; + assert ((StreamResponse) (responseAndPromisePairs.get(1).getLeft())).body() == managedBuffers.get(1); // Finish flushing the response for request0. @@ -99,41 +95,4 @@ public void handleFetchRequestAndStreamRequest() throws Exception { verify(channel, times(1)).close(); assert responseAndPromisePairs.size() == 3; } - - private class ExtendedChannelPromise extends DefaultChannelPromise { - - private List>> listeners = new ArrayList<>(); - private boolean success; - - ExtendedChannelPromise(Channel channel) { - super(channel); - success = false; - } - - @Override - public ChannelPromise addListener( - GenericFutureListener> listener) { - @SuppressWarnings("unchecked") - GenericFutureListener> gfListener = - (GenericFutureListener>) listener; - listeners.add(gfListener); - return super.addListener(listener); - } - - @Override - public boolean isSuccess() { - return success; - } - - public void finish(boolean success) { - this.success = success; - listeners.forEach(listener -> { - try { - listener.operationComplete(this); - } catch (Exception e) { - // do nothing - } - }); - } - } } diff --git a/common/network-common/src/test/java/org/apache/spark/network/crypto/AuthEngineSuite.java b/common/network-common/src/test/java/org/apache/spark/network/crypto/AuthEngineSuite.java index a3519fe4a423..c0aa298a4017 100644 --- a/common/network-common/src/test/java/org/apache/spark/network/crypto/AuthEngineSuite.java +++ b/common/network-common/src/test/java/org/apache/spark/network/crypto/AuthEngineSuite.java @@ -18,8 +18,11 @@ package org.apache.spark.network.crypto; import java.util.Arrays; +import java.util.Map; +import java.security.InvalidKeyException; import static java.nio.charset.StandardCharsets.UTF_8; +import com.google.common.collect.ImmutableMap; import org.junit.BeforeClass; import org.junit.Test; import static org.junit.Assert.*; @@ -104,4 +107,18 @@ public void testBadChallenge() throws Exception { challenge.cipher, challenge.keyLength, challenge.nonce, badChallenge)); } + @Test(expected = InvalidKeyException.class) + public void testBadKeySize() throws Exception { + Map mconf = ImmutableMap.of("spark.network.crypto.keyLength", "42"); + TransportConf conf = new TransportConf("rpc", new MapConfigProvider(mconf)); + + try (AuthEngine engine = new AuthEngine("appId", "secret", conf)) { + engine.challenge(); + fail("Should have failed to create challenge message."); + + // Call close explicitly to make sure it's idempotent. + engine.close(); + } + } + } diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index 05335df61a66..ff717057bb25 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../../pom.xml diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/DownloadFile.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/DownloadFile.java new file mode 100644 index 000000000000..633622b35175 --- /dev/null +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/DownloadFile.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.network.shuffle; + +import java.io.IOException; + +/** + * A handle on the file used when fetching remote data to disk. Used to ensure the lifecycle of + * writing the data, reading it back, and then cleaning it up is followed. Specific implementations + * may also handle encryption. The data can be read only via DownloadFileWritableChannel, + * which ensures data is not read until after the writer is closed. + */ +public interface DownloadFile { + /** + * Delete the file. + * + * @return true if and only if the file or directory is + * successfully deleted; false otherwise + */ + boolean delete(); + + /** + * A channel for writing data to the file. This special channel allows access to the data for + * reading, after the channel is closed, via {@link DownloadFileWritableChannel#closeAndRead()}. + */ + DownloadFileWritableChannel openForWriting() throws IOException; + + /** + * The path of the file, intended only for debug purposes. + */ + String path(); +} diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/TempFileManager.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/DownloadFileManager.java similarity index 75% rename from common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/TempFileManager.java rename to common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/DownloadFileManager.java index 552364d274f1..c335a17ae1fe 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/TempFileManager.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/DownloadFileManager.java @@ -17,20 +17,20 @@ package org.apache.spark.network.shuffle; -import java.io.File; +import org.apache.spark.network.util.TransportConf; /** - * A manager to create temp block files to reduce the memory usage and also clean temp - * files when they won't be used any more. + * A manager to create temp block files used when fetching remote data to reduce the memory usage. + * It will clean files when they won't be used any more. */ -public interface TempFileManager { +public interface DownloadFileManager { /** Create a temp block file. */ - File createTempFile(); + DownloadFile createTempFile(TransportConf transportConf); /** * Register a temp file to clean up when it won't be used any more. Return whether the * file is registered successfully. If `false`, the caller should clean up the file by itself. 
*/ - boolean registerTempFileToClean(File file); + boolean registerTempFileToClean(DownloadFile file); } diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/DownloadFileWritableChannel.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/DownloadFileWritableChannel.java new file mode 100644 index 000000000000..dbbbac43eb74 --- /dev/null +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/DownloadFileWritableChannel.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.network.shuffle; + +import org.apache.spark.network.buffer.ManagedBuffer; + +import java.nio.channels.WritableByteChannel; + +/** + * A channel for writing data which is fetched to disk, which allows access to the written data only + * after the writer has been closed. Used with DownloadFile and DownloadFileManager. + */ +public interface DownloadFileWritableChannel extends WritableByteChannel { + ManagedBuffer closeAndRead(); +} diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleClient.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleClient.java index 7ed0b6e93a7a..e49e27ab5aa7 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleClient.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleClient.java @@ -76,7 +76,7 @@ protected void checkInit() { @Override public void init(String appId) { this.appId = appId; - TransportContext context = new TransportContext(conf, new NoOpRpcHandler(), true); + TransportContext context = new TransportContext(conf, new NoOpRpcHandler(), true, true); List bootstraps = Lists.newArrayList(); if (authEnabled) { bootstraps.add(new AuthClientBootstrap(conf, appId, secretKeyHolder)); @@ -91,7 +91,7 @@ public void fetchBlocks( String execId, String[] blockIds, BlockFetchingListener listener, - TempFileManager tempFileManager) { + DownloadFileManager downloadFileManager) { checkInit(); logger.debug("External shuffle fetch from {}:{} (executor id {})", host, port, execId); try { @@ -99,7 +99,7 @@ public void fetchBlocks( (blockIds1, listener1) -> { TransportClient client = clientFactory.createClient(host, port); new OneForOneBlockFetcher(client, appId, execId, - blockIds1, listener1, conf, tempFileManager).start(); + blockIds1, listener1, conf, downloadFileManager).start(); }; int maxRetries = conf.maxIORetries(); diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/OneForOneBlockFetcher.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/OneForOneBlockFetcher.java index 0bc571874f07..30587023877c 100644 
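The three interfaces introduced here (DownloadFile, DownloadFileWritableChannel, DownloadFileManager) describe one write-then-read lifecycle. The hypothetical helper below sketches that lifecycle, assuming a manager implementation and a TransportConf are supplied by the caller; it mirrors what DownloadCallback in OneForOneBlockFetcher does further down in this patch.

import java.io.IOException;
import java.nio.ByteBuffer;
import org.apache.spark.network.buffer.ManagedBuffer;
import org.apache.spark.network.shuffle.DownloadFile;
import org.apache.spark.network.shuffle.DownloadFileManager;
import org.apache.spark.network.shuffle.DownloadFileWritableChannel;
import org.apache.spark.network.util.TransportConf;

// Hypothetical helper: write fetched bytes to a temp download file, then expose them
// as a ManagedBuffer once the writer is closed.
class DownloadLifecycleSketch {
  static ManagedBuffer downloadToDisk(
      DownloadFileManager manager, TransportConf conf, ByteBuffer data) throws IOException {
    DownloadFile file = manager.createTempFile(conf);          // 1. ask the manager for a file
    DownloadFileWritableChannel out = file.openForWriting();   // 2. open the write channel
    out.write(data);                                           // 3. stream the fetched bytes in
    ManagedBuffer buffer = out.closeAndRead();                 // 4. close, then read the result
    if (!manager.registerTempFileToClean(file)) {              // 5. hand cleanup to the manager,
      file.delete();                                           //    or delete the file ourselves
    }
    return buffer;  // the real DownloadCallback passes this to a BlockFetchingListener first
  }
}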
--- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/OneForOneBlockFetcher.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/OneForOneBlockFetcher.java @@ -17,18 +17,13 @@ package org.apache.spark.network.shuffle; -import java.io.File; -import java.io.FileOutputStream; import java.io.IOException; import java.nio.ByteBuffer; -import java.nio.channels.Channels; -import java.nio.channels.WritableByteChannel; import java.util.Arrays; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.spark.network.buffer.FileSegmentManagedBuffer; import org.apache.spark.network.buffer.ManagedBuffer; import org.apache.spark.network.client.ChunkReceivedCallback; import org.apache.spark.network.client.RpcResponseCallback; @@ -58,7 +53,7 @@ public class OneForOneBlockFetcher { private final BlockFetchingListener listener; private final ChunkReceivedCallback chunkCallback; private final TransportConf transportConf; - private final TempFileManager tempFileManager; + private final DownloadFileManager downloadFileManager; private StreamHandle streamHandle = null; @@ -79,14 +74,14 @@ public OneForOneBlockFetcher( String[] blockIds, BlockFetchingListener listener, TransportConf transportConf, - TempFileManager tempFileManager) { + DownloadFileManager downloadFileManager) { this.client = client; this.openMessage = new OpenBlocks(appId, execId, blockIds); this.blockIds = blockIds; this.listener = listener; this.chunkCallback = new ChunkCallback(); this.transportConf = transportConf; - this.tempFileManager = tempFileManager; + this.downloadFileManager = downloadFileManager; } /** Callback invoked on receipt of each chunk. We equate a single chunk to a single block. */ @@ -125,7 +120,7 @@ public void onSuccess(ByteBuffer response) { // Immediately request all chunks -- we expect that the total size of the request is // reasonable due to higher level chunking in [[ShuffleBlockFetcherIterator]]. 
for (int i = 0; i < streamHandle.numChunks; i++) { - if (tempFileManager != null) { + if (downloadFileManager != null) { client.stream(OneForOneStreamManager.genStreamChunkId(streamHandle.streamId, i), new DownloadCallback(i)); } else { @@ -159,13 +154,13 @@ private void failRemainingBlocks(String[] failedBlockIds, Throwable e) { private class DownloadCallback implements StreamCallback { - private WritableByteChannel channel = null; - private File targetFile = null; + private DownloadFileWritableChannel channel = null; + private DownloadFile targetFile = null; private int chunkIndex; DownloadCallback(int chunkIndex) throws IOException { - this.targetFile = tempFileManager.createTempFile(); - this.channel = Channels.newChannel(new FileOutputStream(targetFile)); + this.targetFile = downloadFileManager.createTempFile(transportConf); + this.channel = targetFile.openForWriting(); this.chunkIndex = chunkIndex; } @@ -178,11 +173,8 @@ public void onData(String streamId, ByteBuffer buf) throws IOException { @Override public void onComplete(String streamId) throws IOException { - channel.close(); - ManagedBuffer buffer = new FileSegmentManagedBuffer(transportConf, targetFile, 0, - targetFile.length()); - listener.onBlockFetchSuccess(blockIds[chunkIndex], buffer); - if (!tempFileManager.registerTempFileToClean(targetFile)) { + listener.onBlockFetchSuccess(blockIds[chunkIndex], channel.closeAndRead()); + if (!downloadFileManager.registerTempFileToClean(targetFile)) { targetFile.delete(); } } diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleClient.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleClient.java index 18b04fedcac5..62b99c40f61f 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleClient.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleClient.java @@ -43,7 +43,7 @@ public void init(String appId) { } * @param execId the executor id. * @param blockIds block ids to fetch. * @param listener the listener to receive block fetching status. - * @param tempFileManager TempFileManager to create and clean temp files. + * @param downloadFileManager DownloadFileManager to create and clean temp files. * If it's not null, the remote blocks will be streamed * into temp shuffle files to reduce the memory usage, otherwise, * they will be kept in memory. 
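As a usage note for the fetchBlocks signature documented above, the sketch below (loosely modeled on the integration tests later in this patch) shows the two modes; the host, port, executor id, block ids and the 5000 ms registration timeout are placeholders.

import org.apache.spark.network.buffer.ManagedBuffer;
import org.apache.spark.network.shuffle.BlockFetchingListener;
import org.apache.spark.network.shuffle.ExternalShuffleClient;
import org.apache.spark.network.util.TransportConf;

// Hypothetical caller: passing null as the last argument keeps fetched blocks in memory,
// while passing a DownloadFileManager streams them into temp files on disk.
class FetchBlocksSketch {
  static void fetch(TransportConf conf, String host, int port,
      String execId, String[] blockIds) throws Exception {
    try (ExternalShuffleClient client = new ExternalShuffleClient(conf, null, false, 5000)) {
      client.init("app-id");
      client.fetchBlocks(host, port, execId, blockIds, new BlockFetchingListener() {
        @Override
        public void onBlockFetchSuccess(String blockId, ManagedBuffer data) {
          data.retain();  // keep the buffer alive past the callback
        }
        @Override
        public void onBlockFetchFailure(String blockId, Throwable exception) {
          // record or retry the failed block
        }
      }, null /* or a DownloadFileManager to spill to disk */);
    }
  }
}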
@@ -54,7 +54,7 @@ public abstract void fetchBlocks( String execId, String[] blockIds, BlockFetchingListener listener, - TempFileManager tempFileManager); + DownloadFileManager downloadFileManager); /** * Get the shuffle MetricsSet from ShuffleClient, this will be used in MetricsSystem to diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleIndexInformation.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleIndexInformation.java index 386738ece51a..371149bef397 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleIndexInformation.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleIndexInformation.java @@ -37,14 +37,8 @@ public ShuffleIndexInformation(File indexFile) throws IOException { size = (int)indexFile.length(); ByteBuffer buffer = ByteBuffer.allocate(size); offsets = buffer.asLongBuffer(); - DataInputStream dis = null; - try { - dis = new DataInputStream(Files.newInputStream(indexFile.toPath())); + try (DataInputStream dis = new DataInputStream(Files.newInputStream(indexFile.toPath()))) { dis.readFully(buffer.array()); - } finally { - if (dis != null) { - dis.close(); - } } } diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/SimpleDownloadFile.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/SimpleDownloadFile.java new file mode 100644 index 000000000000..670612fd6f66 --- /dev/null +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/SimpleDownloadFile.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.network.shuffle; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.Channels; +import java.nio.channels.WritableByteChannel; + +import org.apache.spark.network.buffer.FileSegmentManagedBuffer; +import org.apache.spark.network.buffer.ManagedBuffer; +import org.apache.spark.network.util.TransportConf; + +/** + * A DownloadFile that does not take any encryption settings into account for reading and + * writing data. + * + * This does *not* mean the data in the file is un-encrypted -- it could be that the data is + * already encrypted when its written, and subsequent layer is responsible for decrypting. 
+ */ +public class SimpleDownloadFile implements DownloadFile { + + private final File file; + private final TransportConf transportConf; + + public SimpleDownloadFile(File file, TransportConf transportConf) { + this.file = file; + this.transportConf = transportConf; + } + + @Override + public boolean delete() { + return file.delete(); + } + + @Override + public DownloadFileWritableChannel openForWriting() throws IOException { + return new SimpleDownloadWritableChannel(); + } + + @Override + public String path() { + return file.getAbsolutePath(); + } + + private class SimpleDownloadWritableChannel implements DownloadFileWritableChannel { + + private final WritableByteChannel channel; + + SimpleDownloadWritableChannel() throws FileNotFoundException { + channel = Channels.newChannel(new FileOutputStream(file)); + } + + @Override + public ManagedBuffer closeAndRead() { + return new FileSegmentManagedBuffer(transportConf, file, 0, file.length()); + } + + @Override + public int write(ByteBuffer src) throws IOException { + return channel.write(src); + } + + @Override + public boolean isOpen() { + return channel.isOpen(); + } + + @Override + public void close() throws IOException { + channel.close(); + } + } +} diff --git a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolverSuite.java b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolverSuite.java index d2072a54fa41..459629c5f05f 100644 --- a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolverSuite.java +++ b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolverSuite.java @@ -98,19 +98,19 @@ public void testSortShuffleBlocks() throws IOException { resolver.registerExecutor("app0", "exec0", dataContext.createExecutorInfo(SORT_MANAGER)); - InputStream block0Stream = - resolver.getBlockData("app0", "exec0", 0, 0, 0).createInputStream(); - String block0 = CharStreams.toString( - new InputStreamReader(block0Stream, StandardCharsets.UTF_8)); - block0Stream.close(); - assertEquals(sortBlock0, block0); - - InputStream block1Stream = - resolver.getBlockData("app0", "exec0", 0, 0, 1).createInputStream(); - String block1 = CharStreams.toString( - new InputStreamReader(block1Stream, StandardCharsets.UTF_8)); - block1Stream.close(); - assertEquals(sortBlock1, block1); + try (InputStream block0Stream = resolver.getBlockData( + "app0", "exec0", 0, 0, 0).createInputStream()) { + String block0 = + CharStreams.toString(new InputStreamReader(block0Stream, StandardCharsets.UTF_8)); + assertEquals(sortBlock0, block0); + } + + try (InputStream block1Stream = resolver.getBlockData( + "app0", "exec0", 0, 0, 1).createInputStream()) { + String block1 = + CharStreams.toString(new InputStreamReader(block1Stream, StandardCharsets.UTF_8)); + assertEquals(sortBlock1, block1); + } } @Test @@ -149,7 +149,7 @@ public void testNormalizeAndInternPathname() { private void assertPathsMatch(String p1, String p2, String p3, String expectedPathname) { String normPathname = - ExternalShuffleBlockResolver.createNormalizedInternedPathname(p1, p2, p3); + ExternalShuffleBlockResolver.createNormalizedInternedPathname(p1, p2, p3); assertEquals(expectedPathname, normPathname); File file = new File(normPathname); String returnedPath = file.getPath(); diff --git a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleIntegrationSuite.java 
b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleIntegrationSuite.java index a6a1b8d0ac3f..526b96b36447 100644 --- a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleIntegrationSuite.java +++ b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleIntegrationSuite.java @@ -133,37 +133,37 @@ private FetchResult fetchBlocks( final Semaphore requestsRemaining = new Semaphore(0); - ExternalShuffleClient client = new ExternalShuffleClient(clientConf, null, false, 5000); - client.init(APP_ID); - client.fetchBlocks(TestUtils.getLocalHost(), port, execId, blockIds, - new BlockFetchingListener() { - @Override - public void onBlockFetchSuccess(String blockId, ManagedBuffer data) { - synchronized (this) { - if (!res.successBlocks.contains(blockId) && !res.failedBlocks.contains(blockId)) { - data.retain(); - res.successBlocks.add(blockId); - res.buffers.add(data); - requestsRemaining.release(); + try (ExternalShuffleClient client = new ExternalShuffleClient(clientConf, null, false, 5000)) { + client.init(APP_ID); + client.fetchBlocks(TestUtils.getLocalHost(), port, execId, blockIds, + new BlockFetchingListener() { + @Override + public void onBlockFetchSuccess(String blockId, ManagedBuffer data) { + synchronized (this) { + if (!res.successBlocks.contains(blockId) && !res.failedBlocks.contains(blockId)) { + data.retain(); + res.successBlocks.add(blockId); + res.buffers.add(data); + requestsRemaining.release(); + } } } - } - - @Override - public void onBlockFetchFailure(String blockId, Throwable exception) { - synchronized (this) { - if (!res.successBlocks.contains(blockId) && !res.failedBlocks.contains(blockId)) { - res.failedBlocks.add(blockId); - requestsRemaining.release(); + + @Override + public void onBlockFetchFailure(String blockId, Throwable exception) { + synchronized (this) { + if (!res.successBlocks.contains(blockId) && !res.failedBlocks.contains(blockId)) { + res.failedBlocks.add(blockId); + requestsRemaining.release(); + } } } - } - }, null); + }, null); - if (!requestsRemaining.tryAcquire(blockIds.length, 5, TimeUnit.SECONDS)) { - fail("Timeout getting response from the server"); + if (!requestsRemaining.tryAcquire(blockIds.length, 5, TimeUnit.SECONDS)) { + fail("Timeout getting response from the server"); + } } - client.close(); return res; } diff --git a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleSecuritySuite.java b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleSecuritySuite.java index 16bad9f1b319..82caf392b821 100644 --- a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleSecuritySuite.java +++ b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleSecuritySuite.java @@ -96,14 +96,16 @@ private void validate(String appId, String secretKey, boolean encrypt) ImmutableMap.of("spark.authenticate.enableSaslEncryption", "true"))); } - ExternalShuffleClient client = - new ExternalShuffleClient(testConf, new TestSecretKeyHolder(appId, secretKey), true, 5000); - client.init(appId); - // Registration either succeeds or throws an exception. 
- client.registerWithShuffleServer(TestUtils.getLocalHost(), server.getPort(), "exec0", - new ExecutorShuffleInfo(new String[0], 0, - "org.apache.spark.shuffle.sort.SortShuffleManager")); - client.close(); + try (ExternalShuffleClient client = + new ExternalShuffleClient( + testConf, new TestSecretKeyHolder(appId, secretKey), true, 5000)) { + client.init(appId); + // Registration either succeeds or throws an exception. + client.registerWithShuffleServer(TestUtils.getLocalHost(), server.getPort(), "exec0", + new ExecutorShuffleInfo( + new String[0], 0, "org.apache.spark.shuffle.sort.SortShuffleManager") + ); + } } /** Provides a secret key holder which always returns the given secret key, for a single appId. */ diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 564e6583c909..a1cf761d12d8 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../../pom.xml diff --git a/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java b/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java index d8b2ed6b5dc7..72ae1a129523 100644 --- a/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java +++ b/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java @@ -35,6 +35,8 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.permission.FsPermission; +import org.apache.hadoop.metrics2.impl.MetricsSystemImpl; +import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.server.api.*; import org.apache.spark.network.util.LevelDBProvider; @@ -168,6 +170,15 @@ protected void serviceInit(Configuration conf) throws Exception { TransportConf transportConf = new TransportConf("shuffle", new HadoopConfigProvider(conf)); blockHandler = new ExternalShuffleBlockHandler(transportConf, registeredExecutorFile); + // register metrics on the block handler into the Node Manager's metrics system. + YarnShuffleServiceMetrics serviceMetrics = + new YarnShuffleServiceMetrics(blockHandler.getAllMetrics()); + + MetricsSystemImpl metricsSystem = (MetricsSystemImpl) DefaultMetricsSystem.instance(); + metricsSystem.register( + "sparkShuffleService", "Metrics on the Spark Shuffle Service", serviceMetrics); + logger.info("Registered metrics with Hadoop's DefaultMetricsSystem"); + // If authentication is enabled, set up the shuffle server to use a // special RPC handler that filters out unauthenticated fetch requests List bootstraps = Lists.newArrayList(); diff --git a/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleServiceMetrics.java b/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleServiceMetrics.java new file mode 100644 index 000000000000..3e4d479b862b --- /dev/null +++ b/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleServiceMetrics.java @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.network.yarn; + +import java.util.Map; + +import com.codahale.metrics.*; +import org.apache.hadoop.metrics2.MetricsCollector; +import org.apache.hadoop.metrics2.MetricsInfo; +import org.apache.hadoop.metrics2.MetricsRecordBuilder; +import org.apache.hadoop.metrics2.MetricsSource; + +/** + * Forward {@link org.apache.spark.network.shuffle.ExternalShuffleBlockHandler.ShuffleMetrics} + * to hadoop metrics system. + * NodeManager by default exposes JMX endpoint where can be collected. + */ +class YarnShuffleServiceMetrics implements MetricsSource { + + private final MetricSet metricSet; + + YarnShuffleServiceMetrics(MetricSet metricSet) { + this.metricSet = metricSet; + } + + /** + * Get metrics from the source + * + * @param collector to contain the resulting metrics snapshot + * @param all if true, return all metrics even if unchanged. + */ + @Override + public void getMetrics(MetricsCollector collector, boolean all) { + MetricsRecordBuilder metricsRecordBuilder = collector.addRecord("sparkShuffleService"); + + for (Map.Entry entry : metricSet.getMetrics().entrySet()) { + collectMetric(metricsRecordBuilder, entry.getKey(), entry.getValue()); + } + } + + /** + * The metric types used in + * {@link org.apache.spark.network.shuffle.ExternalShuffleBlockHandler.ShuffleMetrics}. + * Visible for testing. + */ + public static void collectMetric( + MetricsRecordBuilder metricsRecordBuilder, String name, Metric metric) { + + if (metric instanceof Timer) { + Timer t = (Timer) metric; + metricsRecordBuilder + .addCounter(new ShuffleServiceMetricsInfo(name + "_count", "Count of timer " + name), + t.getCount()) + .addGauge( + new ShuffleServiceMetricsInfo(name + "_rate15", "15 minute rate of timer " + name), + t.getFifteenMinuteRate()) + .addGauge( + new ShuffleServiceMetricsInfo(name + "_rate5", "5 minute rate of timer " + name), + t.getFiveMinuteRate()) + .addGauge( + new ShuffleServiceMetricsInfo(name + "_rate1", "1 minute rate of timer " + name), + t.getOneMinuteRate()) + .addGauge(new ShuffleServiceMetricsInfo(name + "_rateMean", "Mean rate of timer " + name), + t.getMeanRate()); + } else if (metric instanceof Meter) { + Meter m = (Meter) metric; + metricsRecordBuilder + .addCounter(new ShuffleServiceMetricsInfo(name + "_count", "Count of meter " + name), + m.getCount()) + .addGauge( + new ShuffleServiceMetricsInfo(name + "_rate15", "15 minute rate of meter " + name), + m.getFifteenMinuteRate()) + .addGauge( + new ShuffleServiceMetricsInfo(name + "_rate5", "5 minute rate of meter " + name), + m.getFiveMinuteRate()) + .addGauge( + new ShuffleServiceMetricsInfo(name + "_rate1", "1 minute rate of meter " + name), + m.getOneMinuteRate()) + .addGauge(new ShuffleServiceMetricsInfo(name + "_rateMean", "Mean rate of meter " + name), + m.getMeanRate()); + } else if (metric instanceof Gauge) { + final Object gaugeValue = ((Gauge) metric).getValue(); + if (gaugeValue instanceof Integer) { + metricsRecordBuilder.addGauge(getShuffleServiceMetricsInfo(name), (Integer) gaugeValue); + } else if (gaugeValue instanceof Long) { + 
metricsRecordBuilder.addGauge(getShuffleServiceMetricsInfo(name), (Long) gaugeValue); + } else if (gaugeValue instanceof Float) { + metricsRecordBuilder.addGauge(getShuffleServiceMetricsInfo(name), (Float) gaugeValue); + } else if (gaugeValue instanceof Double) { + metricsRecordBuilder.addGauge(getShuffleServiceMetricsInfo(name), (Double) gaugeValue); + } else { + throw new IllegalStateException( + "Not supported class type of metric[" + name + "] for value " + gaugeValue); + } + } + } + + private static MetricsInfo getShuffleServiceMetricsInfo(String name) { + return new ShuffleServiceMetricsInfo(name, "Value of gauge " + name); + } + + private static class ShuffleServiceMetricsInfo implements MetricsInfo { + + private final String name; + private final String description; + + ShuffleServiceMetricsInfo(String name, String description) { + this.name = name; + this.description = description; + } + + @Override + public String name() { + return name; + } + + @Override + public String description() { + return description; + } + } +} diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 2f04abe8c7e8..adbbcb1cb304 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../../pom.xml diff --git a/common/sketch/src/main/java/org/apache/spark/util/sketch/CountMinSketch.java b/common/sketch/src/main/java/org/apache/spark/util/sketch/CountMinSketch.java index f7c22dddb8cc..06a248c9a27c 100644 --- a/common/sketch/src/main/java/org/apache/spark/util/sketch/CountMinSketch.java +++ b/common/sketch/src/main/java/org/apache/spark/util/sketch/CountMinSketch.java @@ -191,10 +191,9 @@ public static CountMinSketch readFrom(InputStream in) throws IOException { * Reads in a {@link CountMinSketch} from a byte array. 
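To illustrate how YarnShuffleServiceMetrics plugs Dropwizard metrics into Hadoop's metrics2 system, here is a small sketch; it assumes it lives in the org.apache.spark.network.yarn package (the class is package-private), the registry contents are stand-ins for ExternalShuffleBlockHandler's ShuffleMetrics, and the initialize() call is normally performed by the NodeManager itself.

package org.apache.spark.network.yarn;

import com.codahale.metrics.Gauge;
import com.codahale.metrics.MetricRegistry;
import org.apache.hadoop.metrics2.impl.MetricsSystemImpl;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;

// Sketch only: a MetricRegistry (which implements MetricSet) stands in for ShuffleMetrics.
public class MetricsBridgeSketch {
  public static void main(String[] args) {
    MetricRegistry registry = new MetricRegistry();
    registry.meter("fetchRequests");                                      // forwarded as count + rates
    registry.register("registeredExecutors", (Gauge<Integer>) () -> 42);  // forwarded as a gauge value

    DefaultMetricsSystem.initialize("NodeManager");                       // done by YARN in production
    MetricsSystemImpl metricsSystem = (MetricsSystemImpl) DefaultMetricsSystem.instance();
    metricsSystem.register(
        "sparkShuffleService", "Metrics on the Spark Shuffle Service",
        new YarnShuffleServiceMetrics(registry));
  }
}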
*/ public static CountMinSketch readFrom(byte[] bytes) throws IOException { - InputStream in = new ByteArrayInputStream(bytes); - CountMinSketch cms = readFrom(in); - in.close(); - return cms; + try (InputStream in = new ByteArrayInputStream(bytes)) { + return readFrom(in); + } } /** diff --git a/common/sketch/src/main/java/org/apache/spark/util/sketch/CountMinSketchImpl.java b/common/sketch/src/main/java/org/apache/spark/util/sketch/CountMinSketchImpl.java index fd1906d2e5ae..b78c1677a121 100644 --- a/common/sketch/src/main/java/org/apache/spark/util/sketch/CountMinSketchImpl.java +++ b/common/sketch/src/main/java/org/apache/spark/util/sketch/CountMinSketchImpl.java @@ -322,10 +322,10 @@ public void writeTo(OutputStream out) throws IOException { @Override public byte[] toByteArray() throws IOException { - ByteArrayOutputStream out = new ByteArrayOutputStream(); - writeTo(out); - out.close(); - return out.toByteArray(); + try (ByteArrayOutputStream out = new ByteArrayOutputStream()) { + writeTo(out); + return out.toByteArray(); + } } public static CountMinSketchImpl readFrom(InputStream in) throws IOException { diff --git a/common/tags/pom.xml b/common/tags/pom.xml index ba127408e1c5..f6627beabe84 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../../pom.xml diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index 152785473039..62c493a5e1ed 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../../pom.xml diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/expressions/HiveHasher.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/expressions/HiveHasher.java index 62b75ae8aa01..73577437ac50 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/expressions/HiveHasher.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/expressions/HiveHasher.java @@ -17,8 +17,7 @@ package org.apache.spark.sql.catalyst.expressions; -import org.apache.spark.unsafe.memory.MemoryBlock; -import org.apache.spark.unsafe.types.UTF8String; +import org.apache.spark.unsafe.Platform; /** * Simulates Hive's hashing function from Hive v1.2.1 @@ -39,21 +38,12 @@ public static int hashLong(long input) { return (int) ((input >>> 32) ^ input); } - public static int hashUnsafeBytesBlock(MemoryBlock mb) { - long lengthInBytes = mb.size(); + public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes) { assert (lengthInBytes >= 0): "lengthInBytes cannot be negative"; int result = 0; - for (long i = 0; i < lengthInBytes; i++) { - result = (result * 31) + (int) mb.getByte(i); + for (int i = 0; i < lengthInBytes; i++) { + result = (result * 31) + (int) Platform.getByte(base, offset + i); } return result; } - - public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes) { - return hashUnsafeBytesBlock(MemoryBlock.allocateFromObject(base, offset, lengthInBytes)); - } - - public static int hashUTF8String(UTF8String str) { - return hashUnsafeBytesBlock(str.getMemoryBlock()); - } } diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java index 54dcadf3a775..aca6fca00c48 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java @@ -187,7 +187,7 @@ public 
static void setMemory(long address, byte value, long size) { } public static void copyMemory( - Object src, long srcOffset, Object dst, long dstOffset, long length) { + Object src, long srcOffset, Object dst, long dstOffset, long length) { // Check if dstOffset is before or after srcOffset to determine if we should copy // forward or backwards. This is necessary in case src and dst overlap. if (dstOffset < srcOffset) { diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/array/ByteArrayMethods.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/array/ByteArrayMethods.java index ef0f78d95d1e..cec8c30887e2 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/array/ByteArrayMethods.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/array/ByteArrayMethods.java @@ -18,7 +18,6 @@ package org.apache.spark.unsafe.array; import org.apache.spark.unsafe.Platform; -import org.apache.spark.unsafe.memory.MemoryBlock; public class ByteArrayMethods { @@ -53,25 +52,15 @@ public static long roundNumberOfBytesToNearestWord(long numBytes) { public static int MAX_ROUNDED_ARRAY_LENGTH = Integer.MAX_VALUE - 15; private static final boolean unaligned = Platform.unaligned(); - /** - * MemoryBlock equality check for MemoryBlocks. - * @return true if the arrays are equal, false otherwise - */ - public static boolean arrayEqualsBlock( - MemoryBlock leftBase, long leftOffset, MemoryBlock rightBase, long rightOffset, long length) { - return arrayEquals(leftBase.getBaseObject(), leftBase.getBaseOffset() + leftOffset, - rightBase.getBaseObject(), rightBase.getBaseOffset() + rightOffset, length); - } - /** * Optimized byte array equality check for byte arrays. * @return true if the arrays are equal, false otherwise */ public static boolean arrayEquals( - Object leftBase, long leftOffset, Object rightBase, long rightOffset, long length) { + Object leftBase, long leftOffset, Object rightBase, long rightOffset, final long length) { int i = 0; - // check if starts align and we can get both offsets to be aligned + // check if stars align and we can get both offsets to be aligned if ((leftOffset % 8) == (rightOffset % 8)) { while ((leftOffset + i) % 8 != 0 && i < length) { if (Platform.getByte(leftBase, leftOffset + i) != diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/array/LongArray.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/array/LongArray.java index b74d2de0691d..2cd39bd60c2a 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/array/LongArray.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/array/LongArray.java @@ -17,6 +17,7 @@ package org.apache.spark.unsafe.array; +import org.apache.spark.unsafe.Platform; import org.apache.spark.unsafe.memory.MemoryBlock; /** @@ -32,12 +33,16 @@ public final class LongArray { private static final long WIDTH = 8; private final MemoryBlock memory; + private final Object baseObj; + private final long baseOffset; private final long length; public LongArray(MemoryBlock memory) { assert memory.size() < (long) Integer.MAX_VALUE * 8: "Array size >= Integer.MAX_VALUE elements"; this.memory = memory; + this.baseObj = memory.getBaseObject(); + this.baseOffset = memory.getBaseOffset(); this.length = memory.size() / WIDTH; } @@ -46,11 +51,11 @@ public MemoryBlock memoryBlock() { } public Object getBaseObject() { - return memory.getBaseObject(); + return baseObj; } public long getBaseOffset() { - return memory.getBaseOffset(); + return baseOffset; } /** @@ -64,8 +69,8 @@ public long size() { * Fill 
this all with 0L. */ public void zeroOut() { - for (long off = 0; off < length * WIDTH; off += WIDTH) { - memory.putLong(off, 0); + for (long off = baseOffset; off < baseOffset + length * WIDTH; off += WIDTH) { + Platform.putLong(baseObj, off, 0); } } @@ -75,7 +80,7 @@ public void zeroOut() { public void set(int index, long value) { assert index >= 0 : "index (" + index + ") should >= 0"; assert index < length : "index (" + index + ") should < length (" + length + ")"; - memory.putLong(index * WIDTH, value); + Platform.putLong(baseObj, baseOffset + index * WIDTH, value); } /** @@ -84,6 +89,6 @@ public void set(int index, long value) { public long get(int index) { assert index >= 0 : "index (" + index + ") should >= 0"; assert index < length : "index (" + index + ") should < length (" + length + ")"; - return memory.getLong(index * WIDTH); + return Platform.getLong(baseObj, baseOffset + index * WIDTH); } } diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java index aff6e93d647f..d239de6083ad 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java @@ -17,10 +17,7 @@ package org.apache.spark.unsafe.hash; -import com.google.common.primitives.Ints; - -import org.apache.spark.unsafe.memory.MemoryBlock; -import org.apache.spark.unsafe.types.UTF8String; +import org.apache.spark.unsafe.Platform; /** * 32-bit Murmur3 hasher. This is based on Guava's Murmur3_32HashFunction. @@ -52,70 +49,49 @@ public static int hashInt(int input, int seed) { } public int hashUnsafeWords(Object base, long offset, int lengthInBytes) { - return hashUnsafeWordsBlock(MemoryBlock.allocateFromObject(base, offset, lengthInBytes), seed); + return hashUnsafeWords(base, offset, lengthInBytes, seed); } - public static int hashUnsafeWordsBlock(MemoryBlock base, int seed) { + public static int hashUnsafeWords(Object base, long offset, int lengthInBytes, int seed) { // This is based on Guava's `Murmur32_Hasher.processRemaining(ByteBuffer)` method. - int lengthInBytes = Ints.checkedCast(base.size()); assert (lengthInBytes % 8 == 0): "lengthInBytes must be a multiple of 8 (word-aligned)"; - int h1 = hashBytesByIntBlock(base, seed); + int h1 = hashBytesByInt(base, offset, lengthInBytes, seed); return fmix(h1, lengthInBytes); } - public static int hashUnsafeWords(Object base, long offset, int lengthInBytes, int seed) { - // This is based on Guava's `Murmur32_Hasher.processRemaining(ByteBuffer)` method. - return hashUnsafeWordsBlock(MemoryBlock.allocateFromObject(base, offset, lengthInBytes), seed); - } - - public static int hashUnsafeBytesBlock(MemoryBlock base, int seed) { + public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes, int seed) { // This is not compatible with original and another implementations. // But remain it for backward compatibility for the components existing before 2.3. 
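The LongArray change just above caches the base object and offset once and then goes through Platform for every read and write; the public API is unchanged, as this short sketch (with an arbitrary 1024-byte allocation) shows.

import org.apache.spark.unsafe.array.LongArray;
import org.apache.spark.unsafe.memory.MemoryAllocator;
import org.apache.spark.unsafe.memory.MemoryBlock;

// Sketch: allocate 1024 bytes (128 long slots) on heap, use LongArray's unchanged API,
// then free the block.
public class LongArraySketch {
  public static void main(String[] args) {
    MemoryBlock block = MemoryAllocator.HEAP.allocate(1024);
    LongArray arr = new LongArray(block);
    arr.zeroOut();                                    // fill all slots with 0L
    arr.set(0, 42L);
    System.out.println(arr.get(0) + " of " + arr.size() + " slots");  // 42 of 128 slots
    MemoryAllocator.HEAP.free(block);
  }
}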
- int lengthInBytes = Ints.checkedCast(base.size()); assert (lengthInBytes >= 0): "lengthInBytes cannot be negative"; int lengthAligned = lengthInBytes - lengthInBytes % 4; - int h1 = hashBytesByIntBlock(base.subBlock(0, lengthAligned), seed); + int h1 = hashBytesByInt(base, offset, lengthAligned, seed); for (int i = lengthAligned; i < lengthInBytes; i++) { - int halfWord = base.getByte(i); + int halfWord = Platform.getByte(base, offset + i); int k1 = mixK1(halfWord); h1 = mixH1(h1, k1); } return fmix(h1, lengthInBytes); } - public static int hashUTF8String(UTF8String str, int seed) { - return hashUnsafeBytesBlock(str.getMemoryBlock(), seed); - } - - public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes, int seed) { - return hashUnsafeBytesBlock(MemoryBlock.allocateFromObject(base, offset, lengthInBytes), seed); - } - public static int hashUnsafeBytes2(Object base, long offset, int lengthInBytes, int seed) { - return hashUnsafeBytes2Block(MemoryBlock.allocateFromObject(base, offset, lengthInBytes), seed); - } - - public static int hashUnsafeBytes2Block(MemoryBlock base, int seed) { - // This is compatible with original and other implementations. + // This is compatible with original and another implementations. // Use this method for new components after Spark 2.3. - int lengthInBytes = Ints.checkedCast(base.size()); - assert (lengthInBytes >= 0) : "lengthInBytes cannot be negative"; + assert (lengthInBytes >= 0): "lengthInBytes cannot be negative"; int lengthAligned = lengthInBytes - lengthInBytes % 4; - int h1 = hashBytesByIntBlock(base.subBlock(0, lengthAligned), seed); + int h1 = hashBytesByInt(base, offset, lengthAligned, seed); int k1 = 0; for (int i = lengthAligned, shift = 0; i < lengthInBytes; i++, shift += 8) { - k1 ^= (base.getByte(i) & 0xFF) << shift; + k1 ^= (Platform.getByte(base, offset + i) & 0xFF) << shift; } h1 ^= mixK1(k1); return fmix(h1, lengthInBytes); } - private static int hashBytesByIntBlock(MemoryBlock base, int seed) { - long lengthInBytes = base.size(); + private static int hashBytesByInt(Object base, long offset, int lengthInBytes, int seed) { assert (lengthInBytes % 4 == 0); int h1 = seed; - for (long i = 0; i < lengthInBytes; i += 4) { - int halfWord = base.getInt(i); + for (int i = 0; i < lengthInBytes; i += 4) { + int halfWord = Platform.getInt(base, offset + i); int k1 = mixK1(halfWord); h1 = mixH1(h1, k1); } diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/ByteArrayMemoryBlock.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/ByteArrayMemoryBlock.java deleted file mode 100644 index 9f238632bc87..000000000000 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/ByteArrayMemoryBlock.java +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
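Both hashers in this part of the patch (HiveHasher earlier and Murmur3_x86_32 here) now take a plain base object plus absolute offset instead of a MemoryBlock, so an on-heap byte[] is addressed via Platform.BYTE_ARRAY_OFFSET; a minimal sketch, with an arbitrary seed of 42:

import java.nio.charset.StandardCharsets;
import org.apache.spark.sql.catalyst.expressions.HiveHasher;
import org.apache.spark.unsafe.Platform;
import org.apache.spark.unsafe.hash.Murmur3_x86_32;

// Sketch: hash the bytes of a string with the restored (base, offset, length[, seed]) entry points.
public class HashSketch {
  public static void main(String[] args) {
    byte[] bytes = "spark".getBytes(StandardCharsets.UTF_8);
    int murmur = Murmur3_x86_32.hashUnsafeBytes(bytes, Platform.BYTE_ARRAY_OFFSET, bytes.length, 42);
    int hive = HiveHasher.hashUnsafeBytes(bytes, Platform.BYTE_ARRAY_OFFSET, bytes.length);
    System.out.println(murmur + " " + hive);
  }
}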
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.unsafe.memory; - -import com.google.common.primitives.Ints; - -import org.apache.spark.unsafe.Platform; - -/** - * A consecutive block of memory with a byte array on Java heap. - */ -public final class ByteArrayMemoryBlock extends MemoryBlock { - - private final byte[] array; - - public ByteArrayMemoryBlock(byte[] obj, long offset, long size) { - super(obj, offset, size); - this.array = obj; - assert(offset + size <= Platform.BYTE_ARRAY_OFFSET + obj.length) : - "The sum of size " + size + " and offset " + offset + " should not be larger than " + - "the size of the given memory space " + (obj.length + Platform.BYTE_ARRAY_OFFSET); - } - - public ByteArrayMemoryBlock(long length) { - this(new byte[Ints.checkedCast(length)], Platform.BYTE_ARRAY_OFFSET, length); - } - - @Override - public MemoryBlock subBlock(long offset, long size) { - checkSubBlockRange(offset, size); - if (offset == 0 && size == this.size()) return this; - return new ByteArrayMemoryBlock(array, this.offset + offset, size); - } - - public byte[] getByteArray() { return array; } - - /** - * Creates a memory block pointing to the memory used by the byte array. - */ - public static ByteArrayMemoryBlock fromArray(final byte[] array) { - return new ByteArrayMemoryBlock(array, Platform.BYTE_ARRAY_OFFSET, array.length); - } - - @Override - public int getInt(long offset) { - return Platform.getInt(array, this.offset + offset); - } - - @Override - public void putInt(long offset, int value) { - Platform.putInt(array, this.offset + offset, value); - } - - @Override - public boolean getBoolean(long offset) { - return Platform.getBoolean(array, this.offset + offset); - } - - @Override - public void putBoolean(long offset, boolean value) { - Platform.putBoolean(array, this.offset + offset, value); - } - - @Override - public byte getByte(long offset) { - return array[(int)(this.offset + offset - Platform.BYTE_ARRAY_OFFSET)]; - } - - @Override - public void putByte(long offset, byte value) { - array[(int)(this.offset + offset - Platform.BYTE_ARRAY_OFFSET)] = value; - } - - @Override - public short getShort(long offset) { - return Platform.getShort(array, this.offset + offset); - } - - @Override - public void putShort(long offset, short value) { - Platform.putShort(array, this.offset + offset, value); - } - - @Override - public long getLong(long offset) { - return Platform.getLong(array, this.offset + offset); - } - - @Override - public void putLong(long offset, long value) { - Platform.putLong(array, this.offset + offset, value); - } - - @Override - public float getFloat(long offset) { - return Platform.getFloat(array, this.offset + offset); - } - - @Override - public void putFloat(long offset, float value) { - Platform.putFloat(array, this.offset + offset, value); - } - - @Override - public double getDouble(long offset) { - return Platform.getDouble(array, this.offset + offset); - } - - @Override - public void putDouble(long offset, double value) { - Platform.putDouble(array, this.offset + offset, value); - } -} diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java index 36caf80888cd..2733760dd19e 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java +++ 
b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java @@ -23,6 +23,8 @@ import java.util.LinkedList; import java.util.Map; +import org.apache.spark.unsafe.Platform; + /** * A simple {@link MemoryAllocator} that can allocate up to 16GB using a JVM long primitive array. */ @@ -56,7 +58,7 @@ public MemoryBlock allocate(long size) throws OutOfMemoryError { final long[] array = arrayReference.get(); if (array != null) { assert (array.length * 8L >= size); - MemoryBlock memory = OnHeapMemoryBlock.fromArray(array, size); + MemoryBlock memory = new MemoryBlock(array, Platform.LONG_ARRAY_OFFSET, size); if (MemoryAllocator.MEMORY_DEBUG_FILL_ENABLED) { memory.fill(MemoryAllocator.MEMORY_DEBUG_FILL_CLEAN_VALUE); } @@ -68,7 +70,7 @@ public MemoryBlock allocate(long size) throws OutOfMemoryError { } } long[] array = new long[numWords]; - MemoryBlock memory = OnHeapMemoryBlock.fromArray(array, size); + MemoryBlock memory = new MemoryBlock(array, Platform.LONG_ARRAY_OFFSET, size); if (MemoryAllocator.MEMORY_DEBUG_FILL_ENABLED) { memory.fill(MemoryAllocator.MEMORY_DEBUG_FILL_CLEAN_VALUE); } @@ -77,13 +79,12 @@ public MemoryBlock allocate(long size) throws OutOfMemoryError { @Override public void free(MemoryBlock memory) { - assert(memory instanceof OnHeapMemoryBlock); - assert (memory.getBaseObject() != null) : + assert (memory.obj != null) : "baseObject was null; are you trying to use the on-heap allocator to free off-heap memory?"; - assert (memory.getPageNumber() != MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER) : + assert (memory.pageNumber != MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER) : "page has already been freed"; - assert ((memory.getPageNumber() == MemoryBlock.NO_PAGE_NUMBER) - || (memory.getPageNumber() == MemoryBlock.FREED_IN_TMM_PAGE_NUMBER)) : + assert ((memory.pageNumber == MemoryBlock.NO_PAGE_NUMBER) + || (memory.pageNumber == MemoryBlock.FREED_IN_TMM_PAGE_NUMBER)) : "TMM-allocated pages must first be freed via TMM.freePage(), not directly in allocator " + "free()"; @@ -93,12 +94,12 @@ public void free(MemoryBlock memory) { } // Mark the page as freed (so we can detect double-frees). - memory.setPageNumber(MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER); + memory.pageNumber = MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER; // As an additional layer of defense against use-after-free bugs, we mutate the // MemoryBlock to null out its reference to the long[] array. 
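To make the reverted API above concrete, here is a minimal, illustrative sketch (not part of the patch; the class and variable names are mine) of allocating and freeing an on-heap block through MemoryAllocator.HEAP, which this hunk switches back to plain MemoryBlock instances backed by a long[]; the free() hunk continues right after this note.

```java
import org.apache.spark.unsafe.Platform;
import org.apache.spark.unsafe.memory.MemoryAllocator;
import org.apache.spark.unsafe.memory.MemoryBlock;

public class HeapAllocSketch {
  public static void main(String[] args) {
    // Allocate 64 bytes; the allocator rounds the request up to whole long words.
    MemoryBlock page = MemoryAllocator.HEAP.allocate(64);
    // Access goes through Platform with the block's base object and base offset.
    Platform.putLong(page.getBaseObject(), page.getBaseOffset(), 42L);
    long value = Platform.getLong(page.getBaseObject(), page.getBaseOffset());
    System.out.println(value); // 42
    // free() nulls out the base object and marks the page number as freed,
    // which is the behavior the surrounding hunk restores.
    MemoryAllocator.HEAP.free(page);
  }
}
```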
- long[] array = ((OnHeapMemoryBlock)memory).getLongArray(); - memory.resetObjAndOffset(); + long[] array = (long[]) memory.obj; + memory.setObjAndOffset(null, 0); long alignedSize = ((size + 7) / 8) * 8; if (shouldPool(alignedSize)) { diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/MemoryAllocator.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/MemoryAllocator.java index 38315fb97b46..7b588681d979 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/MemoryAllocator.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/MemoryAllocator.java @@ -38,7 +38,7 @@ public interface MemoryAllocator { void free(MemoryBlock memory); - UnsafeMemoryAllocator UNSAFE = new UnsafeMemoryAllocator(); + MemoryAllocator UNSAFE = new UnsafeMemoryAllocator(); - HeapMemoryAllocator HEAP = new HeapMemoryAllocator(); + MemoryAllocator HEAP = new HeapMemoryAllocator(); } diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/MemoryBlock.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/MemoryBlock.java index ca7213bbf92d..c333857358d3 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/MemoryBlock.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/MemoryBlock.java @@ -22,10 +22,10 @@ import org.apache.spark.unsafe.Platform; /** - * A representation of a consecutive memory block in Spark. It defines the common interfaces - * for memory accessing and mutating. + * A consecutive block of memory, starting at a {@link MemoryLocation} with a fixed size. */ -public abstract class MemoryBlock { +public class MemoryBlock extends MemoryLocation { + /** Special `pageNumber` value for pages which were not allocated by TaskMemoryManagers */ public static final int NO_PAGE_NUMBER = -1; @@ -45,163 +45,38 @@ public abstract class MemoryBlock { */ public static final int FREED_IN_ALLOCATOR_PAGE_NUMBER = -3; - @Nullable - protected Object obj; - - protected long offset; - - protected long length; + private final long length; /** * Optional page number; used when this MemoryBlock represents a page allocated by a - * TaskMemoryManager. This field can be updated using setPageNumber method so that - * this can be modified by the TaskMemoryManager, which lives in a different package. + * TaskMemoryManager. This field is public so that it can be modified by the TaskMemoryManager, + * which lives in a different package. */ - private int pageNumber = NO_PAGE_NUMBER; + public int pageNumber = NO_PAGE_NUMBER; - protected MemoryBlock(@Nullable Object obj, long offset, long length) { - if (offset < 0 || length < 0) { - throw new IllegalArgumentException( - "Length " + length + " and offset " + offset + "must be non-negative"); - } - this.obj = obj; - this.offset = offset; + public MemoryBlock(@Nullable Object obj, long offset, long length) { + super(obj, offset); this.length = length; } - protected MemoryBlock() { - this(null, 0, 0); - } - - public final Object getBaseObject() { - return obj; - } - - public final long getBaseOffset() { - return offset; - } - - public void resetObjAndOffset() { - this.obj = null; - this.offset = 0; - } - /** * Returns the size of the memory block. */ - public final long size() { + public long size() { return length; } - public final void setPageNumber(int pageNum) { - pageNumber = pageNum; - } - - public final int getPageNumber() { - return pageNumber; - } - - /** - * Fills the memory block with the specified byte value. 
- */ - public final void fill(byte value) { - Platform.setMemory(obj, offset, length, value); - } - - /** - * Instantiate MemoryBlock for given object type with new offset - */ - public static final MemoryBlock allocateFromObject(Object obj, long offset, long length) { - MemoryBlock mb = null; - if (obj instanceof byte[]) { - byte[] array = (byte[])obj; - mb = new ByteArrayMemoryBlock(array, offset, length); - } else if (obj instanceof long[]) { - long[] array = (long[])obj; - mb = new OnHeapMemoryBlock(array, offset, length); - } else if (obj == null) { - // we assume that to pass null pointer means off-heap - mb = new OffHeapMemoryBlock(offset, length); - } else { - throw new UnsupportedOperationException( - "Instantiate MemoryBlock for type " + obj.getClass() + " is not supported now"); - } - return mb; - } - /** - * Just instantiate the sub-block with the same type of MemoryBlock with the new size and relative - * offset from the original offset. The data is not copied. - * If parameters are invalid, an exception is thrown. + * Creates a memory block pointing to the memory used by the long array. */ - public abstract MemoryBlock subBlock(long offset, long size); - - protected void checkSubBlockRange(long offset, long size) { - if (offset < 0 || size < 0) { - throw new ArrayIndexOutOfBoundsException( - "Size " + size + " and offset " + offset + " must be non-negative"); - } - if (offset + size > length) { - throw new ArrayIndexOutOfBoundsException("The sum of size " + size + " and offset " + - offset + " should not be larger than the length " + length + " in the MemoryBlock"); - } + public static MemoryBlock fromLongArray(final long[] array) { + return new MemoryBlock(array, Platform.LONG_ARRAY_OFFSET, array.length * 8L); } /** - * getXXX/putXXX does not ensure guarantee behavior if the offset is invalid. e.g cause illegal - * memory access, throw an exception, or etc. - * getXXX/putXXX uses an index based on this.offset that includes the size of metadata such as - * JVM object header. The offset is 0-based and is expected as an logical offset in the memory - * block. + * Fills the memory block with the specified byte value. 
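The factory retained above, MemoryBlock.fromLongArray, is what LongArray is constructed from in the LongArraySuite change further down. A short illustrative snippet (mine, not from the patch) showing that path:

```java
import org.apache.spark.unsafe.array.LongArray;
import org.apache.spark.unsafe.memory.MemoryBlock;

public class LongArraySketch {
  public static void main(String[] args) {
    long[] backing = new long[2];  // 16 bytes of on-heap storage
    LongArray arr = new LongArray(MemoryBlock.fromLongArray(backing));
    arr.set(0, 1L);
    arr.set(1, 3L);
    // Reads go back through the same backing long[].
    System.out.println(arr.get(1) + ", size=" + arr.size()); // 3, size=2
  }
}
```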
*/ - public abstract int getInt(long offset); - - public abstract void putInt(long offset, int value); - - public abstract boolean getBoolean(long offset); - - public abstract void putBoolean(long offset, boolean value); - - public abstract byte getByte(long offset); - - public abstract void putByte(long offset, byte value); - - public abstract short getShort(long offset); - - public abstract void putShort(long offset, short value); - - public abstract long getLong(long offset); - - public abstract void putLong(long offset, long value); - - public abstract float getFloat(long offset); - - public abstract void putFloat(long offset, float value); - - public abstract double getDouble(long offset); - - public abstract void putDouble(long offset, double value); - - public static final void copyMemory( - MemoryBlock src, long srcOffset, MemoryBlock dst, long dstOffset, long length) { - assert(srcOffset + length <= src.length && dstOffset + length <= dst.length); - Platform.copyMemory(src.getBaseObject(), src.getBaseOffset() + srcOffset, - dst.getBaseObject(), dst.getBaseOffset() + dstOffset, length); - } - - public static final void copyMemory(MemoryBlock src, MemoryBlock dst, long length) { - assert(length <= src.length && length <= dst.length); - Platform.copyMemory(src.getBaseObject(), src.getBaseOffset(), - dst.getBaseObject(), dst.getBaseOffset(), length); - } - - public final void copyFrom(Object src, long srcOffset, long dstOffset, long length) { - assert(length <= this.length - srcOffset); - Platform.copyMemory(src, srcOffset, obj, offset + dstOffset, length); - } - - public final void writeTo(long srcOffset, Object dst, long dstOffset, long length) { - assert(length <= this.length - srcOffset); - Platform.copyMemory(obj, offset + srcOffset, dst, dstOffset, length); + public void fill(byte value) { + Platform.setMemory(obj, offset, length, value); } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/UTF8StringBuilderSuite.scala b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/MemoryLocation.java similarity index 51% rename from sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/UTF8StringBuilderSuite.scala rename to common/unsafe/src/main/java/org/apache/spark/unsafe/memory/MemoryLocation.java index 1b25a4b191f8..74ebc87dc978 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/UTF8StringBuilderSuite.scala +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/MemoryLocation.java @@ -1,42 +1,54 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.sql.catalyst.expressions.codegen - -import org.apache.spark.SparkFunSuite -import org.apache.spark.unsafe.types.UTF8String - -class UTF8StringBuilderSuite extends SparkFunSuite { - - test("basic test") { - val sb = new UTF8StringBuilder() - assert(sb.build() === UTF8String.EMPTY_UTF8) - - sb.append("") - assert(sb.build() === UTF8String.EMPTY_UTF8) - - sb.append("abcd") - assert(sb.build() === UTF8String.fromString("abcd")) - - sb.append(UTF8String.fromString("1234")) - assert(sb.build() === UTF8String.fromString("abcd1234")) - - // expect to grow an internal buffer - sb.append(UTF8String.fromString("efgijk567890")) - assert(sb.build() === UTF8String.fromString("abcd1234efgijk567890")) - } -} +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.unsafe.memory; + +import javax.annotation.Nullable; + +/** + * A memory location. Tracked either by a memory address (with off-heap allocation), + * or by an offset from a JVM object (in-heap allocation). + */ +public class MemoryLocation { + + @Nullable + Object obj; + + long offset; + + public MemoryLocation(@Nullable Object obj, long offset) { + this.obj = obj; + this.offset = offset; + } + + public MemoryLocation() { + this(null, 0); + } + + public void setObjAndOffset(Object newObj, long newOffset) { + this.obj = newObj; + this.offset = newOffset; + } + + public final Object getBaseObject() { + return obj; + } + + public final long getBaseOffset() { + return offset; + } +} diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/OffHeapMemoryBlock.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/OffHeapMemoryBlock.java deleted file mode 100644 index 3431b08980eb..000000000000 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/OffHeapMemoryBlock.java +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.unsafe.memory; - -import org.apache.spark.unsafe.Platform; - -public class OffHeapMemoryBlock extends MemoryBlock { - public static final OffHeapMemoryBlock NULL = new OffHeapMemoryBlock(0, 0); - - public OffHeapMemoryBlock(long address, long size) { - super(null, address, size); - } - - @Override - public MemoryBlock subBlock(long offset, long size) { - checkSubBlockRange(offset, size); - if (offset == 0 && size == this.size()) return this; - return new OffHeapMemoryBlock(this.offset + offset, size); - } - - @Override - public final int getInt(long offset) { - return Platform.getInt(null, this.offset + offset); - } - - @Override - public final void putInt(long offset, int value) { - Platform.putInt(null, this.offset + offset, value); - } - - @Override - public final boolean getBoolean(long offset) { - return Platform.getBoolean(null, this.offset + offset); - } - - @Override - public final void putBoolean(long offset, boolean value) { - Platform.putBoolean(null, this.offset + offset, value); - } - - @Override - public final byte getByte(long offset) { - return Platform.getByte(null, this.offset + offset); - } - - @Override - public final void putByte(long offset, byte value) { - Platform.putByte(null, this.offset + offset, value); - } - - @Override - public final short getShort(long offset) { - return Platform.getShort(null, this.offset + offset); - } - - @Override - public final void putShort(long offset, short value) { - Platform.putShort(null, this.offset + offset, value); - } - - @Override - public final long getLong(long offset) { - return Platform.getLong(null, this.offset + offset); - } - - @Override - public final void putLong(long offset, long value) { - Platform.putLong(null, this.offset + offset, value); - } - - @Override - public final float getFloat(long offset) { - return Platform.getFloat(null, this.offset + offset); - } - - @Override - public final void putFloat(long offset, float value) { - Platform.putFloat(null, this.offset + offset, value); - } - - @Override - public final double getDouble(long offset) { - return Platform.getDouble(null, this.offset + offset); - } - - @Override - public final void putDouble(long offset, double value) { - Platform.putDouble(null, this.offset + offset, value); - } -} diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/OnHeapMemoryBlock.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/OnHeapMemoryBlock.java deleted file mode 100644 index ee42bc27c9c5..000000000000 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/OnHeapMemoryBlock.java +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.unsafe.memory; - -import com.google.common.primitives.Ints; - -import org.apache.spark.unsafe.Platform; - -/** - * A consecutive block of memory with a long array on Java heap. - */ -public final class OnHeapMemoryBlock extends MemoryBlock { - - private final long[] array; - - public OnHeapMemoryBlock(long[] obj, long offset, long size) { - super(obj, offset, size); - this.array = obj; - assert(offset + size <= obj.length * 8L + Platform.LONG_ARRAY_OFFSET) : - "The sum of size " + size + " and offset " + offset + " should not be larger than " + - "the size of the given memory space " + (obj.length * 8L + Platform.LONG_ARRAY_OFFSET); - } - - public OnHeapMemoryBlock(long size) { - this(new long[Ints.checkedCast((size + 7) / 8)], Platform.LONG_ARRAY_OFFSET, size); - } - - @Override - public MemoryBlock subBlock(long offset, long size) { - checkSubBlockRange(offset, size); - if (offset == 0 && size == this.size()) return this; - return new OnHeapMemoryBlock(array, this.offset + offset, size); - } - - public long[] getLongArray() { return array; } - - /** - * Creates a memory block pointing to the memory used by the long array. - */ - public static OnHeapMemoryBlock fromArray(final long[] array) { - return new OnHeapMemoryBlock(array, Platform.LONG_ARRAY_OFFSET, array.length * 8L); - } - - public static OnHeapMemoryBlock fromArray(final long[] array, long size) { - return new OnHeapMemoryBlock(array, Platform.LONG_ARRAY_OFFSET, size); - } - - @Override - public int getInt(long offset) { - return Platform.getInt(array, this.offset + offset); - } - - @Override - public void putInt(long offset, int value) { - Platform.putInt(array, this.offset + offset, value); - } - - @Override - public boolean getBoolean(long offset) { - return Platform.getBoolean(array, this.offset + offset); - } - - @Override - public void putBoolean(long offset, boolean value) { - Platform.putBoolean(array, this.offset + offset, value); - } - - @Override - public byte getByte(long offset) { - return Platform.getByte(array, this.offset + offset); - } - - @Override - public void putByte(long offset, byte value) { - Platform.putByte(array, this.offset + offset, value); - } - - @Override - public short getShort(long offset) { - return Platform.getShort(array, this.offset + offset); - } - - @Override - public void putShort(long offset, short value) { - Platform.putShort(array, this.offset + offset, value); - } - - @Override - public long getLong(long offset) { - return Platform.getLong(array, this.offset + offset); - } - - @Override - public void putLong(long offset, long value) { - Platform.putLong(array, this.offset + offset, value); - } - - @Override - public float getFloat(long offset) { - return Platform.getFloat(array, this.offset + offset); - } - - @Override - public void putFloat(long offset, float value) { - Platform.putFloat(array, this.offset + offset, value); - } - - @Override - public double getDouble(long offset) { - return Platform.getDouble(array, this.offset + offset); - } - - @Override - public void putDouble(long offset, double value) { - Platform.putDouble(array, this.offset + offset, value); - } -} diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/UnsafeMemoryAllocator.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/UnsafeMemoryAllocator.java index 5310bdf2779a..4368fb615ba1 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/UnsafeMemoryAllocator.java +++ 
b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/UnsafeMemoryAllocator.java @@ -25,9 +25,9 @@ public class UnsafeMemoryAllocator implements MemoryAllocator { @Override - public OffHeapMemoryBlock allocate(long size) throws OutOfMemoryError { + public MemoryBlock allocate(long size) throws OutOfMemoryError { long address = Platform.allocateMemory(size); - OffHeapMemoryBlock memory = new OffHeapMemoryBlock(address, size); + MemoryBlock memory = new MemoryBlock(null, address, size); if (MemoryAllocator.MEMORY_DEBUG_FILL_ENABLED) { memory.fill(MemoryAllocator.MEMORY_DEBUG_FILL_CLEAN_VALUE); } @@ -36,25 +36,22 @@ public OffHeapMemoryBlock allocate(long size) throws OutOfMemoryError { @Override public void free(MemoryBlock memory) { - assert(memory instanceof OffHeapMemoryBlock) : - "UnsafeMemoryAllocator can only free OffHeapMemoryBlock."; - if (memory == OffHeapMemoryBlock.NULL) return; - assert (memory.getPageNumber() != MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER) : + assert (memory.obj == null) : + "baseObject not null; are you trying to use the off-heap allocator to free on-heap memory?"; + assert (memory.pageNumber != MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER) : "page has already been freed"; - assert ((memory.getPageNumber() == MemoryBlock.NO_PAGE_NUMBER) - || (memory.getPageNumber() == MemoryBlock.FREED_IN_TMM_PAGE_NUMBER)) : + assert ((memory.pageNumber == MemoryBlock.NO_PAGE_NUMBER) + || (memory.pageNumber == MemoryBlock.FREED_IN_TMM_PAGE_NUMBER)) : "TMM-allocated pages must be freed via TMM.freePage(), not directly in allocator free()"; if (MemoryAllocator.MEMORY_DEBUG_FILL_ENABLED) { memory.fill(MemoryAllocator.MEMORY_DEBUG_FILL_FREED_VALUE); } - Platform.freeMemory(memory.offset); - // As an additional layer of defense against use-after-free bugs, we mutate the // MemoryBlock to reset its pointer. - memory.resetObjAndOffset(); + memory.offset = 0; // Mark the page as freed (so we can detect double-frees). 
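For the off-heap path shown in the UnsafeMemoryAllocator hunk above, the base object is null and the offset is a raw native address. An illustrative sketch (not part of the patch; names are mine):

```java
import org.apache.spark.unsafe.Platform;
import org.apache.spark.unsafe.memory.MemoryAllocator;
import org.apache.spark.unsafe.memory.MemoryBlock;

public class OffHeapAllocSketch {
  public static void main(String[] args) {
    MemoryBlock page = MemoryAllocator.UNSAFE.allocate(4096);
    // Off-heap: getBaseObject() is null, getBaseOffset() is the raw address.
    Platform.putInt(null, page.getBaseOffset(), 7);
    int v = Platform.getInt(null, page.getBaseOffset());
    System.out.println(v); // 7
    // free() releases the native memory, resets the offset, and marks the page freed.
    MemoryAllocator.UNSAFE.free(page);
  }
}
```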
- memory.setPageNumber(MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER); + memory.pageNumber = MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER; } } diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index e91fc4391425..3a3bfc4a94bb 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -34,8 +34,6 @@ import org.apache.spark.unsafe.Platform; import org.apache.spark.unsafe.array.ByteArrayMethods; import org.apache.spark.unsafe.hash.Murmur3_x86_32; -import org.apache.spark.unsafe.memory.ByteArrayMemoryBlock; -import org.apache.spark.unsafe.memory.MemoryBlock; import static org.apache.spark.unsafe.Platform.*; @@ -53,13 +51,12 @@ public final class UTF8String implements Comparable, Externalizable, // These are only updated by readExternal() or read() @Nonnull - private MemoryBlock base; - // While numBytes has the same value as base.size(), to keep as int avoids cast from long to int + private Object base; + private long offset; private int numBytes; - public MemoryBlock getMemoryBlock() { return base; } - public Object getBaseObject() { return base.getBaseObject(); } - public long getBaseOffset() { return base.getBaseOffset(); } + public Object getBaseObject() { return base; } + public long getBaseOffset() { return offset; } /** * A char in UTF-8 encoding can take 1-4 bytes depending on the first byte which @@ -112,8 +109,7 @@ public final class UTF8String implements Comparable, Externalizable, */ public static UTF8String fromBytes(byte[] bytes) { if (bytes != null) { - return new UTF8String( - new ByteArrayMemoryBlock(bytes, BYTE_ARRAY_OFFSET, bytes.length)); + return new UTF8String(bytes, BYTE_ARRAY_OFFSET, bytes.length); } else { return null; } @@ -126,13 +122,19 @@ public static UTF8String fromBytes(byte[] bytes) { */ public static UTF8String fromBytes(byte[] bytes, int offset, int numBytes) { if (bytes != null) { - return new UTF8String( - new ByteArrayMemoryBlock(bytes, BYTE_ARRAY_OFFSET + offset, numBytes)); + return new UTF8String(bytes, BYTE_ARRAY_OFFSET + offset, numBytes); } else { return null; } } + /** + * Creates an UTF8String from given address (base and offset) and length. + */ + public static UTF8String fromAddress(Object base, long offset, int numBytes) { + return new UTF8String(base, offset, numBytes); + } + /** * Creates an UTF8String from String. */ @@ -149,13 +151,16 @@ public static UTF8String blankString(int length) { return fromBytes(spaces); } - public UTF8String(MemoryBlock base) { + protected UTF8String(Object base, long offset, int numBytes) { this.base = base; - this.numBytes = Ints.checkedCast(base.size()); + this.offset = offset; + this.numBytes = numBytes; } // for serialization - public UTF8String() {} + public UTF8String() { + this(null, 0, 0); + } /** * Writes the content of this string into a memory address, identified by an object and an offset. @@ -163,7 +168,7 @@ public UTF8String() {} * bytes in this string. 
*/ public void writeToMemory(Object target, long targetOffset) { - base.writeTo(0, target, targetOffset, numBytes); + Platform.copyMemory(base, offset, target, targetOffset, numBytes); } public void writeTo(ByteBuffer buffer) { @@ -183,9 +188,8 @@ public void writeTo(ByteBuffer buffer) { */ @Nonnull public ByteBuffer getByteBuffer() { - long offset = base.getBaseOffset(); - if (base instanceof ByteArrayMemoryBlock && offset >= BYTE_ARRAY_OFFSET) { - final byte[] bytes = ((ByteArrayMemoryBlock) base).getByteArray(); + if (base instanceof byte[] && offset >= BYTE_ARRAY_OFFSET) { + final byte[] bytes = (byte[]) base; // the offset includes an object header... this is only needed for unsafe copies final long arrayOffset = offset - BYTE_ARRAY_OFFSET; @@ -252,12 +256,12 @@ public long getPrefix() { long mask = 0; if (IS_LITTLE_ENDIAN) { if (numBytes >= 8) { - p = base.getLong(0); + p = Platform.getLong(base, offset); } else if (numBytes > 4) { - p = base.getLong(0); + p = Platform.getLong(base, offset); mask = (1L << (8 - numBytes) * 8) - 1; } else if (numBytes > 0) { - p = (long) base.getInt(0); + p = (long) Platform.getInt(base, offset); mask = (1L << (8 - numBytes) * 8) - 1; } else { p = 0; @@ -266,12 +270,12 @@ public long getPrefix() { } else { // byteOrder == ByteOrder.BIG_ENDIAN if (numBytes >= 8) { - p = base.getLong(0); + p = Platform.getLong(base, offset); } else if (numBytes > 4) { - p = base.getLong(0); + p = Platform.getLong(base, offset); mask = (1L << (8 - numBytes) * 8) - 1; } else if (numBytes > 0) { - p = ((long) base.getInt(0)) << 32; + p = ((long) Platform.getInt(base, offset)) << 32; mask = (1L << (8 - numBytes) * 8) - 1; } else { p = 0; @@ -286,13 +290,12 @@ public long getPrefix() { */ public byte[] getBytes() { // avoid copy if `base` is `byte[]` - long offset = base.getBaseOffset(); - if (offset == BYTE_ARRAY_OFFSET && base instanceof ByteArrayMemoryBlock - && (((ByteArrayMemoryBlock) base).getByteArray()).length == numBytes) { - return ((ByteArrayMemoryBlock) base).getByteArray(); + if (offset == BYTE_ARRAY_OFFSET && base instanceof byte[] + && ((byte[]) base).length == numBytes) { + return (byte[]) base; } else { byte[] bytes = new byte[numBytes]; - base.writeTo(0, bytes, BYTE_ARRAY_OFFSET, numBytes); + copyMemory(base, offset, bytes, BYTE_ARRAY_OFFSET, numBytes); return bytes; } } @@ -322,7 +325,7 @@ public UTF8String substring(final int start, final int until) { if (i > j) { byte[] bytes = new byte[i - j]; - base.writeTo(j, bytes, BYTE_ARRAY_OFFSET, i - j); + copyMemory(base, offset + j, bytes, BYTE_ARRAY_OFFSET, i - j); return fromBytes(bytes); } else { return EMPTY_UTF8; @@ -363,14 +366,14 @@ public boolean contains(final UTF8String substring) { * Returns the byte at position `i`. 
*/ private byte getByte(int i) { - return base.getByte(i); + return Platform.getByte(base, offset + i); } private boolean matchAt(final UTF8String s, int pos) { if (s.numBytes + pos > numBytes || pos < 0) { return false; } - return ByteArrayMethods.arrayEqualsBlock(base, pos, s.base, 0, s.numBytes); + return ByteArrayMethods.arrayEquals(base, offset + pos, s.base, s.offset, s.numBytes); } public boolean startsWith(final UTF8String prefix) { @@ -497,7 +500,8 @@ public int findInSet(UTF8String match) { for (int i = 0; i < numBytes; i++) { if (getByte(i) == (byte) ',') { if (i - (lastComma + 1) == match.numBytes && - ByteArrayMethods.arrayEqualsBlock(base, lastComma + 1, match.base, 0, match.numBytes)) { + ByteArrayMethods.arrayEquals(base, offset + (lastComma + 1), match.base, match.offset, + match.numBytes)) { return n; } lastComma = i; @@ -505,7 +509,8 @@ public int findInSet(UTF8String match) { } } if (numBytes - (lastComma + 1) == match.numBytes && - ByteArrayMethods.arrayEqualsBlock(base, lastComma + 1, match.base, 0, match.numBytes)) { + ByteArrayMethods.arrayEquals(base, offset + (lastComma + 1), match.base, match.offset, + match.numBytes)) { return n; } return 0; @@ -520,7 +525,7 @@ public int findInSet(UTF8String match) { private UTF8String copyUTF8String(int start, int end) { int len = end - start + 1; byte[] newBytes = new byte[len]; - base.writeTo(start, newBytes, BYTE_ARRAY_OFFSET, len); + copyMemory(base, offset + start, newBytes, BYTE_ARRAY_OFFSET, len); return UTF8String.fromBytes(newBytes); } @@ -667,7 +672,8 @@ public UTF8String reverse() { int i = 0; // position in byte while (i < numBytes) { int len = numBytesForFirstByte(getByte(i)); - base.writeTo(i, result, BYTE_ARRAY_OFFSET + result.length - i - len, len); + copyMemory(this.base, this.offset + i, result, + BYTE_ARRAY_OFFSET + result.length - i - len, len); i += len; } @@ -681,7 +687,7 @@ public UTF8String repeat(int times) { } byte[] newBytes = new byte[numBytes * times]; - base.writeTo(0, newBytes, BYTE_ARRAY_OFFSET, numBytes); + copyMemory(this.base, this.offset, newBytes, BYTE_ARRAY_OFFSET, numBytes); int copied = 1; while (copied < times) { @@ -718,7 +724,7 @@ public int indexOf(UTF8String v, int start) { if (i + v.numBytes > numBytes) { return -1; } - if (ByteArrayMethods.arrayEqualsBlock(base, i, v.base, 0, v.numBytes)) { + if (ByteArrayMethods.arrayEquals(base, offset + i, v.base, v.offset, v.numBytes)) { return c; } i += numBytesForFirstByte(getByte(i)); @@ -734,7 +740,7 @@ public int indexOf(UTF8String v, int start) { private int find(UTF8String str, int start) { assert (str.numBytes > 0); while (start <= numBytes - str.numBytes) { - if (ByteArrayMethods.arrayEqualsBlock(base, start, str.base, 0, str.numBytes)) { + if (ByteArrayMethods.arrayEquals(base, offset + start, str.base, str.offset, str.numBytes)) { return start; } start += 1; @@ -748,7 +754,7 @@ private int find(UTF8String str, int start) { private int rfind(UTF8String str, int start) { assert (str.numBytes > 0); while (start >= 0) { - if (ByteArrayMethods.arrayEqualsBlock(base, start, str.base, 0, str.numBytes)) { + if (ByteArrayMethods.arrayEquals(base, offset + start, str.base, str.offset, str.numBytes)) { return start; } start -= 1; @@ -781,7 +787,7 @@ public UTF8String subStringIndex(UTF8String delim, int count) { return EMPTY_UTF8; } byte[] bytes = new byte[idx]; - base.writeTo(0, bytes, BYTE_ARRAY_OFFSET, idx); + copyMemory(base, offset, bytes, BYTE_ARRAY_OFFSET, idx); return fromBytes(bytes); } else { @@ -801,7 +807,7 @@ public UTF8String 
subStringIndex(UTF8String delim, int count) { } int size = numBytes - delim.numBytes - idx; byte[] bytes = new byte[size]; - base.writeTo(idx + delim.numBytes, bytes, BYTE_ARRAY_OFFSET, size); + copyMemory(base, offset + idx + delim.numBytes, bytes, BYTE_ARRAY_OFFSET, size); return fromBytes(bytes); } } @@ -824,15 +830,15 @@ public UTF8String rpad(int len, UTF8String pad) { UTF8String remain = pad.substring(0, spaces - padChars * count); byte[] data = new byte[this.numBytes + pad.numBytes * count + remain.numBytes]; - base.writeTo(0, data, BYTE_ARRAY_OFFSET, this.numBytes); + copyMemory(this.base, this.offset, data, BYTE_ARRAY_OFFSET, this.numBytes); int offset = this.numBytes; int idx = 0; while (idx < count) { - pad.base.writeTo(0, data, BYTE_ARRAY_OFFSET + offset, pad.numBytes); + copyMemory(pad.base, pad.offset, data, BYTE_ARRAY_OFFSET + offset, pad.numBytes); ++ idx; offset += pad.numBytes; } - remain.base.writeTo(0, data, BYTE_ARRAY_OFFSET + offset, remain.numBytes); + copyMemory(remain.base, remain.offset, data, BYTE_ARRAY_OFFSET + offset, remain.numBytes); return UTF8String.fromBytes(data); } @@ -860,13 +866,13 @@ public UTF8String lpad(int len, UTF8String pad) { int offset = 0; int idx = 0; while (idx < count) { - pad.base.writeTo(0, data, BYTE_ARRAY_OFFSET + offset, pad.numBytes); + copyMemory(pad.base, pad.offset, data, BYTE_ARRAY_OFFSET + offset, pad.numBytes); ++ idx; offset += pad.numBytes; } - remain.base.writeTo(0, data, BYTE_ARRAY_OFFSET + offset, remain.numBytes); + copyMemory(remain.base, remain.offset, data, BYTE_ARRAY_OFFSET + offset, remain.numBytes); offset += remain.numBytes; - base.writeTo(0, data, BYTE_ARRAY_OFFSET + offset, numBytes()); + copyMemory(this.base, this.offset, data, BYTE_ARRAY_OFFSET + offset, numBytes()); return UTF8String.fromBytes(data); } @@ -891,8 +897,8 @@ public static UTF8String concat(UTF8String... inputs) { int offset = 0; for (int i = 0; i < inputs.length; i++) { int len = inputs[i].numBytes; - inputs[i].base.writeTo( - 0, + copyMemory( + inputs[i].base, inputs[i].offset, result, BYTE_ARRAY_OFFSET + offset, len); offset += len; @@ -931,8 +937,8 @@ public static UTF8String concatWs(UTF8String separator, UTF8String... inputs) { for (int i = 0, j = 0; i < inputs.length; i++) { if (inputs[i] != null) { int len = inputs[i].numBytes; - inputs[i].base.writeTo( - 0, + copyMemory( + inputs[i].base, inputs[i].offset, result, BYTE_ARRAY_OFFSET + offset, len); offset += len; @@ -940,8 +946,8 @@ public static UTF8String concatWs(UTF8String separator, UTF8String... inputs) { j++; // Add separator if this is not the last input. if (j < numInputs) { - separator.base.writeTo( - 0, + copyMemory( + separator.base, separator.offset, result, BYTE_ARRAY_OFFSET + offset, separator.numBytes); offset += separator.numBytes; @@ -952,6 +958,12 @@ public static UTF8String concatWs(UTF8String separator, UTF8String... inputs) { } public UTF8String[] split(UTF8String pattern, int limit) { + // Java String's split method supports "ignore empty string" behavior when the limit is 0 + // whereas other languages do not. To avoid this java specific behavior, we fall back to + // -1 when the limit is 0. 
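The comment above (and the one-line remapping that follows it) is easiest to see with plain java.lang.String; a quick illustration, independent of the patch itself:

```java
import java.util.Arrays;

public class SplitLimitSketch {
  public static void main(String[] args) {
    // With limit 0, java.lang.String drops trailing empty strings:
    System.out.println(Arrays.toString("ab,def,ghi,".split(",", 0)));  // [ab, def, ghi]
    // With limit -1 it keeps them, which is the behavior UTF8String.split
    // now exposes for limit 0 as well:
    System.out.println(Arrays.toString("ab,def,ghi,".split(",", -1))); // [ab, def, ghi, ]
  }
}
```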
+ if (limit == 0) { + limit = -1; + } String[] splits = toString().split(pattern.toString(), limit); UTF8String[] res = new UTF8String[splits.length]; for (int i = 0; i < res.length; i++) { @@ -1215,7 +1227,7 @@ public UTF8String clone() { public UTF8String copy() { byte[] bytes = new byte[numBytes]; - base.writeTo(0, bytes, BYTE_ARRAY_OFFSET, numBytes); + copyMemory(base, offset, bytes, BYTE_ARRAY_OFFSET, numBytes); return fromBytes(bytes); } @@ -1223,10 +1235,11 @@ public UTF8String copy() { public int compareTo(@Nonnull final UTF8String other) { int len = Math.min(numBytes, other.numBytes); int wordMax = (len / 8) * 8; - MemoryBlock rbase = other.base; + long roffset = other.offset; + Object rbase = other.base; for (int i = 0; i < wordMax; i += 8) { - long left = base.getLong(i); - long right = rbase.getLong(i); + long left = getLong(base, offset + i); + long right = getLong(rbase, roffset + i); if (left != right) { if (IS_LITTLE_ENDIAN) { return Long.compareUnsigned(Long.reverseBytes(left), Long.reverseBytes(right)); @@ -1237,7 +1250,7 @@ public int compareTo(@Nonnull final UTF8String other) { } for (int i = wordMax; i < len; i++) { // In UTF-8, the byte should be unsigned, so we should compare them as unsigned int. - int res = (getByte(i) & 0xFF) - (rbase.getByte(i) & 0xFF); + int res = (getByte(i) & 0xFF) - (Platform.getByte(rbase, roffset + i) & 0xFF); if (res != 0) { return res; } @@ -1256,7 +1269,7 @@ public boolean equals(final Object other) { if (numBytes != o.numBytes) { return false; } - return ByteArrayMethods.arrayEqualsBlock(base, 0, o.base, 0, numBytes); + return ByteArrayMethods.arrayEquals(base, offset, o.base, o.offset, numBytes); } else { return false; } @@ -1312,8 +1325,8 @@ public int levenshteinDistance(UTF8String other) { num_bytes_j != numBytesForFirstByte(s.getByte(i_bytes))) { cost = 1; } else { - cost = (ByteArrayMethods.arrayEqualsBlock(t.base, j_bytes, s.base, - i_bytes, num_bytes_j)) ? 0 : 1; + cost = (ByteArrayMethods.arrayEquals(t.base, t.offset + j_bytes, s.base, + s.offset + i_bytes, num_bytes_j)) ? 
0 : 1; } d[i + 1] = Math.min(Math.min(d[i] + 1, p[i + 1] + 1), p[i] + cost); } @@ -1328,7 +1341,7 @@ public int levenshteinDistance(UTF8String other) { @Override public int hashCode() { - return Murmur3_x86_32.hashUnsafeBytesBlock(base,42); + return Murmur3_x86_32.hashUnsafeBytes(base, offset, numBytes, 42); } /** @@ -1391,10 +1404,10 @@ public void writeExternal(ObjectOutput out) throws IOException { } public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException { + offset = BYTE_ARRAY_OFFSET; numBytes = in.readInt(); - byte[] bytes = new byte[numBytes]; - in.readFully(bytes); - base = ByteArrayMemoryBlock.fromArray(bytes); + base = new byte[numBytes]; + in.readFully((byte[]) base); } @Override @@ -1406,10 +1419,10 @@ public void write(Kryo kryo, Output out) { @Override public void read(Kryo kryo, Input in) { - numBytes = in.readInt(); - byte[] bytes = new byte[numBytes]; - in.read(bytes); - base = ByteArrayMemoryBlock.fromArray(bytes); + this.offset = BYTE_ARRAY_OFFSET; + this.numBytes = in.readInt(); + this.base = new byte[numBytes]; + in.read((byte[]) base); } } diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/PlatformUtilSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/PlatformUtilSuite.java index 583a148b3845..3ad9ac7b4de9 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/PlatformUtilSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/PlatformUtilSuite.java @@ -81,7 +81,7 @@ public void freeingOnHeapMemoryBlockResetsBaseObjectAndOffset() { MemoryAllocator.HEAP.free(block); Assert.assertNull(block.getBaseObject()); Assert.assertEquals(0, block.getBaseOffset()); - Assert.assertEquals(MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER, block.getPageNumber()); + Assert.assertEquals(MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER, block.pageNumber); } @Test @@ -92,7 +92,7 @@ public void freeingOffHeapMemoryBlockResetsOffset() { MemoryAllocator.UNSAFE.free(block); Assert.assertNull(block.getBaseObject()); Assert.assertEquals(0, block.getBaseOffset()); - Assert.assertEquals(MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER, block.getPageNumber()); + Assert.assertEquals(MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER, block.pageNumber); } @Test(expected = AssertionError.class) diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/array/LongArraySuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/array/LongArraySuite.java index 8c2e98c2bfc5..fb8e53b3348f 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/array/LongArraySuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/array/LongArraySuite.java @@ -20,13 +20,14 @@ import org.junit.Assert; import org.junit.Test; -import org.apache.spark.unsafe.memory.OnHeapMemoryBlock; +import org.apache.spark.unsafe.memory.MemoryBlock; public class LongArraySuite { @Test public void basicTest() { - LongArray arr = new LongArray(new OnHeapMemoryBlock(16)); + long[] bytes = new long[2]; + LongArray arr = new LongArray(MemoryBlock.fromLongArray(bytes)); arr.set(0, 1L); arr.set(1, 2L); arr.set(1, 3L); diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/hash/Murmur3_x86_32Suite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/hash/Murmur3_x86_32Suite.java index d9898771720a..6348a73bf389 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/hash/Murmur3_x86_32Suite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/hash/Murmur3_x86_32Suite.java @@ -70,24 +70,6 @@ public void testKnownBytesInputs() { 
Murmur3_x86_32.hashUnsafeBytes2(tes, Platform.BYTE_ARRAY_OFFSET, tes.length, 0)); } - @Test - public void testKnownWordsInputs() { - byte[] bytes = new byte[16]; - long offset = Platform.BYTE_ARRAY_OFFSET; - for (int i = 0; i < 16; i++) { - bytes[i] = 0; - } - Assert.assertEquals(-300363099, Murmur3_x86_32.hashUnsafeWords(bytes, offset, 16, 42)); - for (int i = 0; i < 16; i++) { - bytes[i] = -1; - } - Assert.assertEquals(-1210324667, Murmur3_x86_32.hashUnsafeWords(bytes, offset, 16, 42)); - for (int i = 0; i < 16; i++) { - bytes[i] = (byte)i; - } - Assert.assertEquals(-634919701, Murmur3_x86_32.hashUnsafeWords(bytes, offset, 16, 42)); - } - @Test public void randomizedStressTest() { int size = 65536; diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/memory/MemoryBlockSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/memory/MemoryBlockSuite.java deleted file mode 100644 index ef5ff8ee70ec..000000000000 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/memory/MemoryBlockSuite.java +++ /dev/null @@ -1,179 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.unsafe.memory; - -import org.apache.spark.unsafe.Platform; -import org.junit.Assert; -import org.junit.Test; - -import java.nio.ByteOrder; - -import static org.hamcrest.core.StringContains.containsString; - -public class MemoryBlockSuite { - private static final boolean bigEndianPlatform = - ByteOrder.nativeOrder().equals(ByteOrder.BIG_ENDIAN); - - private void check(MemoryBlock memory, Object obj, long offset, int length) { - memory.setPageNumber(1); - memory.fill((byte)-1); - memory.putBoolean(0, true); - memory.putByte(1, (byte)127); - memory.putShort(2, (short)257); - memory.putInt(4, 0x20000002); - memory.putLong(8, 0x1234567089ABCDEFL); - memory.putFloat(16, 1.0F); - memory.putLong(20, 0x1234567089ABCDEFL); - memory.putDouble(28, 2.0); - MemoryBlock.copyMemory(memory, 0L, memory, 36, 4); - int[] a = new int[2]; - a[0] = 0x12345678; - a[1] = 0x13579BDF; - memory.copyFrom(a, Platform.INT_ARRAY_OFFSET, 40, 8); - byte[] b = new byte[8]; - memory.writeTo(40, b, Platform.BYTE_ARRAY_OFFSET, 8); - - Assert.assertEquals(obj, memory.getBaseObject()); - Assert.assertEquals(offset, memory.getBaseOffset()); - Assert.assertEquals(length, memory.size()); - Assert.assertEquals(1, memory.getPageNumber()); - Assert.assertEquals(true, memory.getBoolean(0)); - Assert.assertEquals((byte)127, memory.getByte(1 )); - Assert.assertEquals((short)257, memory.getShort(2)); - Assert.assertEquals(0x20000002, memory.getInt(4)); - Assert.assertEquals(0x1234567089ABCDEFL, memory.getLong(8)); - Assert.assertEquals(1.0F, memory.getFloat(16), 0); - Assert.assertEquals(0x1234567089ABCDEFL, memory.getLong(20)); - Assert.assertEquals(2.0, memory.getDouble(28), 0); - Assert.assertEquals(true, memory.getBoolean(36)); - Assert.assertEquals((byte)127, memory.getByte(37 )); - Assert.assertEquals((short)257, memory.getShort(38)); - Assert.assertEquals(a[0], memory.getInt(40)); - Assert.assertEquals(a[1], memory.getInt(44)); - if (bigEndianPlatform) { - Assert.assertEquals(a[0], - ((int)b[0] & 0xff) << 24 | ((int)b[1] & 0xff) << 16 | - ((int)b[2] & 0xff) << 8 | ((int)b[3] & 0xff)); - Assert.assertEquals(a[1], - ((int)b[4] & 0xff) << 24 | ((int)b[5] & 0xff) << 16 | - ((int)b[6] & 0xff) << 8 | ((int)b[7] & 0xff)); - } else { - Assert.assertEquals(a[0], - ((int)b[3] & 0xff) << 24 | ((int)b[2] & 0xff) << 16 | - ((int)b[1] & 0xff) << 8 | ((int)b[0] & 0xff)); - Assert.assertEquals(a[1], - ((int)b[7] & 0xff) << 24 | ((int)b[6] & 0xff) << 16 | - ((int)b[5] & 0xff) << 8 | ((int)b[4] & 0xff)); - } - for (int i = 48; i < memory.size(); i++) { - Assert.assertEquals((byte) -1, memory.getByte(i)); - } - - assert(memory.subBlock(0, memory.size()) == memory); - - try { - memory.subBlock(-8, 8); - Assert.fail(); - } catch (Exception expected) { - Assert.assertThat(expected.getMessage(), containsString("non-negative")); - } - - try { - memory.subBlock(0, -8); - Assert.fail(); - } catch (Exception expected) { - Assert.assertThat(expected.getMessage(), containsString("non-negative")); - } - - try { - memory.subBlock(0, length + 8); - Assert.fail(); - } catch (Exception expected) { - Assert.assertThat(expected.getMessage(), containsString("should not be larger than")); - } - - try { - memory.subBlock(8, length - 4); - Assert.fail(); - } catch (Exception expected) { - Assert.assertThat(expected.getMessage(), containsString("should not be larger than")); - } - - try { - memory.subBlock(length + 8, 4); - Assert.fail(); - } catch (Exception expected) { - Assert.assertThat(expected.getMessage(), containsString("should not 
be larger than")); - } - - memory.setPageNumber(MemoryBlock.NO_PAGE_NUMBER); - } - - @Test - public void testByteArrayMemoryBlock() { - byte[] obj = new byte[56]; - long offset = Platform.BYTE_ARRAY_OFFSET; - int length = obj.length; - - MemoryBlock memory = new ByteArrayMemoryBlock(obj, offset, length); - check(memory, obj, offset, length); - - memory = ByteArrayMemoryBlock.fromArray(obj); - check(memory, obj, offset, length); - - obj = new byte[112]; - memory = new ByteArrayMemoryBlock(obj, offset, length); - check(memory, obj, offset, length); - } - - @Test - public void testOnHeapMemoryBlock() { - long[] obj = new long[7]; - long offset = Platform.LONG_ARRAY_OFFSET; - int length = obj.length * 8; - - MemoryBlock memory = new OnHeapMemoryBlock(obj, offset, length); - check(memory, obj, offset, length); - - memory = OnHeapMemoryBlock.fromArray(obj); - check(memory, obj, offset, length); - - obj = new long[14]; - memory = new OnHeapMemoryBlock(obj, offset, length); - check(memory, obj, offset, length); - } - - @Test - public void testOffHeapArrayMemoryBlock() { - MemoryAllocator memoryAllocator = new UnsafeMemoryAllocator(); - MemoryBlock memory = memoryAllocator.allocate(56); - Object obj = memory.getBaseObject(); - long offset = memory.getBaseOffset(); - int length = 56; - - check(memory, obj, offset, length); - memoryAllocator.free(memory); - - long address = Platform.allocateMemory(112); - memory = new OffHeapMemoryBlock(address, length); - obj = memory.getBaseObject(); - offset = memory.getBaseOffset(); - check(memory, obj, offset, length); - Platform.freeMemory(address); - } -} diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java index 42dda3048070..cf9cc6b1800a 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java @@ -25,8 +25,7 @@ import java.util.*; import com.google.common.collect.ImmutableMap; -import org.apache.spark.unsafe.memory.ByteArrayMemoryBlock; -import org.apache.spark.unsafe.memory.OnHeapMemoryBlock; +import org.apache.spark.unsafe.Platform; import org.junit.Test; import static org.junit.Assert.*; @@ -394,12 +393,14 @@ public void substringSQL() { @Test public void split() { - assertTrue(Arrays.equals(fromString("ab,def,ghi").split(fromString(","), -1), - new UTF8String[]{fromString("ab"), fromString("def"), fromString("ghi")})); - assertTrue(Arrays.equals(fromString("ab,def,ghi").split(fromString(","), 2), - new UTF8String[]{fromString("ab"), fromString("def,ghi")})); - assertTrue(Arrays.equals(fromString("ab,def,ghi").split(fromString(","), 2), - new UTF8String[]{fromString("ab"), fromString("def,ghi")})); + UTF8String[] negativeAndZeroLimitCase = + new UTF8String[]{fromString("ab"), fromString("def"), fromString("ghi"), fromString("")}; + assertTrue(Arrays.equals(fromString("ab,def,ghi,").split(fromString(","), 0), + negativeAndZeroLimitCase)); + assertTrue(Arrays.equals(fromString("ab,def,ghi,").split(fromString(","), -1), + negativeAndZeroLimitCase)); + assertTrue(Arrays.equals(fromString("ab,def,ghi,").split(fromString(","), 2), + new UTF8String[]{fromString("ab"), fromString("def,ghi,")})); } @Test @@ -513,6 +514,21 @@ public void soundex() { assertEquals(fromString("世界千世").soundex(), fromString("世界千世")); } + @Test + public void writeToOutputStreamUnderflow() throws IOException { + // offset underflow is apparently 
supported? + final ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + final byte[] test = "01234567".getBytes(StandardCharsets.UTF_8); + + for (int i = 1; i <= Platform.BYTE_ARRAY_OFFSET; ++i) { + UTF8String.fromAddress(test, Platform.BYTE_ARRAY_OFFSET - i, test.length + i) + .writeTo(outputStream); + final ByteBuffer buffer = ByteBuffer.wrap(outputStream.toByteArray(), i, test.length); + assertEquals("01234567", StandardCharsets.UTF_8.decode(buffer).toString()); + outputStream.reset(); + } + } + @Test public void writeToOutputStreamSlice() throws IOException { final ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); @@ -520,7 +536,7 @@ public void writeToOutputStreamSlice() throws IOException { for (int i = 0; i < test.length; ++i) { for (int j = 0; j < test.length - i; ++j) { - new UTF8String(ByteArrayMemoryBlock.fromArray(test).subBlock(i, j)) + UTF8String.fromAddress(test, Platform.BYTE_ARRAY_OFFSET + i, j) .writeTo(outputStream); assertArrayEquals(Arrays.copyOfRange(test, i, i + j), outputStream.toByteArray()); @@ -551,7 +567,7 @@ public void writeToOutputStreamOverflow() throws IOException { for (final long offset : offsets) { try { - new UTF8String(ByteArrayMemoryBlock.fromArray(test).subBlock(offset, test.length)) + fromAddress(test, BYTE_ARRAY_OFFSET + offset, test.length) .writeTo(outputStream); throw new IllegalStateException(Long.toString(offset)); @@ -578,25 +594,26 @@ public void writeToOutputStream() throws IOException { } @Test - public void writeToOutputStreamLongArray() throws IOException { + public void writeToOutputStreamIntArray() throws IOException { // verify that writes work on objects that are not byte arrays - final ByteBuffer buffer = StandardCharsets.UTF_8.encode("3千大千世界"); + final ByteBuffer buffer = StandardCharsets.UTF_8.encode("大千世界"); buffer.position(0); buffer.order(ByteOrder.nativeOrder()); final int length = buffer.limit(); - assertEquals(16, length); + assertEquals(12, length); - final int longs = length / 8; - final long[] array = new long[longs]; + final int ints = length / 4; + final int[] array = new int[ints]; - for (int i = 0; i < longs; ++i) { - array[i] = buffer.getLong(); + for (int i = 0; i < ints; ++i) { + array[i] = buffer.getInt(); } final ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); - new UTF8String(OnHeapMemoryBlock.fromArray(array)).writeTo(outputStream); - assertEquals("3千大千世界", outputStream.toString("UTF-8")); + fromAddress(array, Platform.INT_ARRAY_OFFSET, length) + .writeTo(outputStream); + assertEquals("大千世界", outputStream.toString("UTF-8")); } @Test diff --git a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala index 7d3331f44f01..9656951810da 100644 --- a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala +++ b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala @@ -63,6 +63,7 @@ class UTF8StringPropertyCheckSuite extends FunSuite with GeneratorDrivenProperty } } + // scalastyle:off caselocale test("toUpperCase") { forAll { (s: String) => assert(toUTF8(s).toUpperCase === toUTF8(s.toUpperCase)) @@ -74,6 +75,7 @@ class UTF8StringPropertyCheckSuite extends FunSuite with GeneratorDrivenProperty assert(toUTF8(s).toLowerCase === toUTF8(s.toLowerCase)) } } + // scalastyle:on caselocale test("compare") { forAll { (s1: String, s2: String) => diff --git a/core/pom.xml 
b/core/pom.xml index 5fa3a86de6b0..eff3aa1d1942 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../pom.xml diff --git a/core/src/main/java/org/apache/spark/ExecutorPlugin.java b/core/src/main/java/org/apache/spark/ExecutorPlugin.java new file mode 100644 index 000000000000..ec0b57f1a281 --- /dev/null +++ b/core/src/main/java/org/apache/spark/ExecutorPlugin.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark; + +import org.apache.spark.annotation.DeveloperApi; + +/** + * A plugin which can be automaticaly instantiated within each Spark executor. Users can specify + * plugins which should be created with the "spark.executor.plugins" configuration. An instance + * of each plugin will be created for every executor, including those created by dynamic allocation, + * before the executor starts running any tasks. + * + * The specific api exposed to the end users still considered to be very unstable. We will + * hopefully be able to keep compatability by providing default implementations for any methods + * added, but make no guarantees this will always be possible across all Spark releases. + * + * Spark does nothing to verify the plugin is doing legitimate things, or to manage the resources + * it uses. A plugin acquires the same privileges as the user running the task. A bad plugin + * could also intefere with task execution and make the executor fail in unexpected ways. + */ +@DeveloperApi +public interface ExecutorPlugin { + + /** + * Initialize the executor plugin. + * + *
Each executor will, during its initialization, invoke this method on each + * plugin provided in the spark.executor.plugins configuration. + * + * Plugins should create threads in their implementation of this method for + * any polling, blocking, or intensive computation. + */ + default void init() {} + + /** + * Clean up and terminate this plugin. + * + * This function is called during the executor shutdown phase. The executor + * will wait for the plugin to terminate before continuing its own shutdown.
+ */ + default void shutdown() {} +} diff --git a/core/src/main/java/org/apache/spark/SparkFirehoseListener.java b/core/src/main/java/org/apache/spark/SparkFirehoseListener.java index 94c5c11b61a5..731f6fc767df 100644 --- a/core/src/main/java/org/apache/spark/SparkFirehoseListener.java +++ b/core/src/main/java/org/apache/spark/SparkFirehoseListener.java @@ -103,6 +103,12 @@ public final void onExecutorMetricsUpdate( onEvent(executorMetricsUpdate); } + @Override + public final void onStageExecutorMetrics( + SparkListenerStageExecutorMetrics executorMetrics) { + onEvent(executorMetrics); + } + @Override public final void onExecutorAdded(SparkListenerExecutorAdded executorAdded) { onEvent(executorAdded); diff --git a/core/src/main/java/org/apache/spark/io/ReadAheadInputStream.java b/core/src/main/java/org/apache/spark/io/ReadAheadInputStream.java index 0cced9e22295..2e18715b600e 100644 --- a/core/src/main/java/org/apache/spark/io/ReadAheadInputStream.java +++ b/core/src/main/java/org/apache/spark/io/ReadAheadInputStream.java @@ -135,62 +135,58 @@ private void readAsync() throws IOException { } finally { stateChangeLock.unlock(); } - executorService.execute(new Runnable() { - - @Override - public void run() { - stateChangeLock.lock(); - try { - if (isClosed) { - readInProgress = false; - return; - } - // Flip this so that the close method will not close the underlying input stream when we - // are reading. - isReading = true; - } finally { - stateChangeLock.unlock(); + executorService.execute(() -> { + stateChangeLock.lock(); + try { + if (isClosed) { + readInProgress = false; + return; } + // Flip this so that the close method will not close the underlying input stream when we + // are reading. + isReading = true; + } finally { + stateChangeLock.unlock(); + } - // Please note that it is safe to release the lock and read into the read ahead buffer - // because either of following two conditions will hold - 1. The active buffer has - // data available to read so the reader will not read from the read ahead buffer. - // 2. This is the first time read is called or the active buffer is exhausted, - // in that case the reader waits for this async read to complete. - // So there is no race condition in both the situations. - int read = 0; - int off = 0, len = arr.length; - Throwable exception = null; - try { - // try to fill the read ahead buffer. - // if a reader is waiting, possibly return early. - do { - read = underlyingInputStream.read(arr, off, len); - if (read <= 0) break; - off += read; - len -= read; - } while (len > 0 && !isWaiting.get()); - } catch (Throwable ex) { - exception = ex; - if (ex instanceof Error) { - // `readException` may not be reported to the user. Rethrow Error to make sure at least - // The user can see Error in UncaughtExceptionHandler. - throw (Error) ex; - } - } finally { - stateChangeLock.lock(); - readAheadBuffer.limit(off); - if (read < 0 || (exception instanceof EOFException)) { - endOfStream = true; - } else if (exception != null) { - readAborted = true; - readException = exception; - } - readInProgress = false; - signalAsyncReadComplete(); - stateChangeLock.unlock(); - closeUnderlyingInputStreamIfNecessary(); + // Please note that it is safe to release the lock and read into the read ahead buffer + // because either of following two conditions will hold - 1. The active buffer has + // data available to read so the reader will not read from the read ahead buffer. + // 2. 
This is the first time read is called or the active buffer is exhausted, + // in that case the reader waits for this async read to complete. + // So there is no race condition in both the situations. + int read = 0; + int off = 0, len = arr.length; + Throwable exception = null; + try { + // try to fill the read ahead buffer. + // if a reader is waiting, possibly return early. + do { + read = underlyingInputStream.read(arr, off, len); + if (read <= 0) break; + off += read; + len -= read; + } while (len > 0 && !isWaiting.get()); + } catch (Throwable ex) { + exception = ex; + if (ex instanceof Error) { + // `readException` may not be reported to the user. Rethrow Error to make sure at least + // The user can see Error in UncaughtExceptionHandler. + throw (Error) ex; } + } finally { + stateChangeLock.lock(); + readAheadBuffer.limit(off); + if (read < 0 || (exception instanceof EOFException)) { + endOfStream = true; + } else if (exception != null) { + readAborted = true; + readException = exception; + } + readInProgress = false; + signalAsyncReadComplete(); + stateChangeLock.unlock(); + closeUnderlyingInputStreamIfNecessary(); } }); } diff --git a/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java b/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java index 8651a639c07f..d07faf1da124 100644 --- a/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java +++ b/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java @@ -311,7 +311,7 @@ public MemoryBlock allocatePage(long size, MemoryConsumer consumer) { // this could trigger spilling to free some pages. return allocatePage(size, consumer); } - page.setPageNumber(pageNumber); + page.pageNumber = pageNumber; pageTable[pageNumber] = page; if (logger.isTraceEnabled()) { logger.trace("Allocate page number {} ({} bytes)", pageNumber, acquired); @@ -323,25 +323,25 @@ public MemoryBlock allocatePage(long size, MemoryConsumer consumer) { * Free a block of memory allocated via {@link TaskMemoryManager#allocatePage}. */ public void freePage(MemoryBlock page, MemoryConsumer consumer) { - assert (page.getPageNumber() != MemoryBlock.NO_PAGE_NUMBER) : + assert (page.pageNumber != MemoryBlock.NO_PAGE_NUMBER) : "Called freePage() on memory that wasn't allocated with allocatePage()"; - assert (page.getPageNumber() != MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER) : + assert (page.pageNumber != MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER) : "Called freePage() on a memory block that has already been freed"; - assert (page.getPageNumber() != MemoryBlock.FREED_IN_TMM_PAGE_NUMBER) : + assert (page.pageNumber != MemoryBlock.FREED_IN_TMM_PAGE_NUMBER) : "Called freePage() on a memory block that has already been freed"; - assert(allocatedPages.get(page.getPageNumber())); - pageTable[page.getPageNumber()] = null; + assert(allocatedPages.get(page.pageNumber)); + pageTable[page.pageNumber] = null; synchronized (this) { - allocatedPages.clear(page.getPageNumber()); + allocatedPages.clear(page.pageNumber); } if (logger.isTraceEnabled()) { - logger.trace("Freed page number {} ({} bytes)", page.getPageNumber(), page.size()); + logger.trace("Freed page number {} ({} bytes)", page.pageNumber, page.size()); } long pageSize = page.size(); // Clear the page number before passing the block to the MemoryAllocator's free(). // Doing this allows the MemoryAllocator to detect when a TaskMemoryManager-managed // page has been inappropriately directly freed without calling TMM.freePage(). 
- page.setPageNumber(MemoryBlock.FREED_IN_TMM_PAGE_NUMBER); + page.pageNumber = MemoryBlock.FREED_IN_TMM_PAGE_NUMBER; memoryManager.tungstenMemoryAllocator().free(page); releaseExecutionMemory(pageSize, consumer); } @@ -363,7 +363,7 @@ public long encodePageNumberAndOffset(MemoryBlock page, long offsetInPage) { // relative to the page's base offset; this relative offset will fit in 51 bits. offsetInPage -= page.getBaseOffset(); } - return encodePageNumberAndOffset(page.getPageNumber(), offsetInPage); + return encodePageNumberAndOffset(page.pageNumber, offsetInPage); } @VisibleForTesting @@ -434,7 +434,7 @@ public long cleanUpAllAllocatedMemory() { for (MemoryBlock page : pageTable) { if (page != null) { logger.debug("unreleased page: " + page + " in task " + taskAttemptId); - page.setPageNumber(MemoryBlock.FREED_IN_TMM_PAGE_NUMBER); + page.pageNumber = MemoryBlock.FREED_IN_TMM_PAGE_NUMBER; memoryManager.tungstenMemoryAllocator().free(page); } } diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java b/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java index e3bd5496cf5b..b020a6d99247 100644 --- a/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java +++ b/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java @@ -125,7 +125,7 @@ public void write(Iterator> records) throws IOException { if (!records.hasNext()) { partitionLengths = new long[numPartitions]; shuffleBlockResolver.writeIndexFileAndCommit(shuffleId, mapId, partitionLengths, null); - mapStatus = MapStatus$.MODULE$.apply(blockManager.shuffleServerId(), partitionLengths, 0); + mapStatus = MapStatus$.MODULE$.apply(blockManager.shuffleServerId(), partitionLengths); return; } final SerializerInstance serInstance = serializer.newInstance(); @@ -152,9 +152,9 @@ public void write(Iterator> records) throws IOException { } for (int i = 0; i < numPartitions; i++) { - final DiskBlockObjectWriter writer = partitionWriters[i]; - partitionWriterSegments[i] = writer.commitAndGet(); - writer.close(); + try (DiskBlockObjectWriter writer = partitionWriters[i]) { + partitionWriterSegments[i] = writer.commitAndGet(); + } } File output = shuffleBlockResolver.getDataFile(shuffleId, mapId); @@ -167,8 +167,7 @@ public void write(Iterator> records) throws IOException { logger.error("Error while deleting temp file {}", tmp.getAbsolutePath()); } } - mapStatus = MapStatus$.MODULE$.apply( - blockManager.shuffleServerId(), partitionLengths, writeMetrics.recordsWritten()); + mapStatus = MapStatus$.MODULE$.apply(blockManager.shuffleServerId(), partitionLengths); } @VisibleForTesting diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleExternalSorter.java b/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleExternalSorter.java index c7d2db4217d9..1c0d664afb13 100644 --- a/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleExternalSorter.java +++ b/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleExternalSorter.java @@ -181,42 +181,43 @@ private void writeSortedFile(boolean isLastFile) { // around this, we pass a dummy no-op serializer. 
final SerializerInstance ser = DummySerializerInstance.INSTANCE; - final DiskBlockObjectWriter writer = - blockManager.getDiskWriter(blockId, file, ser, fileBufferSizeBytes, writeMetricsToUse); - int currentPartition = -1; - final int uaoSize = UnsafeAlignedOffset.getUaoSize(); - while (sortedRecords.hasNext()) { - sortedRecords.loadNext(); - final int partition = sortedRecords.packedRecordPointer.getPartitionId(); - assert (partition >= currentPartition); - if (partition != currentPartition) { - // Switch to the new partition - if (currentPartition != -1) { - final FileSegment fileSegment = writer.commitAndGet(); - spillInfo.partitionLengths[currentPartition] = fileSegment.length(); + final FileSegment committedSegment; + try (DiskBlockObjectWriter writer = + blockManager.getDiskWriter(blockId, file, ser, fileBufferSizeBytes, writeMetricsToUse)) { + + final int uaoSize = UnsafeAlignedOffset.getUaoSize(); + while (sortedRecords.hasNext()) { + sortedRecords.loadNext(); + final int partition = sortedRecords.packedRecordPointer.getPartitionId(); + assert (partition >= currentPartition); + if (partition != currentPartition) { + // Switch to the new partition + if (currentPartition != -1) { + final FileSegment fileSegment = writer.commitAndGet(); + spillInfo.partitionLengths[currentPartition] = fileSegment.length(); + } + currentPartition = partition; } - currentPartition = partition; - } - final long recordPointer = sortedRecords.packedRecordPointer.getRecordPointer(); - final Object recordPage = taskMemoryManager.getPage(recordPointer); - final long recordOffsetInPage = taskMemoryManager.getOffsetInPage(recordPointer); - int dataRemaining = UnsafeAlignedOffset.getSize(recordPage, recordOffsetInPage); - long recordReadPosition = recordOffsetInPage + uaoSize; // skip over record length - while (dataRemaining > 0) { - final int toTransfer = Math.min(diskWriteBufferSize, dataRemaining); - Platform.copyMemory( - recordPage, recordReadPosition, writeBuffer, Platform.BYTE_ARRAY_OFFSET, toTransfer); - writer.write(writeBuffer, 0, toTransfer); - recordReadPosition += toTransfer; - dataRemaining -= toTransfer; + final long recordPointer = sortedRecords.packedRecordPointer.getRecordPointer(); + final Object recordPage = taskMemoryManager.getPage(recordPointer); + final long recordOffsetInPage = taskMemoryManager.getOffsetInPage(recordPointer); + int dataRemaining = UnsafeAlignedOffset.getSize(recordPage, recordOffsetInPage); + long recordReadPosition = recordOffsetInPage + uaoSize; // skip over record length + while (dataRemaining > 0) { + final int toTransfer = Math.min(diskWriteBufferSize, dataRemaining); + Platform.copyMemory( + recordPage, recordReadPosition, writeBuffer, Platform.BYTE_ARRAY_OFFSET, toTransfer); + writer.write(writeBuffer, 0, toTransfer); + recordReadPosition += toTransfer; + dataRemaining -= toTransfer; + } + writer.recordWritten(); } - writer.recordWritten(); - } - final FileSegment committedSegment = writer.commitAndGet(); - writer.close(); + committedSegment = writer.commitAndGet(); + } // If `writeSortedFile()` was called from `closeAndGetSpills()` and no records were inserted, // then the file might be empty. Note that it might be better to avoid calling // writeSortedFile() in that case. 
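For illustration, here is a minimal sketch of what a user-supplied plugin built against the ExecutorPlugin interface introduced earlier in this patch might look like. The package, class name, 10-second poll interval, and println sink are hypothetical stand-ins; only the spark.executor.plugins configuration key and the init()/shutdown() contract come from the interface itself.

package com.example

import org.apache.spark.ExecutorPlugin

// Hypothetical plugin: samples JVM heap usage on its own daemon thread, as the
// ExecutorPlugin javadoc recommends for any polling or blocking work.
class HeapPollingPlugin extends ExecutorPlugin {

  private val poller = new Thread(new Runnable {
    override def run(): Unit = {
      try {
        while (true) {
          val rt = Runtime.getRuntime
          val usedBytes = rt.totalMemory() - rt.freeMemory()
          println(s"executor heap in use: $usedBytes bytes") // stand-in for a real metrics sink
          Thread.sleep(10000)
        }
      } catch {
        case _: InterruptedException => // shutdown() was called; exit quietly
      }
    }
  }, "heap-polling-plugin")

  // Invoked once per executor, before any tasks run.
  override def init(): Unit = {
    poller.setDaemon(true)
    poller.start()
  }

  // Invoked during executor shutdown; the executor waits for this to return.
  override def shutdown(): Unit = {
    poller.interrupt()
    poller.join(1000)
  }
}

Such a class would be shipped in a user jar and enabled with something like --jars heap-polling-plugin.jar --conf spark.executor.plugins=com.example.HeapPollingPlugin; since plugins are loaded with a class loader that includes the executor's user classpath, no change to Spark itself is required.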
diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleInMemorySorter.java b/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleInMemorySorter.java index 4b48599ad311..0d069125dc60 100644 --- a/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleInMemorySorter.java +++ b/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleInMemorySorter.java @@ -20,6 +20,7 @@ import java.util.Comparator; import org.apache.spark.memory.MemoryConsumer; +import org.apache.spark.unsafe.Platform; import org.apache.spark.unsafe.array.LongArray; import org.apache.spark.unsafe.memory.MemoryBlock; import org.apache.spark.util.collection.Sorter; @@ -112,7 +113,13 @@ public void reset() { public void expandPointerArray(LongArray newArray) { assert(newArray.size() > array.size()); - MemoryBlock.copyMemory(array.memoryBlock(), newArray.memoryBlock(), pos * 8L); + Platform.copyMemory( + array.getBaseObject(), + array.getBaseOffset(), + newArray.getBaseObject(), + newArray.getBaseOffset(), + pos * 8L + ); consumer.freeArray(array); array = newArray; usableCapacity = getUsableCapacity(); @@ -181,7 +188,10 @@ public ShuffleSorterIterator getSortedIterator() { PackedRecordPointer.PARTITION_ID_START_BYTE_INDEX, PackedRecordPointer.PARTITION_ID_END_BYTE_INDEX, false, false); } else { - MemoryBlock unused = array.memoryBlock().subBlock(pos * 8L, (array.size() - pos) * 8L); + MemoryBlock unused = new MemoryBlock( + array.getBaseObject(), + array.getBaseOffset() + pos * 8L, + (array.size() - pos) * 8L); LongArray buffer = new LongArray(unused); Sorter sorter = new Sorter<>(new ShuffleSortDataFormat(buffer)); diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleSortDataFormat.java b/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleSortDataFormat.java index 254449e95443..717bdd79d47e 100644 --- a/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleSortDataFormat.java +++ b/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleSortDataFormat.java @@ -17,8 +17,8 @@ package org.apache.spark.shuffle.sort; +import org.apache.spark.unsafe.Platform; import org.apache.spark.unsafe.array.LongArray; -import org.apache.spark.unsafe.memory.MemoryBlock; import org.apache.spark.util.collection.SortDataFormat; final class ShuffleSortDataFormat extends SortDataFormat { @@ -60,8 +60,13 @@ public void copyElement(LongArray src, int srcPos, LongArray dst, int dstPos) { @Override public void copyRange(LongArray src, int srcPos, LongArray dst, int dstPos, int length) { - MemoryBlock.copyMemory(src.memoryBlock(), srcPos * 8L, - dst.memoryBlock(),dstPos * 8L,length * 8L); + Platform.copyMemory( + src.getBaseObject(), + src.getBaseOffset() + srcPos * 8L, + dst.getBaseObject(), + dst.getBaseOffset() + dstPos * 8L, + length * 8L + ); } @Override diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriter.java b/core/src/main/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriter.java index 069e6d5f224d..4839d04522f1 100644 --- a/core/src/main/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriter.java +++ b/core/src/main/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriter.java @@ -248,8 +248,7 @@ void closeAndWriteOutput() throws IOException { logger.error("Error while deleting temp file {}", tmp.getAbsolutePath()); } } - mapStatus = MapStatus$.MODULE$.apply( - blockManager.shuffleServerId(), partitionLengths, writeMetrics.recordsWritten()); + mapStatus = MapStatus$.MODULE$.apply(blockManager.shuffleServerId(), partitionLengths); } @VisibleForTesting diff --git 
a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java index 399251b80e64..5056652a2420 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java @@ -544,7 +544,7 @@ public long spill() throws IOException { // is accessing the current record. We free this page in that caller's next loadNext() // call. for (MemoryBlock page : allocatedPages) { - if (!loaded || page.getPageNumber() != + if (!loaded || page.pageNumber != ((UnsafeInMemorySorter.SortedIterator)upstream).getCurrentPageNumber()) { released += page.size(); freePage(page); diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java index 717823ebbd32..75690ae26483 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java @@ -26,6 +26,7 @@ import org.apache.spark.memory.MemoryConsumer; import org.apache.spark.memory.SparkOutOfMemoryError; import org.apache.spark.memory.TaskMemoryManager; +import org.apache.spark.unsafe.Platform; import org.apache.spark.unsafe.UnsafeAlignedOffset; import org.apache.spark.unsafe.array.LongArray; import org.apache.spark.unsafe.memory.MemoryBlock; @@ -215,7 +216,12 @@ public void expandPointerArray(LongArray newArray) { if (newArray.size() < array.size()) { throw new SparkOutOfMemoryError("Not enough memory to grow pointer array"); } - MemoryBlock.copyMemory(array.memoryBlock(), newArray.memoryBlock(), pos * 8L); + Platform.copyMemory( + array.getBaseObject(), + array.getBaseOffset(), + newArray.getBaseObject(), + newArray.getBaseOffset(), + pos * 8L); consumer.freeArray(array); array = newArray; usableCapacity = getUsableCapacity(); @@ -342,7 +348,10 @@ public UnsafeSorterIterator getSortedIterator() { array, nullBoundaryPos, (pos - nullBoundaryPos) / 2L, 0, 7, radixSortSupport.sortDescending(), radixSortSupport.sortSigned()); } else { - MemoryBlock unused = array.memoryBlock().subBlock(pos * 8L, (array.size() - pos) * 8L); + MemoryBlock unused = new MemoryBlock( + array.getBaseObject(), + array.getBaseOffset() + pos * 8L, + (array.size() - pos) * 8L); LongArray buffer = new LongArray(unused); Sorter sorter = new Sorter<>(new UnsafeSortDataFormat(buffer)); diff --git a/core/src/main/resources/org/apache/spark/ui/static/webui.css b/core/src/main/resources/org/apache/spark/ui/static/webui.css index 935d9b1aec61..4b060b0f4e53 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/webui.css +++ b/core/src/main/resources/org/apache/spark/ui/static/webui.css @@ -251,4 +251,9 @@ a.expandbutton { .table-cell-width-limited td { max-width: 600px; +} + +.paginate_button.active > a { + color: #999999; + text-decoration: underline; } \ No newline at end of file diff --git a/core/src/main/resources/org/apache/spark/ui/static/webui.js b/core/src/main/resources/org/apache/spark/ui/static/webui.js index f01c567ba58a..b1254e08fa50 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/webui.js +++ b/core/src/main/resources/org/apache/spark/ui/static/webui.js @@ -83,4 +83,7 @@ $(function() { collapseTablePageLoad('collapse-aggregated-rdds','aggregated-rdds'); 
collapseTablePageLoad('collapse-aggregated-activeBatches','aggregated-activeBatches'); collapseTablePageLoad('collapse-aggregated-completedBatches','aggregated-completedBatches'); + collapseTablePageLoad('collapse-aggregated-runningExecutions','aggregated-runningExecutions'); + collapseTablePageLoad('collapse-aggregated-completedExecutions','aggregated-completedExecutions'); + collapseTablePageLoad('collapse-aggregated-failedExecutions','aggregated-failedExecutions'); }); \ No newline at end of file diff --git a/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala b/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala index bcbc8df0d586..ab0ae55ed357 100644 --- a/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala +++ b/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala @@ -22,6 +22,7 @@ import java.util.concurrent.{ScheduledFuture, TimeUnit} import scala.collection.mutable import scala.concurrent.Future +import org.apache.spark.executor.ExecutorMetrics import org.apache.spark.internal.Logging import org.apache.spark.rpc.{RpcCallContext, RpcEnv, ThreadSafeRpcEndpoint} import org.apache.spark.scheduler._ @@ -37,7 +38,8 @@ import org.apache.spark.util._ private[spark] case class Heartbeat( executorId: String, accumUpdates: Array[(Long, Seq[AccumulatorV2[_, _]])], // taskId -> accumulator updates - blockManagerId: BlockManagerId) + blockManagerId: BlockManagerId, + executorUpdates: ExecutorMetrics) // executor level updates /** * An event that SparkContext uses to notify HeartbeatReceiver that SparkContext.taskScheduler is @@ -119,14 +121,14 @@ private[spark] class HeartbeatReceiver(sc: SparkContext, clock: Clock) context.reply(true) // Messages received from executors - case heartbeat @ Heartbeat(executorId, accumUpdates, blockManagerId) => + case heartbeat @ Heartbeat(executorId, accumUpdates, blockManagerId, executorMetrics) => if (scheduler != null) { if (executorLastSeen.contains(executorId)) { executorLastSeen(executorId) = clock.getTimeMillis() eventLoopThread.submit(new Runnable { override def run(): Unit = Utils.tryLogNonFatalError { val unknownExecutor = !scheduler.executorHeartbeatReceived( - executorId, accumUpdates, blockManagerId) + executorId, accumUpdates, blockManagerId, executorMetrics) val response = HeartbeatResponse(reregisterBlockManager = unknownExecutor) context.reply(response) } diff --git a/core/src/main/scala/org/apache/spark/Heartbeater.scala b/core/src/main/scala/org/apache/spark/Heartbeater.scala new file mode 100644 index 000000000000..5ba1b9b2d828 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/Heartbeater.scala @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark + +import java.util.concurrent.TimeUnit + +import org.apache.spark.executor.ExecutorMetrics +import org.apache.spark.internal.Logging +import org.apache.spark.memory.MemoryManager +import org.apache.spark.metrics.ExecutorMetricType +import org.apache.spark.util.{ThreadUtils, Utils} + +/** + * Creates a heartbeat thread which will call the specified reportHeartbeat function at + * intervals of intervalMs. + * + * @param memoryManager the memory manager for execution and storage memory. + * @param reportHeartbeat the heartbeat reporting function to call. + * @param name the thread name for the heartbeater. + * @param intervalMs the interval between heartbeats. + */ +private[spark] class Heartbeater( + memoryManager: MemoryManager, + reportHeartbeat: () => Unit, + name: String, + intervalMs: Long) extends Logging { + // Executor for the heartbeat task + private val heartbeater = ThreadUtils.newDaemonSingleThreadScheduledExecutor(name) + + /** Schedules a task to report a heartbeat. */ + def start(): Unit = { + // Wait a random interval so the heartbeats don't end up in sync + val initialDelay = intervalMs + (math.random * intervalMs).asInstanceOf[Int] + + val heartbeatTask = new Runnable() { + override def run(): Unit = Utils.logUncaughtExceptions(reportHeartbeat()) + } + heartbeater.scheduleAtFixedRate(heartbeatTask, initialDelay, intervalMs, TimeUnit.MILLISECONDS) + } + + /** Stops the heartbeat thread. */ + def stop(): Unit = { + heartbeater.shutdown() + heartbeater.awaitTermination(10, TimeUnit.SECONDS) + } + + /** + * Get the current executor level metrics. These are returned as an array, with the index + * determined by MetricGetter.values + */ + def getCurrentMetrics(): ExecutorMetrics = { + val metrics = ExecutorMetricType.values.map(_.getMetricValue(memoryManager)).toArray + new ExecutorMetrics(metrics) + } +} + diff --git a/core/src/main/scala/org/apache/spark/MapOutputStatistics.scala b/core/src/main/scala/org/apache/spark/MapOutputStatistics.scala index ff85e11409e3..f8a6f1d0d8cb 100644 --- a/core/src/main/scala/org/apache/spark/MapOutputStatistics.scala +++ b/core/src/main/scala/org/apache/spark/MapOutputStatistics.scala @@ -23,9 +23,5 @@ package org.apache.spark * @param shuffleId ID of the shuffle * @param bytesByPartitionId approximate number of output bytes for each map output partition * (may be inexact due to use of compressed map statuses) - * @param recordsByPartitionId number of output records for each map output partition */ -private[spark] class MapOutputStatistics( - val shuffleId: Int, - val bytesByPartitionId: Array[Long], - val recordsByPartitionId: Array[Long]) +private[spark] class MapOutputStatistics(val shuffleId: Int, val bytesByPartitionId: Array[Long]) diff --git a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala index 41575ce4e6e3..1c4fa4bc6541 100644 --- a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala +++ b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala @@ -522,19 +522,16 @@ private[spark] class MapOutputTrackerMaster( def getStatistics(dep: ShuffleDependency[_, _, _]): MapOutputStatistics = { shuffleStatuses(dep.shuffleId).withMapStatuses { statuses => val totalSizes = new Array[Long](dep.partitioner.numPartitions) - val recordsByMapTask = new Array[Long](statuses.length) - val parallelAggThreshold = conf.get( SHUFFLE_MAP_OUTPUT_PARALLEL_AGGREGATION_THRESHOLD) val parallelism = math.min( 
Runtime.getRuntime.availableProcessors(), statuses.length.toLong * totalSizes.length / parallelAggThreshold + 1).toInt if (parallelism <= 1) { - statuses.zipWithIndex.foreach { case (s, index) => + for (s <- statuses) { for (i <- 0 until totalSizes.length) { totalSizes(i) += s.getSizeForBlock(i) } - recordsByMapTask(index) = s.numberOfOutput } } else { val threadPool = ThreadUtils.newDaemonFixedThreadPool(parallelism, "map-output-aggregate") @@ -551,11 +548,8 @@ private[spark] class MapOutputTrackerMaster( } finally { threadPool.shutdown() } - statuses.zipWithIndex.foreach { case (s, index) => - recordsByMapTask(index) = s.numberOfOutput - } } - new MapOutputStatistics(dep.shuffleId, totalSizes, recordsByMapTask) + new MapOutputStatistics(dep.shuffleId, totalSizes) } } diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index 6c4c5c94cfa2..81aa31d79ba8 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -609,13 +609,14 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria require(!encryptionEnabled || get(NETWORK_AUTH_ENABLED), s"${NETWORK_AUTH_ENABLED.key} must be enabled when enabling encryption.") - val executorTimeoutThreshold = getTimeAsSeconds("spark.network.timeout", "120s") - val executorHeartbeatInterval = getTimeAsSeconds("spark.executor.heartbeatInterval", "10s") + val executorTimeoutThresholdMs = + getTimeAsSeconds("spark.network.timeout", "120s") * 1000 + val executorHeartbeatIntervalMs = get(EXECUTOR_HEARTBEAT_INTERVAL) // If spark.executor.heartbeatInterval bigger than spark.network.timeout, // it will almost always cause ExecutorLostFailure. See SPARK-22754. - require(executorTimeoutThreshold > executorHeartbeatInterval, "The value of " + - s"spark.network.timeout=${executorTimeoutThreshold}s must be no less than the value of " + - s"spark.executor.heartbeatInterval=${executorHeartbeatInterval}s.") + require(executorTimeoutThresholdMs > executorHeartbeatIntervalMs, "The value of " + + s"spark.network.timeout=${executorTimeoutThresholdMs}ms must be no less than the value of " + + s"spark.executor.heartbeatInterval=${executorHeartbeatIntervalMs}ms.") } /** @@ -726,7 +727,11 @@ private[spark] object SparkConf extends Logging { DRIVER_MEMORY_OVERHEAD.key -> Seq( AlternateConfig("spark.yarn.driver.memoryOverhead", "2.3")), EXECUTOR_MEMORY_OVERHEAD.key -> Seq( - AlternateConfig("spark.yarn.executor.memoryOverhead", "2.3")) + AlternateConfig("spark.yarn.executor.memoryOverhead", "2.3")), + KEYTAB.key -> Seq( + AlternateConfig("spark.yarn.keytab", "2.5")), + PRINCIPAL.key -> Seq( + AlternateConfig("spark.yarn.principal", "2.5")) ) /** diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index e5b1e0ecd158..0a66dae94dbd 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -213,6 +213,7 @@ class SparkContext(config: SparkConf) extends Logging { private var _files: Seq[String] = _ private var _shutdownHookRef: AnyRef = _ private var _statusStore: AppStatusStore = _ + private var _heartbeater: Heartbeater = _ /* ------------------------------------------------------------------------------------- * | Accessors and public fields. 
These provide access to the internal state of the | @@ -496,6 +497,11 @@ class SparkContext(config: SparkConf) extends Logging { _dagScheduler = new DAGScheduler(this) _heartbeatReceiver.ask[Boolean](TaskSchedulerIsSet) + // create and start the heartbeater for collecting memory metrics + _heartbeater = new Heartbeater(env.memoryManager, reportHeartBeat, "driver-heartbeater", + conf.get(EXECUTOR_HEARTBEAT_INTERVAL)) + _heartbeater.start() + // start TaskScheduler after taskScheduler sets DAGScheduler reference in DAGScheduler's // constructor _taskScheduler.start() @@ -1959,6 +1965,12 @@ class SparkContext(config: SparkConf) extends Logging { Utils.tryLogNonFatalError { _eventLogger.foreach(_.stop()) } + if (_heartbeater != null) { + Utils.tryLogNonFatalError { + _heartbeater.stop() + } + _heartbeater = null + } if (env != null && _heartbeatReceiver != null) { Utils.tryLogNonFatalError { env.rpcEnv.stop(_heartbeatReceiver) @@ -2429,6 +2441,14 @@ class SparkContext(config: SparkConf) extends Logging { } } + /** Reports heartbeat metrics for the driver. */ + private def reportHeartBeat(): Unit = { + val driverUpdates = _heartbeater.getCurrentMetrics() + val accumUpdates = new Array[(Long, Int, Int, Seq[AccumulableInfo])](0) + listenerBus.post(SparkListenerExecutorMetricsUpdate("driver", accumUpdates, + Some(driverUpdates))) + } + // In order to prevent multiple SparkContexts from being active at the same time, mark this // context as having finished construction. // NOTE: this must be placed at the end of the SparkContext constructor. diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index e639a842754b..8b5a7a9aefea 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -24,8 +24,10 @@ import java.util.{ArrayList => JArrayList, List => JList, Map => JMap} import scala.collection.JavaConverters._ import scala.collection.mutable +import scala.concurrent.Promise +import scala.concurrent.duration.Duration import scala.language.existentials -import scala.util.control.NonFatal +import scala.util.Try import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.compress.CompressionCodec @@ -37,6 +39,7 @@ import org.apache.spark.api.java.{JavaPairRDD, JavaRDD, JavaSparkContext} import org.apache.spark.broadcast.Broadcast import org.apache.spark.input.PortableDataStream import org.apache.spark.internal.Logging +import org.apache.spark.network.util.JavaUtils import org.apache.spark.rdd.RDD import org.apache.spark.security.SocketAuthHelper import org.apache.spark.util._ @@ -169,27 +172,34 @@ private[spark] object PythonRDD extends Logging { def readRDDFromFile(sc: JavaSparkContext, filename: String, parallelism: Int): JavaRDD[Array[Byte]] = { - val file = new DataInputStream(new FileInputStream(filename)) + readRDDFromInputStream(sc.sc, new FileInputStream(filename), parallelism) + } + + def readRDDFromInputStream( + sc: SparkContext, + in: InputStream, + parallelism: Int): JavaRDD[Array[Byte]] = { + val din = new DataInputStream(in) try { val objs = new mutable.ArrayBuffer[Array[Byte]] try { while (true) { - val length = file.readInt() + val length = din.readInt() val obj = new Array[Byte](length) - file.readFully(obj) + din.readFully(obj) objs += obj } } catch { case eof: EOFException => // No-op } - JavaRDD.fromRDD(sc.sc.parallelize(objs, parallelism)) + JavaRDD.fromRDD(sc.parallelize(objs, parallelism)) 
} finally { - file.close() + din.close() } } - def readBroadcastFromFile(sc: JavaSparkContext, path: String): Broadcast[PythonBroadcast] = { - sc.broadcast(new PythonBroadcast(path)) + def setupBroadcast(path: String): PythonBroadcast = { + new PythonBroadcast(path) } def writeIteratorToStream[T](iter: Iterator[T], dataOut: DataOutputStream) { @@ -419,34 +429,15 @@ private[spark] object PythonRDD extends Logging { */ private[spark] def serveToStream( threadName: String)(writeFunc: OutputStream => Unit): Array[Any] = { - val serverSocket = new ServerSocket(0, 1, InetAddress.getByName("localhost")) - // Close the socket if no connection in 15 seconds - serverSocket.setSoTimeout(15000) - - new Thread(threadName) { - setDaemon(true) - override def run() { - try { - val sock = serverSocket.accept() - authHelper.authClient(sock) - - val out = new BufferedOutputStream(sock.getOutputStream) - Utils.tryWithSafeFinally { - writeFunc(out) - } { - out.close() - sock.close() - } - } catch { - case NonFatal(e) => - logError(s"Error while sending iterator", e) - } finally { - serverSocket.close() - } + val (port, secret) = PythonServer.setupOneConnectionServer(authHelper, threadName) { s => + val out = new BufferedOutputStream(s.getOutputStream()) + Utils.tryWithSafeFinally { + writeFunc(out) + } { + out.close() } - }.start() - - Array(serverSocket.getLocalPort, authHelper.secret) + } + Array(port, secret) } private def getMergedConf(confAsMap: java.util.HashMap[String, String], @@ -664,13 +655,11 @@ private[spark] class PythonAccumulatorV2( } } -/** - * A Wrapper for Python Broadcast, which is written into disk by Python. It also will - * write the data into disk after deserialization, then Python can read it from disks. - */ // scalastyle:off no.finalize private[spark] class PythonBroadcast(@transient var path: String) extends Serializable - with Logging { + with Logging { + + private var encryptionServer: PythonServer[Unit] = null /** * Read data from disks, then copy it to `out` @@ -713,5 +702,235 @@ private[spark] class PythonBroadcast(@transient var path: String) extends Serial } super.finalize() } + + def setupEncryptionServer(): Array[Any] = { + encryptionServer = new PythonServer[Unit]("broadcast-encrypt-server") { + override def handleConnection(sock: Socket): Unit = { + val env = SparkEnv.get + val in = sock.getInputStream() + val dir = new File(Utils.getLocalDir(env.conf)) + val file = File.createTempFile("broadcast", "", dir) + path = file.getAbsolutePath + val out = env.serializerManager.wrapForEncryption(new FileOutputStream(path)) + DechunkedInputStream.dechunkAndCopyToOutput(in, out) + } + } + Array(encryptionServer.port, encryptionServer.secret) + } + + def waitTillDataReceived(): Unit = encryptionServer.getResult() } // scalastyle:on no.finalize + +/** + * The inverse of pyspark's ChunkedStream for sending data of unknown size. + * + * We might be serializing a really large object from python -- we don't want + * python to buffer the whole thing in memory, nor can it write to a file, + * so we don't know the length in advance. So python writes it in chunks, each chunk + * preceeded by a length, till we get a "length" of -1 which serves as EOF. + * + * Tested from python tests. 
+ */ +private[spark] class DechunkedInputStream(wrapped: InputStream) extends InputStream with Logging { + private val din = new DataInputStream(wrapped) + private var remainingInChunk = din.readInt() + + override def read(): Int = { + val into = new Array[Byte](1) + val n = read(into, 0, 1) + if (n == -1) { + -1 + } else { + // if you just cast a byte to an int, then anything > 127 is negative, which is interpreted + // as an EOF + val b = into(0) + if (b < 0) { + 256 + b + } else { + b + } + } + } + + override def read(dest: Array[Byte], off: Int, len: Int): Int = { + if (remainingInChunk == -1) { + return -1 + } + var destSpace = len + var destPos = off + while (destSpace > 0 && remainingInChunk != -1) { + val toCopy = math.min(remainingInChunk, destSpace) + val read = din.read(dest, destPos, toCopy) + destPos += read + destSpace -= read + remainingInChunk -= read + if (remainingInChunk == 0) { + remainingInChunk = din.readInt() + } + } + assert(destSpace == 0 || remainingInChunk == -1) + return destPos - off + } + + override def close(): Unit = wrapped.close() +} + +private[spark] object DechunkedInputStream { + + /** + * Dechunks the input, copies to output, and closes both input and the output safely. + */ + def dechunkAndCopyToOutput(chunked: InputStream, out: OutputStream): Unit = { + val dechunked = new DechunkedInputStream(chunked) + Utils.tryWithSafeFinally { + Utils.copyStream(dechunked, out) + } { + JavaUtils.closeQuietly(out) + JavaUtils.closeQuietly(dechunked) + } + } +} + +/** + * Creates a server in the jvm to communicate with python for handling one batch of data, with + * authentication and error handling. + */ +private[spark] abstract class PythonServer[T]( + authHelper: SocketAuthHelper, + threadName: String) { + + def this(env: SparkEnv, threadName: String) = this(new SocketAuthHelper(env.conf), threadName) + def this(threadName: String) = this(SparkEnv.get, threadName) + + val (port, secret) = PythonServer.setupOneConnectionServer(authHelper, threadName) { sock => + promise.complete(Try(handleConnection(sock))) + } + + /** + * Handle a connection which has already been authenticated. Any error from this function + * will clean up this connection and the entire server, and get propogated to [[getResult]]. + */ + def handleConnection(sock: Socket): T + + val promise = Promise[T]() + + /** + * Blocks indefinitely for [[handleConnection]] to finish, and returns that result. If + * handleConnection throws an exception, this will throw an exception which includes the original + * exception as a cause. + */ + def getResult(): T = { + getResult(Duration.Inf) + } + + def getResult(wait: Duration): T = { + ThreadUtils.awaitResult(promise.future, wait) + } + +} + +private[spark] object PythonServer { + + /** + * Create a socket server and run user function on the socket in a background thread. + * + * The socket server can only accept one connection, or close if no connection + * in 15 seconds. + * + * The thread will terminate after the supplied user function, or if there are any exceptions. + * + * If you need to get a result of the supplied function, create a subclass of [[PythonServer]] + * + * @return The port number of a local socket and the secret for authentication. 
+ */ + def setupOneConnectionServer( + authHelper: SocketAuthHelper, + threadName: String) + (func: Socket => Unit): (Int, String) = { + val serverSocket = new ServerSocket(0, 1, InetAddress.getByAddress(Array(127, 0, 0, 1))) + // Close the socket if no connection in 15 seconds + serverSocket.setSoTimeout(15000) + + new Thread(threadName) { + setDaemon(true) + override def run(): Unit = { + var sock: Socket = null + try { + sock = serverSocket.accept() + authHelper.authClient(sock) + func(sock) + } finally { + JavaUtils.closeQuietly(serverSocket) + JavaUtils.closeQuietly(sock) + } + } + }.start() + (serverSocket.getLocalPort, authHelper.secret) + } +} + +/** + * Sends decrypted broadcast data to python worker. See [[PythonRunner]] for entire protocol. + */ +private[spark] class EncryptedPythonBroadcastServer( + val env: SparkEnv, + val idsAndFiles: Seq[(Long, String)]) + extends PythonServer[Unit]("broadcast-decrypt-server") with Logging { + + override def handleConnection(socket: Socket): Unit = { + val out = new DataOutputStream(new BufferedOutputStream(socket.getOutputStream())) + var socketIn: InputStream = null + // send the broadcast id, then the decrypted data. We don't need to send the length, the + // the python pickle module just needs a stream. + Utils.tryWithSafeFinally { + (idsAndFiles).foreach { case (id, path) => + out.writeLong(id) + val in = env.serializerManager.wrapForEncryption(new FileInputStream(path)) + Utils.tryWithSafeFinally { + Utils.copyStream(in, out, false) + } { + in.close() + } + } + logTrace("waiting for python to accept broadcast data over socket") + out.flush() + socketIn = socket.getInputStream() + socketIn.read() + logTrace("done serving broadcast data") + } { + JavaUtils.closeQuietly(socketIn) + JavaUtils.closeQuietly(out) + } + } + + def waitTillBroadcastDataSent(): Unit = { + getResult() + } +} + +/** + * Helper for making RDD[Array[Byte]] from some python data, by reading the data from python + * over a socket. This is used in preference to writing data to a file when encryption is enabled. 
+ */ +private[spark] abstract class PythonRDDServer + extends PythonServer[JavaRDD[Array[Byte]]]("pyspark-parallelize-server") { + + def handleConnection(sock: Socket): JavaRDD[Array[Byte]] = { + val in = sock.getInputStream() + val dechunkedInput: InputStream = new DechunkedInputStream(in) + streamToRDD(dechunkedInput) + } + + protected def streamToRDD(input: InputStream): RDD[Array[Byte]] + +} + +private[spark] class PythonParallelizeServer(sc: SparkContext, parallelism: Int) + extends PythonRDDServer { + + override protected def streamToRDD(input: InputStream): RDD[Array[Byte]] = { + PythonRDD.readRDDFromInputStream(sc, input, parallelism) + } +} + diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala index 4c53bc269a10..6e53a044e9a8 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala @@ -289,19 +289,51 @@ private[spark] abstract class BasePythonRunner[IN, OUT]( val newBids = broadcastVars.map(_.id).toSet // number of different broadcasts val toRemove = oldBids.diff(newBids) - val cnt = toRemove.size + newBids.diff(oldBids).size + val addedBids = newBids.diff(oldBids) + val cnt = toRemove.size + addedBids.size + val needsDecryptionServer = env.serializerManager.encryptionEnabled && addedBids.nonEmpty + dataOut.writeBoolean(needsDecryptionServer) dataOut.writeInt(cnt) - for (bid <- toRemove) { - // remove the broadcast from worker - dataOut.writeLong(- bid - 1) // bid >= 0 - oldBids.remove(bid) + def sendBidsToRemove(): Unit = { + for (bid <- toRemove) { + // remove the broadcast from worker + dataOut.writeLong(-bid - 1) // bid >= 0 + oldBids.remove(bid) + } } - for (broadcast <- broadcastVars) { - if (!oldBids.contains(broadcast.id)) { + if (needsDecryptionServer) { + // if there is encryption, we setup a server which reads the encrypted files, and sends + // the decrypted data to python + val idsAndFiles = broadcastVars.flatMap { broadcast => + if (!oldBids.contains(broadcast.id)) { + Some((broadcast.id, broadcast.value.path)) + } else { + None + } + } + val server = new EncryptedPythonBroadcastServer(env, idsAndFiles) + dataOut.writeInt(server.port) + logTrace(s"broadcast decryption server setup on ${server.port}") + PythonRDD.writeUTF(server.secret, dataOut) + sendBidsToRemove() + idsAndFiles.foreach { case (id, _) => // send new broadcast - dataOut.writeLong(broadcast.id) - PythonRDD.writeUTF(broadcast.value.path, dataOut) - oldBids.add(broadcast.id) + dataOut.writeLong(id) + oldBids.add(id) + } + dataOut.flush() + logTrace("waiting for python to read decrypted broadcast data from server") + server.waitTillBroadcastDataSent() + logTrace("done sending decrypted data to python") + } else { + sendBidsToRemove() + for (broadcast <- broadcastVars) { + if (!oldBids.contains(broadcast.id)) { + // send new broadcast + dataOut.writeLong(broadcast.id) + PythonRDD.writeUTF(broadcast.value.path, dataOut) + oldBids.add(broadcast.id) + } } } dataOut.flush() diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala b/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala index 27a5e19f96a1..cdce371dfcbf 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala @@ -74,4 +74,8 @@ private[spark] object PythonUtils { def toScalaMap[K, V](jm: java.util.Map[K, V]): Map[K, V] = { 
jm.asScala.toMap } + + def getEncryptionEnabled(sc: JavaSparkContext): Boolean = { + sc.conf.get(org.apache.spark.internal.config.IO_ENCRYPTION_ENABLED) + } } diff --git a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala index 295355c7bf01..1dc61c7eef33 100644 --- a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala @@ -17,7 +17,9 @@ package org.apache.spark.api.r -import java.io.File +import java.io.{DataInputStream, File} +import java.net.Socket +import java.nio.charset.StandardCharsets.UTF_8 import java.util.{Map => JMap} import scala.collection.JavaConverters._ @@ -25,10 +27,11 @@ import scala.reflect.ClassTag import org.apache.spark._ import org.apache.spark.api.java.{JavaPairRDD, JavaRDD, JavaSparkContext} -import org.apache.spark.api.python.PythonRDD +import org.apache.spark.api.python.{PythonRDD, PythonServer} import org.apache.spark.broadcast.Broadcast import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD +import org.apache.spark.security.SocketAuthHelper private abstract class BaseRRDD[T: ClassTag, U: ClassTag]( parent: RDD[T], @@ -163,3 +166,29 @@ private[r] object RRDD { PythonRDD.readRDDFromFile(jsc, fileName, parallelism) } } + +/** + * Helper for making RDD[Array[Byte]] from some R data, by reading the data from R + * over a socket. This is used in preference to writing data to a file when encryption is enabled. + */ +private[spark] class RParallelizeServer(sc: JavaSparkContext, parallelism: Int) + extends PythonServer[JavaRDD[Array[Byte]]]( + new RSocketAuthHelper(), "sparkr-parallelize-server") { + + override def handleConnection(sock: Socket): JavaRDD[Array[Byte]] = { + val in = sock.getInputStream() + PythonRDD.readRDDFromInputStream(sc.sc, in, parallelism) + } +} + +private[spark] class RSocketAuthHelper extends SocketAuthHelper(SparkEnv.get.conf) { + override protected def readUtf8(s: Socket): String = { + val din = new DataInputStream(s.getInputStream()) + val len = din.readInt() + val bytes = new Array[Byte](len) + din.readFully(bytes) + // The R code adds a null terminator to serialized strings, so ignore it here. + assert(bytes(bytes.length - 1) == 0) // sanity check. 
+ new String(bytes, 0, bytes.length - 1, UTF_8) + } +} diff --git a/core/src/main/scala/org/apache/spark/api/r/RUtils.scala b/core/src/main/scala/org/apache/spark/api/r/RUtils.scala index fdd8cf62f0e5..9bf35af1da92 100644 --- a/core/src/main/scala/org/apache/spark/api/r/RUtils.scala +++ b/core/src/main/scala/org/apache/spark/api/r/RUtils.scala @@ -21,6 +21,8 @@ import java.io.File import java.util.Arrays import org.apache.spark.{SparkEnv, SparkException} +import org.apache.spark.api.java.JavaSparkContext +import org.apache.spark.api.python.PythonUtils private[spark] object RUtils { // Local path where R binary packages built from R source code contained in the spark @@ -104,4 +106,6 @@ private[spark] object RUtils { case e: Exception => false } } + + def getEncryptionEnabled(sc: JavaSparkContext): Boolean = PythonUtils.getEncryptionEnabled(sc) } diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index cf902db8709e..d5f2865f8728 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -520,6 +520,10 @@ private[spark] class SparkSubmit extends Logging { confKey = "spark.driver.extraJavaOptions"), OptionAssigner(args.driverExtraLibraryPath, ALL_CLUSTER_MGRS, ALL_DEPLOY_MODES, confKey = "spark.driver.extraLibraryPath"), + OptionAssigner(args.principal, ALL_CLUSTER_MGRS, ALL_DEPLOY_MODES, + confKey = PRINCIPAL.key), + OptionAssigner(args.keytab, ALL_CLUSTER_MGRS, ALL_DEPLOY_MODES, + confKey = KEYTAB.key), // Propagate attributes for dependency resolution at the driver side OptionAssigner(args.packages, STANDALONE | MESOS, CLUSTER, confKey = "spark.jars.packages"), @@ -537,8 +541,6 @@ private[spark] class SparkSubmit extends Logging { OptionAssigner(args.jars, YARN, ALL_DEPLOY_MODES, confKey = "spark.yarn.dist.jars"), OptionAssigner(args.files, YARN, ALL_DEPLOY_MODES, confKey = "spark.yarn.dist.files"), OptionAssigner(args.archives, YARN, ALL_DEPLOY_MODES, confKey = "spark.yarn.dist.archives"), - OptionAssigner(args.principal, YARN, ALL_DEPLOY_MODES, confKey = "spark.yarn.principal"), - OptionAssigner(args.keytab, YARN, ALL_DEPLOY_MODES, confKey = "spark.yarn.keytab"), // Other options OptionAssigner(args.executorCores, STANDALONE | YARN | KUBERNETES, ALL_DEPLOY_MODES, diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala index 099875771545..4cf08a7980f5 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala @@ -199,8 +199,14 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S numExecutors = Option(numExecutors) .getOrElse(sparkProperties.get("spark.executor.instances").orNull) queue = Option(queue).orElse(sparkProperties.get("spark.yarn.queue")).orNull - keytab = Option(keytab).orElse(sparkProperties.get("spark.yarn.keytab")).orNull - principal = Option(principal).orElse(sparkProperties.get("spark.yarn.principal")).orNull + keytab = Option(keytab) + .orElse(sparkProperties.get("spark.kerberos.keytab")) + .orElse(sparkProperties.get("spark.yarn.keytab")) + .orNull + principal = Option(principal) + .orElse(sparkProperties.get("spark.kerberos.principal")) + .orElse(sparkProperties.get("spark.yarn.principal")) + .orNull dynamicAllocationEnabled = 
sparkProperties.get("spark.dynamicAllocation.enabled").exists("true".equalsIgnoreCase) diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index 44d23908146c..c23a659e76df 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -19,7 +19,6 @@ package org.apache.spark.deploy.history import java.io.{File, FileNotFoundException, IOException} import java.nio.file.Files -import java.nio.file.attribute.PosixFilePermissions import java.util.{Date, ServiceLoader} import java.util.concurrent.{ConcurrentHashMap, ExecutorService, Future, TimeUnit} import java.util.zip.{ZipEntry, ZipOutputStream} @@ -133,9 +132,8 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) // Visible for testing. private[history] val listing: KVStore = storePath.map { path => - val perms = PosixFilePermissions.fromString("rwx------") - val dbPath = Files.createDirectories(new File(path, "listing.ldb").toPath(), - PosixFilePermissions.asFileAttribute(perms)).toFile() + val dbPath = Files.createDirectories(new File(path, "listing.ldb").toPath()).toFile() + Utils.chmod700(dbPath) val metadata = new FsHistoryProviderMetadata(CURRENT_LISTING_VERSION, AppStatusStore.CURRENT_VERSION, logDir.toString()) diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerDiskManager.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerDiskManager.scala index c03a360b91ef..ad0dd23cb59c 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerDiskManager.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerDiskManager.scala @@ -18,8 +18,6 @@ package org.apache.spark.deploy.history import java.io.File -import java.nio.file.Files -import java.nio.file.attribute.PosixFilePermissions import java.util.concurrent.atomic.AtomicLong import scala.collection.JavaConverters._ @@ -107,9 +105,8 @@ private class HistoryServerDiskManager( val needed = approximateSize(eventLogSize, isCompressed) makeRoom(needed) - val perms = PosixFilePermissions.fromString("rwx------") - val tmp = Files.createTempDirectory(tmpStoreDir.toPath(), "appstore", - PosixFilePermissions.asFileAttribute(perms)).toFile() + val tmp = Utils.createTempDir(tmpStoreDir.getPath(), "appstore") + Utils.chmod700(tmp) updateUsage(needed) val current = currentUsage.get() diff --git a/core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolMessage.scala b/core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolMessage.scala index ef5a7e35ad56..97b689cdadd5 100644 --- a/core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolMessage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolMessage.scala @@ -36,7 +36,7 @@ import org.apache.spark.util.Utils * (2) the Spark version of the client / server * (3) an optional message */ -@JsonInclude(Include.NON_NULL) +@JsonInclude(Include.NON_ABSENT) @JsonAutoDetect(getterVisibility = Visibility.ANY, setterVisibility = Visibility.ANY) @JsonPropertyOrder(alphabetic = true) private[rest] abstract class SubmitRestProtocolMessage { diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index 86b19578037d..61deb543d874 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala 
+++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -28,6 +28,7 @@ import javax.annotation.concurrent.GuardedBy import scala.collection.JavaConverters._ import scala.collection.mutable.{ArrayBuffer, HashMap, Map} +import scala.concurrent.duration._ import scala.util.control.NonFatal import com.google.common.util.concurrent.ThreadFactoryBuilder @@ -38,7 +39,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ import org.apache.spark.memory.{SparkOutOfMemoryError, TaskMemoryManager} import org.apache.spark.rpc.RpcTimeout -import org.apache.spark.scheduler.{DirectTaskResult, IndirectTaskResult, Task, TaskDescription} +import org.apache.spark.scheduler._ import org.apache.spark.shuffle.FetchFailedException import org.apache.spark.storage.{StorageLevel, TaskResultBlockId} import org.apache.spark.util._ @@ -120,7 +121,7 @@ private[spark] class Executor( } // Whether to load classes in user jars before those in Spark jars - private val userClassPathFirst = conf.getBoolean("spark.executor.userClassPathFirst", false) + private val userClassPathFirst = conf.get(EXECUTOR_USER_CLASS_PATH_FIRST) // Whether to monitor killed / interrupted tasks private val taskReaperEnabled = conf.getBoolean("spark.task.reaper.enabled", false) @@ -136,6 +137,29 @@ private[spark] class Executor( // for fetching remote cached RDD blocks, so need to make sure it uses the right classloader too. env.serializerManager.setDefaultClassLoader(replClassLoader) + private val executorPlugins: Seq[ExecutorPlugin] = { + val pluginNames = conf.get(EXECUTOR_PLUGINS) + if (pluginNames.nonEmpty) { + logDebug(s"Initializing the following plugins: ${pluginNames.mkString(", ")}") + + // Plugins need to load using a class loader that includes the executor's user classpath + val pluginList: Seq[ExecutorPlugin] = + Utils.withContextClassLoader(replClassLoader) { + val plugins = Utils.loadExtensions(classOf[ExecutorPlugin], pluginNames, conf) + plugins.foreach { plugin => + plugin.init() + logDebug(s"Successfully loaded plugin " + plugin.getClass().getCanonicalName()) + } + plugins + } + + logDebug("Finished initializing plugins") + pluginList + } else { + Nil + } + } + // Max size of direct result. If task result is bigger than this, we use the block manager // to send the result back. private val maxDirectResultSize = Math.min( @@ -147,19 +171,31 @@ private[spark] class Executor( // Maintains the list of running tasks. private val runningTasks = new ConcurrentHashMap[Long, TaskRunner] - // Executor for the heartbeat task. - private val heartbeater = ThreadUtils.newDaemonSingleThreadScheduledExecutor("driver-heartbeater") - - // must be initialized before running startDriverHeartbeat() - private val heartbeatReceiverRef = - RpcUtils.makeDriverRef(HeartbeatReceiver.ENDPOINT_NAME, conf, env.rpcEnv) - /** * When an executor is unable to send heartbeats to the driver more than `HEARTBEAT_MAX_FAILURES` * times, it should kill itself. The default value is 60. It means we will retry to send * heartbeats about 10 minutes because the heartbeat interval is 10s. */ - private val HEARTBEAT_MAX_FAILURES = conf.getInt("spark.executor.heartbeat.maxFailures", 60) + private val HEARTBEAT_MAX_FAILURES = conf.get(EXECUTOR_HEARTBEAT_MAX_FAILURES) + + /** + * Whether to drop empty accumulators from heartbeats sent to the driver. Including the empty + * accumulators (that satisfy isZero) can make the size of the heartbeat message very large. 
+ */ + private val HEARTBEAT_DROP_ZEROES = conf.get(EXECUTOR_HEARTBEAT_DROP_ZERO_ACCUMULATOR_UPDATES) + + /** + * Interval to send heartbeats, in milliseconds + */ + private val HEARTBEAT_INTERVAL_MS = conf.get(EXECUTOR_HEARTBEAT_INTERVAL) + + // Executor for the heartbeat task. + private val heartbeater = new Heartbeater(env.memoryManager, reportHeartBeat, + "executor-heartbeater", HEARTBEAT_INTERVAL_MS) + + // must be initialized before running startDriverHeartbeat() + private val heartbeatReceiverRef = + RpcUtils.makeDriverRef(HeartbeatReceiver.ENDPOINT_NAME, conf, env.rpcEnv) /** * Count the failure times of heartbeat. It should only be accessed in the heartbeat thread. Each @@ -167,7 +203,7 @@ private[spark] class Executor( */ private var heartbeatFailures = 0 - startDriverHeartbeater() + heartbeater.start() private[executor] def numRunningTasks: Int = runningTasks.size() @@ -216,9 +252,25 @@ private[spark] class Executor( def stop(): Unit = { env.metricsSystem.report() - heartbeater.shutdown() - heartbeater.awaitTermination(10, TimeUnit.SECONDS) + try { + heartbeater.stop() + } catch { + case NonFatal(e) => + logWarning("Unable to stop heartbeater", e) + } threadPool.shutdown() + + // Notify plugins that executor is shutting down so they can terminate cleanly + Utils.withContextClassLoader(replClassLoader) { + executorPlugins.foreach { plugin => + try { + plugin.shutdown() + } catch { + case e: Exception => + logWarning("Plugin " + plugin.getClass().getCanonicalName() + " shutdown failed", e) + } + } + } if (!isLocal) { env.stop() } @@ -464,7 +516,7 @@ private[spark] class Executor( executorSource.METRIC_OUTPUT_BYTES_WRITTEN .inc(task.metrics.outputMetrics.bytesWritten) executorSource.METRIC_OUTPUT_RECORDS_WRITTEN - .inc(task.metrics.inputMetrics.recordsRead) + .inc(task.metrics.outputMetrics.recordsWritten) executorSource.METRIC_RESULT_SIZE.inc(task.metrics.resultSize) executorSource.METRIC_DISK_BYTES_SPILLED.inc(task.metrics.diskBytesSpilled) executorSource.METRIC_MEMORY_BYTES_SPILLED.inc(task.metrics.memoryBytesSpilled) @@ -787,18 +839,28 @@ private[spark] class Executor( val accumUpdates = new ArrayBuffer[(Long, Seq[AccumulatorV2[_, _]])]() val curGCTime = computeTotalGcTime() + // get executor level memory metrics + val executorUpdates = heartbeater.getCurrentMetrics() + for (taskRunner <- runningTasks.values().asScala) { if (taskRunner.task != null) { taskRunner.task.metrics.mergeShuffleReadMetrics() taskRunner.task.metrics.setJvmGCTime(curGCTime - taskRunner.startGCTime) - accumUpdates += ((taskRunner.taskId, taskRunner.task.metrics.accumulators())) + val accumulatorsToReport = + if (HEARTBEAT_DROP_ZEROES) { + taskRunner.task.metrics.accumulators().filterNot(_.isZero) + } else { + taskRunner.task.metrics.accumulators() + } + accumUpdates += ((taskRunner.taskId, accumulatorsToReport)) } } - val message = Heartbeat(executorId, accumUpdates.toArray, env.blockManager.blockManagerId) + val message = Heartbeat(executorId, accumUpdates.toArray, env.blockManager.blockManagerId, + executorUpdates) try { val response = heartbeatReceiverRef.askSync[HeartbeatResponse]( - message, RpcTimeout(conf, "spark.executor.heartbeatInterval", "10s")) + message, new RpcTimeout(HEARTBEAT_INTERVAL_MS.millis, EXECUTOR_HEARTBEAT_INTERVAL.key)) if (response.reregisterBlockManager) { logInfo("Told to re-register on heartbeat") env.blockManager.reregister() @@ -815,21 +877,6 @@ private[spark] class Executor( } } } - - /** - * Schedules a task to report heartbeat and partial metrics for active tasks to 
driver. - */ - private def startDriverHeartbeater(): Unit = { - val intervalMs = conf.getTimeAsMs("spark.executor.heartbeatInterval", "10s") - - // Wait a random interval so the heartbeats don't end up in sync - val initialDelay = intervalMs + (math.random * intervalMs).asInstanceOf[Int] - - val heartbeatTask = new Runnable() { - override def run(): Unit = Utils.logUncaughtExceptions(reportHeartBeat()) - } - heartbeater.scheduleAtFixedRate(heartbeatTask, initialDelay, intervalMs, TimeUnit.MILLISECONDS) - } } private[spark] object Executor { diff --git a/core/src/main/scala/org/apache/spark/executor/ExecutorMetrics.scala b/core/src/main/scala/org/apache/spark/executor/ExecutorMetrics.scala new file mode 100644 index 000000000000..2933f3ba6d3b --- /dev/null +++ b/core/src/main/scala/org/apache/spark/executor/ExecutorMetrics.scala @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.executor + +import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.metrics.ExecutorMetricType + +/** + * :: DeveloperApi :: + * Metrics tracked for executors and the driver. + * + * Executor-level metrics are sent from each executor to the driver as part of the Heartbeat. + */ +@DeveloperApi +class ExecutorMetrics private[spark] extends Serializable { + + // Metrics are indexed by MetricGetter.values + private val metrics = new Array[Long](ExecutorMetricType.values.length) + + // the first element is initialized to -1, indicating that the values for the array + // haven't been set yet. + metrics(0) = -1 + + /** Returns the value for the specified metricType. */ + def getMetricValue(metricType: ExecutorMetricType): Long = { + metrics(ExecutorMetricType.metricIdxMap(metricType)) + } + + /** Returns true if the values for the metrics have been set, false otherwise. */ + def isSet(): Boolean = metrics(0) > -1 + + private[spark] def this(metrics: Array[Long]) { + this() + Array.copy(metrics, 0, this.metrics, 0, Math.min(metrics.size, this.metrics.size)) + } + + /** + * Constructor: create the ExecutorMetrics with the values specified. + * + * @param executorMetrics map of executor metric name to value + */ + private[spark] def this(executorMetrics: Map[String, Long]) { + this() + (0 until ExecutorMetricType.values.length).foreach { idx => + metrics(idx) = executorMetrics.getOrElse(ExecutorMetricType.values(idx).name, 0L) + } + } + + /** + * Compare the specified executor metrics values with the current executor metric values, + * and update the value for any metrics where the new value for the metric is larger. 
+ * + * @param executorMetrics the executor metrics to compare + * @return if there is a new peak value for any metric + */ + private[spark] def compareAndUpdatePeakValues(executorMetrics: ExecutorMetrics): Boolean = { + var updated = false + + (0 until ExecutorMetricType.values.length).foreach { idx => + if (executorMetrics.metrics(idx) > metrics(idx)) { + updated = true + metrics(idx) = executorMetrics.metrics(idx) + } + } + updated + } +} diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index 319e664a1967..e8b1d8859cc4 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -69,15 +69,34 @@ package object config { .bytesConf(ByteUnit.KiB) .createWithDefaultString("100k") + private[spark] val EVENT_LOG_STAGE_EXECUTOR_METRICS = + ConfigBuilder("spark.eventLog.logStageExecutorMetrics.enabled") + .booleanConf + .createWithDefault(false) + private[spark] val EVENT_LOG_OVERWRITE = ConfigBuilder("spark.eventLog.overwrite").booleanConf.createWithDefault(false) - private[spark] val EVENT_LOG_CALLSITE_FORM = - ConfigBuilder("spark.eventLog.callsite").stringConf.createWithDefault("short") + private[spark] val EVENT_LOG_CALLSITE_LONG_FORM = + ConfigBuilder("spark.eventLog.longForm.enabled").booleanConf.createWithDefault(false) private[spark] val EXECUTOR_CLASS_PATH = ConfigBuilder(SparkLauncher.EXECUTOR_EXTRA_CLASSPATH).stringConf.createOptional + private[spark] val EXECUTOR_HEARTBEAT_DROP_ZERO_ACCUMULATOR_UPDATES = + ConfigBuilder("spark.executor.heartbeat.dropZeroAccumulatorUpdates") + .internal() + .booleanConf + .createWithDefault(true) + + private[spark] val EXECUTOR_HEARTBEAT_INTERVAL = + ConfigBuilder("spark.executor.heartbeatInterval") + .timeConf(TimeUnit.MILLISECONDS) + .createWithDefaultString("10s") + + private[spark] val EXECUTOR_HEARTBEAT_MAX_FAILURES = + ConfigBuilder("spark.executor.heartbeat.maxFailures").internal().intConf.createWithDefault(60) + private[spark] val EXECUTOR_JAVA_OPTIONS = ConfigBuilder(SparkLauncher.EXECUTOR_EXTRA_JAVA_OPTIONS).stringConf.createOptional @@ -147,11 +166,11 @@ package object config { private[spark] val SHUFFLE_SERVICE_PORT = ConfigBuilder("spark.shuffle.service.port").intConf.createWithDefault(7337) - private[spark] val KEYTAB = ConfigBuilder("spark.yarn.keytab") + private[spark] val KEYTAB = ConfigBuilder("spark.kerberos.keytab") .doc("Location of user's keytab.") .stringConf.createOptional - private[spark] val PRINCIPAL = ConfigBuilder("spark.yarn.principal") + private[spark] val PRINCIPAL = ConfigBuilder("spark.kerberos.principal") .doc("Name of the Kerberos principal.") .stringConf.createOptional @@ -618,4 +637,14 @@ package object config { .intConf .checkValue(v => v > 0, "The max failures should be a positive value.") .createWithDefault(40) + + private[spark] val EXECUTOR_PLUGINS = + ConfigBuilder("spark.executor.plugins") + .doc("Comma-separated list of class names for \"plugins\" implementing " + + "org.apache.spark.ExecutorPlugin. Plugins have the same privileges as any task " + + "in a Spark executor. They can also interfere with task execution and fail in " + + "unexpected ways. 
So be sure to only use this for trusted plugins.") + .stringConf + .toSequence + .createWithDefault(Nil) } diff --git a/core/src/main/scala/org/apache/spark/memory/MemoryManager.scala b/core/src/main/scala/org/apache/spark/memory/MemoryManager.scala index 0641adc2ab69..4fde2d0beaa7 100644 --- a/core/src/main/scala/org/apache/spark/memory/MemoryManager.scala +++ b/core/src/main/scala/org/apache/spark/memory/MemoryManager.scala @@ -180,6 +180,34 @@ private[spark] abstract class MemoryManager( onHeapStorageMemoryPool.memoryUsed + offHeapStorageMemoryPool.memoryUsed } + /** + * On heap execution memory currently in use, in bytes. + */ + final def onHeapExecutionMemoryUsed: Long = synchronized { + onHeapExecutionMemoryPool.memoryUsed + } + + /** + * Off heap execution memory currently in use, in bytes. + */ + final def offHeapExecutionMemoryUsed: Long = synchronized { + offHeapExecutionMemoryPool.memoryUsed + } + + /** + * On heap storage memory currently in use, in bytes. + */ + final def onHeapStorageMemoryUsed: Long = synchronized { + onHeapStorageMemoryPool.memoryUsed + } + + /** + * Off heap storage memory currently in use, in bytes. + */ + final def offHeapStorageMemoryUsed: Long = synchronized { + offHeapStorageMemoryPool.memoryUsed + } + /** * Returns the execution memory consumption, in bytes, for the given task. */ diff --git a/core/src/main/scala/org/apache/spark/metrics/ExecutorMetricType.scala b/core/src/main/scala/org/apache/spark/metrics/ExecutorMetricType.scala new file mode 100644 index 000000000000..cd10dad25e87 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/metrics/ExecutorMetricType.scala @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.metrics + +import java.lang.management.{BufferPoolMXBean, ManagementFactory} +import javax.management.ObjectName + +import org.apache.spark.memory.MemoryManager + +/** + * Executor metric types for executor-level metrics stored in ExecutorMetrics. 
+ */ +sealed trait ExecutorMetricType { + private[spark] def getMetricValue(memoryManager: MemoryManager): Long + private[spark] val name = getClass().getName().stripSuffix("$").split("""\.""").last +} + +private[spark] abstract class MemoryManagerExecutorMetricType( + f: MemoryManager => Long) extends ExecutorMetricType { + override private[spark] def getMetricValue(memoryManager: MemoryManager): Long = { + f(memoryManager) + } +} + +private[spark] abstract class MBeanExecutorMetricType(mBeanName: String) + extends ExecutorMetricType { + private val bean = ManagementFactory.newPlatformMXBeanProxy( + ManagementFactory.getPlatformMBeanServer, + new ObjectName(mBeanName).toString, classOf[BufferPoolMXBean]) + + override private[spark] def getMetricValue(memoryManager: MemoryManager): Long = { + bean.getMemoryUsed + } +} + +case object JVMHeapMemory extends ExecutorMetricType { + override private[spark] def getMetricValue(memoryManager: MemoryManager): Long = { + ManagementFactory.getMemoryMXBean.getHeapMemoryUsage().getUsed() + } +} + +case object JVMOffHeapMemory extends ExecutorMetricType { + override private[spark] def getMetricValue(memoryManager: MemoryManager): Long = { + ManagementFactory.getMemoryMXBean.getNonHeapMemoryUsage().getUsed() + } +} + +case object OnHeapExecutionMemory extends MemoryManagerExecutorMetricType( + _.onHeapExecutionMemoryUsed) + +case object OffHeapExecutionMemory extends MemoryManagerExecutorMetricType( + _.offHeapExecutionMemoryUsed) + +case object OnHeapStorageMemory extends MemoryManagerExecutorMetricType( + _.onHeapStorageMemoryUsed) + +case object OffHeapStorageMemory extends MemoryManagerExecutorMetricType( + _.offHeapStorageMemoryUsed) + +case object OnHeapUnifiedMemory extends MemoryManagerExecutorMetricType( + (m => m.onHeapExecutionMemoryUsed + m.onHeapStorageMemoryUsed)) + +case object OffHeapUnifiedMemory extends MemoryManagerExecutorMetricType( + (m => m.offHeapExecutionMemoryUsed + m.offHeapStorageMemoryUsed)) + +case object DirectPoolMemory extends MBeanExecutorMetricType( + "java.nio:type=BufferPool,name=direct") + +case object MappedPoolMemory extends MBeanExecutorMetricType( + "java.nio:type=BufferPool,name=mapped") + +private[spark] object ExecutorMetricType { + // List of all executor metric types + val values = IndexedSeq( + JVMHeapMemory, + JVMOffHeapMemory, + OnHeapExecutionMemory, + OffHeapExecutionMemory, + OnHeapStorageMemory, + OffHeapStorageMemory, + OnHeapUnifiedMemory, + OffHeapUnifiedMemory, + DirectPoolMemory, + MappedPoolMemory + ) + + // Map of executor metric type to its index in values. 
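The MBean-backed entries above (DirectPoolMemory, MappedPoolMemory) proxy the java.nio buffer-pool MXBeans, while the JVM heap and off-heap entries read the standard MemoryMXBean. The same numbers can be sampled outside Spark with nothing but JDK management APIs; a small standalone sketch:

```scala
import java.lang.management.{BufferPoolMXBean, ManagementFactory}

import scala.collection.JavaConverters._

object JvmMemoryProbe {
  def main(args: Array[String]): Unit = {
    val memBean = ManagementFactory.getMemoryMXBean
    println(s"JVM heap used:     ${memBean.getHeapMemoryUsage.getUsed} bytes")
    println(s"JVM off-heap used: ${memBean.getNonHeapMemoryUsage.getUsed} bytes")

    // The "direct" and "mapped" pools are the ones behind DirectPoolMemory / MappedPoolMemory.
    ManagementFactory.getPlatformMXBeans(classOf[BufferPoolMXBean]).asScala.foreach { pool =>
      println(s"Buffer pool ${pool.getName}: ${pool.getMemoryUsed} bytes")
    }
  }
}
```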
+ val metricIdxMap = + Map[ExecutorMetricType, Int](ExecutorMetricType.values.zipWithIndex: _*) +} diff --git a/core/src/main/scala/org/apache/spark/metrics/sink/StatsdSink.scala b/core/src/main/scala/org/apache/spark/metrics/sink/StatsdSink.scala index 859a2f6bcd45..61e74e05169c 100644 --- a/core/src/main/scala/org/apache/spark/metrics/sink/StatsdSink.scala +++ b/core/src/main/scala/org/apache/spark/metrics/sink/StatsdSink.scala @@ -17,7 +17,7 @@ package org.apache.spark.metrics.sink -import java.util.Properties +import java.util.{Locale, Properties} import java.util.concurrent.TimeUnit import com.codahale.metrics.MetricRegistry @@ -52,7 +52,8 @@ private[spark] class StatsdSink( val pollPeriod = property.getProperty(STATSD_KEY_PERIOD, STATSD_DEFAULT_PERIOD).toInt val pollUnit = - TimeUnit.valueOf(property.getProperty(STATSD_KEY_UNIT, STATSD_DEFAULT_UNIT).toUpperCase) + TimeUnit.valueOf( + property.getProperty(STATSD_KEY_UNIT, STATSD_DEFAULT_UNIT).toUpperCase(Locale.ROOT)) val prefix = property.getProperty(STATSD_KEY_PREFIX, STATSD_DEFAULT_PREFIX) diff --git a/core/src/main/scala/org/apache/spark/network/BlockTransferService.scala b/core/src/main/scala/org/apache/spark/network/BlockTransferService.scala index 1d8a266d0079..eef8c31e05ab 100644 --- a/core/src/main/scala/org/apache/spark/network/BlockTransferService.scala +++ b/core/src/main/scala/org/apache/spark/network/BlockTransferService.scala @@ -26,7 +26,7 @@ import scala.reflect.ClassTag import org.apache.spark.internal.Logging import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer, NioManagedBuffer} -import org.apache.spark.network.shuffle.{BlockFetchingListener, ShuffleClient, TempFileManager} +import org.apache.spark.network.shuffle.{BlockFetchingListener, DownloadFileManager, ShuffleClient} import org.apache.spark.storage.{BlockId, StorageLevel} import org.apache.spark.util.ThreadUtils @@ -68,7 +68,7 @@ abstract class BlockTransferService extends ShuffleClient with Closeable with Lo execId: String, blockIds: Array[String], listener: BlockFetchingListener, - tempFileManager: TempFileManager): Unit + tempFileManager: DownloadFileManager): Unit /** * Upload a single block to a remote node, available only after [[init]] is invoked. @@ -92,7 +92,7 @@ abstract class BlockTransferService extends ShuffleClient with Closeable with Lo port: Int, execId: String, blockId: String, - tempFileManager: TempFileManager): ManagedBuffer = { + tempFileManager: DownloadFileManager): ManagedBuffer = { // A monitor for the thread to wait on. 
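ExecutorMetrics (earlier in this change) keeps one Long slot per entry of ExecutorMetricType.values, metricIdxMap gives each type its slot, and compareAndUpdatePeakValues raises stored values to new maxima. A self-contained sketch of that indexing-plus-peak pattern, using toy metric names rather than Spark's classes:

```scala
// Toy version of the ExecutorMetricType/ExecutorMetrics pairing: a fixed, ordered
// list of metric names indexed into a Long array, plus running-peak tracking.
object PeakTracking {
  val metricNames: IndexedSeq[String] =
    IndexedSeq("HeapUsed", "OffHeapUsed", "DirectPool", "MappedPool")
  val metricIdx: Map[String, Int] = metricNames.zipWithIndex.toMap

  final class Snapshot(val values: Array[Long] = new Array[Long](metricNames.length)) {
    def get(name: String): Long = values(metricIdx(name))

    /** Raise any stored value the new snapshot exceeds; report whether anything changed. */
    def updatePeaks(latest: Snapshot): Boolean = {
      var updated = false
      var i = 0
      while (i < values.length) {
        if (latest.values(i) > values(i)) {
          values(i) = latest.values(i)
          updated = true
        }
        i += 1
      }
      updated
    }
  }
}
```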
val result = Promise[ManagedBuffer]() fetchBlocks(host, port, execId, Array(blockId), diff --git a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala index 1905632a936d..dc55685b1e7b 100644 --- a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala +++ b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala @@ -33,7 +33,7 @@ import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer} import org.apache.spark.network.client.{RpcResponseCallback, TransportClientBootstrap, TransportClientFactory} import org.apache.spark.network.crypto.{AuthClientBootstrap, AuthServerBootstrap} import org.apache.spark.network.server._ -import org.apache.spark.network.shuffle.{BlockFetchingListener, OneForOneBlockFetcher, RetryingBlockFetcher, TempFileManager} +import org.apache.spark.network.shuffle.{BlockFetchingListener, DownloadFileManager, OneForOneBlockFetcher, RetryingBlockFetcher} import org.apache.spark.network.shuffle.protocol.{UploadBlock, UploadBlockStream} import org.apache.spark.network.util.JavaUtils import org.apache.spark.serializer.JavaSerializer @@ -106,7 +106,7 @@ private[spark] class NettyBlockTransferService( execId: String, blockIds: Array[String], listener: BlockFetchingListener, - tempFileManager: TempFileManager): Unit = { + tempFileManager: DownloadFileManager): Unit = { logTrace(s"Fetch blocks from $host:$port (executor id $execId)") try { val blockFetchStarter = new RetryingBlockFetcher.BlockFetchStarter { diff --git a/core/src/main/scala/org/apache/spark/rdd/OrderedRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/OrderedRDDFunctions.scala index a5992022d083..5b1c02425752 100644 --- a/core/src/main/scala/org/apache/spark/rdd/OrderedRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/OrderedRDDFunctions.scala @@ -35,7 +35,8 @@ import org.apache.spark.internal.Logging * * val rdd: RDD[(String, Int)] = ... * implicit val caseInsensitiveOrdering = new Ordering[String] { - * override def compare(a: String, b: String) = a.toLowerCase.compare(b.toLowerCase) + * override def compare(a: String, b: String) = + * a.toLowerCase(Locale.ROOT).compare(b.toLowerCase(Locale.ROOT)) * } * * // Sort by key, using the above case insensitive ordering. diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index 61ad6dfdb221..743e3441eea5 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -42,7 +42,8 @@ import org.apache.spark.partial.GroupedCountEvaluator import org.apache.spark.partial.PartialResult import org.apache.spark.storage.{RDDBlockId, StorageLevel} import org.apache.spark.util.{BoundedPriorityQueue, Utils} -import org.apache.spark.util.collection.{OpenHashMap, Utils => collectionUtils} +import org.apache.spark.util.collection.{ExternalAppendOnlyMap, OpenHashMap, + Utils => collectionUtils} import org.apache.spark.util.random.{BernoulliCellSampler, BernoulliSampler, PoissonSampler, SamplingUtils} @@ -396,7 +397,20 @@ abstract class RDD[T: ClassTag]( * Return a new RDD containing the distinct elements in this RDD. 
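The switch to Locale.ROOT here and in StatsdSink above guards against locale-sensitive case folding: under a Turkish default locale, "I".toLowerCase yields a dotless ı, which silently breaks comparisons of ASCII identifiers and unit names. A small locale-independent ordering in plain Scala, mirroring the doc example above:

```scala
import java.util.Locale

object CaseInsensitive {
  // Locale-independent ordering: stable for config keys and identifiers
  // regardless of the JVM's default locale (e.g. Turkish dotless-i folding).
  implicit val rootLowerCaseOrdering: Ordering[String] = new Ordering[String] {
    override def compare(a: String, b: String): Int =
      a.toLowerCase(Locale.ROOT).compare(b.toLowerCase(Locale.ROOT))
  }

  def main(args: Array[String]): Unit = {
    println(List("SECONDS", "milliseconds", "Minutes").sorted(rootLowerCaseOrdering))
  }
}
```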
*/ def distinct(numPartitions: Int)(implicit ord: Ordering[T] = null): RDD[T] = withScope { - map(x => (x, null)).reduceByKey((x, y) => x, numPartitions).map(_._1) + def removeDuplicatesInPartition(partition: Iterator[T]): Iterator[T] = { + // Create an instance of external append only map which ignores values. + val map = new ExternalAppendOnlyMap[T, Null, Null]( + createCombiner = value => null, + mergeValue = (a, b) => a, + mergeCombiners = (a, b) => a) + map.insertAll(partition.map(_ -> null)) + map.iterator.map(_._1) + } + partitioner match { + case Some(p) if numPartitions == partitions.length => + mapPartitions(removeDuplicatesInPartition, preservesPartitioning = true) + case _ => map(x => (x, null)).reduceByKey((x, y) => x, numPartitions).map(_._1) + } } /** diff --git a/core/src/main/scala/org/apache/spark/rdd/RDDOperationScope.scala b/core/src/main/scala/org/apache/spark/rdd/RDDOperationScope.scala index 53d69ba26811..3abb2d8a11f3 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDDOperationScope.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDDOperationScope.scala @@ -41,7 +41,7 @@ import org.apache.spark.internal.Logging * There is no particular relationship between an operation scope and a stage or a job. * A scope may live inside one stage (e.g. map) or span across multiple jobs (e.g. take). */ -@JsonInclude(Include.NON_NULL) +@JsonInclude(Include.NON_ABSENT) @JsonPropertyOrder(Array("id", "name", "parent")) private[spark] class RDDOperationScope( val name: String, diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index 50c91da8b13d..f93d8a8d5de5 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -35,7 +35,7 @@ import org.apache.commons.lang3.SerializationUtils import org.apache.spark._ import org.apache.spark.broadcast.Broadcast -import org.apache.spark.executor.TaskMetrics +import org.apache.spark.executor.{ExecutorMetrics, TaskMetrics} import org.apache.spark.internal.Logging import org.apache.spark.internal.config import org.apache.spark.network.util.JavaUtils @@ -264,8 +264,11 @@ private[spark] class DAGScheduler( execId: String, // (taskId, stageId, stageAttemptId, accumUpdates) accumUpdates: Array[(Long, Int, Int, Seq[AccumulableInfo])], - blockManagerId: BlockManagerId): Boolean = { - listenerBus.post(SparkListenerExecutorMetricsUpdate(execId, accumUpdates)) + blockManagerId: BlockManagerId, + // executor metrics indexed by MetricGetter.values + executorUpdates: ExecutorMetrics): Boolean = { + listenerBus.post(SparkListenerExecutorMetricsUpdate(execId, accumUpdates, + Some(executorUpdates))) blockManagerMaster.driverEndpoint.askSync[Boolean]( BlockManagerHeartbeat(blockManagerId), new RpcTimeout(600 seconds, "BlockManagerHeartbeat")) } @@ -1242,9 +1245,10 @@ private[spark] class DAGScheduler( private def updateAccumulators(event: CompletionEvent): Unit = { val task = event.task val stage = stageIdToStage(task.stageId) - try { - event.accumUpdates.foreach { updates => - val id = updates.id + + event.accumUpdates.foreach { updates => + val id = updates.id + try { // Find the corresponding accumulator on the driver and update it val acc: AccumulatorV2[Any, Any] = AccumulatorContext.get(id) match { case Some(accum) => accum.asInstanceOf[AccumulatorV2[Any, Any]] @@ -1258,10 +1262,17 @@ private[spark] class DAGScheduler( event.taskInfo.setAccumulables( 
acc.toInfo(Some(updates.value), Some(acc.value)) +: event.taskInfo.accumulables) } + } catch { + case NonFatal(e) => + // Log the class name to make it easy to find the bad implementation + val accumClassName = AccumulatorContext.get(id) match { + case Some(accum) => accum.getClass.getName + case None => "Unknown class" + } + logError( + s"Failed to update accumulator $id ($accumClassName) for task ${task.partitionId}", + e) } - } catch { - case NonFatal(e) => - logError(s"Failed to update accumulators for task ${task.partitionId}", e) } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala index 69bc51c1ecf9..1629e1797977 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala @@ -23,8 +23,7 @@ import java.nio.charset.StandardCharsets import java.util.EnumSet import java.util.Locale -import scala.collection.mutable -import scala.collection.mutable.ArrayBuffer +import scala.collection.mutable.{ArrayBuffer, Map} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, FSDataOutputStream, Path} @@ -36,6 +35,7 @@ import org.json4s.jackson.JsonMethods._ import org.apache.spark.{SPARK_VERSION, SparkConf} import org.apache.spark.deploy.SparkHadoopUtil +import org.apache.spark.executor.ExecutorMetrics import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ import org.apache.spark.io.CompressionCodec @@ -51,6 +51,7 @@ import org.apache.spark.util.{JsonProtocol, Utils} * spark.eventLog.overwrite - Whether to overwrite any existing files. * spark.eventLog.dir - Path to the directory in which events are logged. * spark.eventLog.buffer.kb - Buffer size to use when writing to output streams + * spark.eventLog.logStageExecutorMetrics.enabled - Whether to log stage executor metrics */ private[spark] class EventLoggingListener( appId: String, @@ -69,6 +70,7 @@ private[spark] class EventLoggingListener( private val shouldCompress = sparkConf.get(EVENT_LOG_COMPRESS) private val shouldOverwrite = sparkConf.get(EVENT_LOG_OVERWRITE) private val shouldLogBlockUpdates = sparkConf.get(EVENT_LOG_BLOCK_UPDATES) + private val shouldLogStageExecutorMetrics = sparkConf.get(EVENT_LOG_STAGE_EXECUTOR_METRICS) private val testing = sparkConf.get(EVENT_LOG_TESTING) private val outputBufferSize = sparkConf.get(EVENT_LOG_OUTPUT_BUFFER_SIZE).toInt private val fileSystem = Utils.getHadoopFileSystem(logBaseDir, hadoopConf) @@ -93,6 +95,9 @@ private[spark] class EventLoggingListener( // Visible for tests only. private[scheduler] val logPath = getLogPath(logBaseDir, appId, appAttemptId, compressionCodecName) + // map of (stageId, stageAttempt), to peak executor metrics for the stage + private val liveStageExecutorMetrics = Map.empty[(Int, Int), Map[String, ExecutorMetrics]] + /** * Creates the log file in the configured log directory. 
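The updateAccumulators rewrite above moves the try/catch inside the per-update loop, so one misbehaving accumulator no longer aborts the remaining updates, and the log now names the offending accumulator class. The pattern in isolation, with hypothetical helper names:

```scala
import scala.util.control.NonFatal

object IsolatedUpdates {
  /** Apply `update` to every element, reporting failures per element instead of
   *  letting the first bad one abort the whole batch. */
  def applyEach[T](items: Seq[T])(update: T => Unit)(report: (T, Throwable) => Unit): Unit = {
    items.foreach { item =>
      try {
        update(item)
      } catch {
        case NonFatal(e) => report(item, e) // keep going; only this element is skipped
      }
    }
  }
}
```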
*/ @@ -155,7 +160,14 @@ private[spark] class EventLoggingListener( } // Events that do not trigger a flush - override def onStageSubmitted(event: SparkListenerStageSubmitted): Unit = logEvent(event) + override def onStageSubmitted(event: SparkListenerStageSubmitted): Unit = { + logEvent(event) + if (shouldLogStageExecutorMetrics) { + // record the peak metrics for the new stage + liveStageExecutorMetrics.put((event.stageInfo.stageId, event.stageInfo.attemptNumber()), + Map.empty[String, ExecutorMetrics]) + } + } override def onTaskStart(event: SparkListenerTaskStart): Unit = logEvent(event) @@ -169,6 +181,26 @@ private[spark] class EventLoggingListener( // Events that trigger a flush override def onStageCompleted(event: SparkListenerStageCompleted): Unit = { + if (shouldLogStageExecutorMetrics) { + // clear out any previous attempts, that did not have a stage completed event + val prevAttemptId = event.stageInfo.attemptNumber() - 1 + for (attemptId <- 0 to prevAttemptId) { + liveStageExecutorMetrics.remove((event.stageInfo.stageId, attemptId)) + } + + // log the peak executor metrics for the stage, for each live executor, + // whether or not the executor is running tasks for the stage + val executorOpt = liveStageExecutorMetrics.remove( + (event.stageInfo.stageId, event.stageInfo.attemptNumber())) + executorOpt.foreach { execMap => + execMap.foreach { case (executorId, peakExecutorMetrics) => + logEvent(new SparkListenerStageExecutorMetrics(executorId, event.stageInfo.stageId, + event.stageInfo.attemptNumber(), peakExecutorMetrics)) + } + } + } + + // log stage completed event logEvent(event, flushLogger = true) } @@ -234,8 +266,18 @@ private[spark] class EventLoggingListener( } } - // No-op because logging every update would be overkill - override def onExecutorMetricsUpdate(event: SparkListenerExecutorMetricsUpdate): Unit = { } + override def onExecutorMetricsUpdate(event: SparkListenerExecutorMetricsUpdate): Unit = { + if (shouldLogStageExecutorMetrics) { + // For the active stages, record any new peak values for the memory metrics for the executor + event.executorUpdates.foreach { executorUpdates => + liveStageExecutorMetrics.values.foreach { peakExecutorMetrics => + val peakMetrics = peakExecutorMetrics.getOrElseUpdate( + event.execId, new ExecutorMetrics()) + peakMetrics.compareAndUpdatePeakValues(executorUpdates) + } + } + } + } override def onOtherEvent(event: SparkListenerEvent): Unit = { if (event.logEvent) { @@ -296,7 +338,7 @@ private[spark] object EventLoggingListener extends Logging { private val LOG_FILE_PERMISSIONS = new FsPermission(Integer.parseInt("770", 8).toShort) // A cache for compression codecs to avoid creating the same codec many times - private val codecMap = new mutable.HashMap[String, CompressionCodec] + private val codecMap = Map.empty[String, CompressionCodec] /** * Write metadata about an event log to the given stream. diff --git a/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala b/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala index 7e1d75fe723d..0e221edf3965 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala @@ -31,8 +31,7 @@ import org.apache.spark.util.Utils /** * Result returned by a ShuffleMapTask to a scheduler. Includes the block manager address that the - * task ran on, the sizes of outputs for each reducer, and the number of outputs of the map task, - * for passing on to the reduce tasks. 
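The EventLoggingListener changes above keep a map of per-executor peaks for every live (stage, attempt), fold each heartbeat's executorUpdates into it, and emit one SparkListenerStageExecutorMetrics record per executor when the stage completes. A condensed standalone model of that bookkeeping, collapsing ExecutorMetrics down to a single Long:

```scala
import scala.collection.mutable

// Toy model of the peak bookkeeping in EventLoggingListener: the "metric" is one
// Long here instead of an ExecutorMetrics object.
class StagePeakTracker {
  // (stageId, attempt) -> executorId -> peak value seen while the stage was live
  private val live = mutable.Map.empty[(Int, Int), mutable.Map[String, Long]]

  def onStageSubmitted(stageId: Int, attempt: Int): Unit =
    live.put((stageId, attempt), mutable.Map.empty[String, Long])

  def onHeartbeat(execId: String, value: Long): Unit =
    // every live stage gets a chance to raise its recorded peak for this executor
    live.values.foreach { peaks =>
      val current = peaks.getOrElse(execId, Long.MinValue)
      if (value > current) peaks.put(execId, value)
    }

  /** Per-executor peaks for the completed attempt; the stage is dropped from tracking. */
  def onStageCompleted(stageId: Int, attempt: Int): Map[String, Long] =
    live.remove((stageId, attempt)).map(_.toMap).getOrElse(Map.empty)
}
```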
+ * task ran on as well as the sizes of outputs for each reducer, for passing on to the reduce tasks. */ private[spark] sealed trait MapStatus { /** Location where this task was run. */ @@ -45,23 +44,24 @@ private[spark] sealed trait MapStatus { * necessary for correctness, since block fetchers are allowed to skip zero-size blocks. */ def getSizeForBlock(reduceId: Int): Long - - /** - * The number of outputs for the map task. - */ - def numberOfOutput: Long } private[spark] object MapStatus { - def apply(loc: BlockManagerId, uncompressedSizes: Array[Long], numOutput: Long): MapStatus = { - if (uncompressedSizes.length > Option(SparkEnv.get) - .map(_.conf.get(config.SHUFFLE_MIN_NUM_PARTS_TO_HIGHLY_COMPRESS)) - .getOrElse(config.SHUFFLE_MIN_NUM_PARTS_TO_HIGHLY_COMPRESS.defaultValue.get)) { - HighlyCompressedMapStatus(loc, uncompressedSizes, numOutput) + /** + * Min partition number to use [[HighlyCompressedMapStatus]]. A bit ugly here because in test + * code we can't assume SparkEnv.get exists. + */ + private lazy val minPartitionsToUseHighlyCompressMapStatus = Option(SparkEnv.get) + .map(_.conf.get(config.SHUFFLE_MIN_NUM_PARTS_TO_HIGHLY_COMPRESS)) + .getOrElse(config.SHUFFLE_MIN_NUM_PARTS_TO_HIGHLY_COMPRESS.defaultValue.get) + + def apply(loc: BlockManagerId, uncompressedSizes: Array[Long]): MapStatus = { + if (uncompressedSizes.length > minPartitionsToUseHighlyCompressMapStatus) { + HighlyCompressedMapStatus(loc, uncompressedSizes) } else { - new CompressedMapStatus(loc, uncompressedSizes, numOutput) + new CompressedMapStatus(loc, uncompressedSizes) } } @@ -104,34 +104,29 @@ private[spark] object MapStatus { */ private[spark] class CompressedMapStatus( private[this] var loc: BlockManagerId, - private[this] var compressedSizes: Array[Byte], - private[this] var numOutput: Long) + private[this] var compressedSizes: Array[Byte]) extends MapStatus with Externalizable { - protected def this() = this(null, null.asInstanceOf[Array[Byte]], -1) // For deserialization only + protected def this() = this(null, null.asInstanceOf[Array[Byte]]) // For deserialization only - def this(loc: BlockManagerId, uncompressedSizes: Array[Long], numOutput: Long) { - this(loc, uncompressedSizes.map(MapStatus.compressSize), numOutput) + def this(loc: BlockManagerId, uncompressedSizes: Array[Long]) { + this(loc, uncompressedSizes.map(MapStatus.compressSize)) } override def location: BlockManagerId = loc - override def numberOfOutput: Long = numOutput - override def getSizeForBlock(reduceId: Int): Long = { MapStatus.decompressSize(compressedSizes(reduceId)) } override def writeExternal(out: ObjectOutput): Unit = Utils.tryOrIOException { loc.writeExternal(out) - out.writeLong(numOutput) out.writeInt(compressedSizes.length) out.write(compressedSizes) } override def readExternal(in: ObjectInput): Unit = Utils.tryOrIOException { loc = BlockManagerId(in) - numOutput = in.readLong() val len = in.readInt() compressedSizes = new Array[Byte](len) in.readFully(compressedSizes) @@ -154,20 +149,17 @@ private[spark] class HighlyCompressedMapStatus private ( private[this] var numNonEmptyBlocks: Int, private[this] var emptyBlocks: RoaringBitmap, private[this] var avgSize: Long, - private var hugeBlockSizes: Map[Int, Byte], - private[this] var numOutput: Long) + private var hugeBlockSizes: Map[Int, Byte]) extends MapStatus with Externalizable { // loc could be null when the default constructor is called during deserialization require(loc == null || avgSize > 0 || hugeBlockSizes.size > 0 || numNonEmptyBlocks == 0, "Average size can 
only be zero for map stages that produced no output") - protected def this() = this(null, -1, null, -1, null, -1) // For deserialization only + protected def this() = this(null, -1, null, -1, null) // For deserialization only override def location: BlockManagerId = loc - override def numberOfOutput: Long = numOutput - override def getSizeForBlock(reduceId: Int): Long = { assert(hugeBlockSizes != null) if (emptyBlocks.contains(reduceId)) { @@ -182,7 +174,6 @@ private[spark] class HighlyCompressedMapStatus private ( override def writeExternal(out: ObjectOutput): Unit = Utils.tryOrIOException { loc.writeExternal(out) - out.writeLong(numOutput) emptyBlocks.writeExternal(out) out.writeLong(avgSize) out.writeInt(hugeBlockSizes.size) @@ -194,7 +185,6 @@ private[spark] class HighlyCompressedMapStatus private ( override def readExternal(in: ObjectInput): Unit = Utils.tryOrIOException { loc = BlockManagerId(in) - numOutput = in.readLong() emptyBlocks = new RoaringBitmap() emptyBlocks.readExternal(in) avgSize = in.readLong() @@ -210,10 +200,7 @@ private[spark] class HighlyCompressedMapStatus private ( } private[spark] object HighlyCompressedMapStatus { - def apply( - loc: BlockManagerId, - uncompressedSizes: Array[Long], - numOutput: Long): HighlyCompressedMapStatus = { + def apply(loc: BlockManagerId, uncompressedSizes: Array[Long]): HighlyCompressedMapStatus = { // We must keep track of which blocks are empty so that we don't report a zero-sized // block as being non-empty (or vice-versa) when using the average block size. var i = 0 @@ -254,6 +241,6 @@ private[spark] object HighlyCompressedMapStatus { emptyBlocks.trim() emptyBlocks.runOptimize() new HighlyCompressedMapStatus(loc, numNonEmptyBlocks, emptyBlocks, avgSize, - hugeBlockSizesArray.toMap, numOutput) + hugeBlockSizesArray.toMap) } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala b/core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala index 8a112f6a37b9..293e8369677f 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala @@ -26,7 +26,7 @@ import com.fasterxml.jackson.annotation.JsonTypeInfo import org.apache.spark.{SparkConf, TaskEndReason} import org.apache.spark.annotation.DeveloperApi -import org.apache.spark.executor.TaskMetrics +import org.apache.spark.executor.{ExecutorMetrics, TaskMetrics} import org.apache.spark.scheduler.cluster.ExecutorInfo import org.apache.spark.storage.{BlockManagerId, BlockUpdatedInfo} import org.apache.spark.ui.SparkUI @@ -160,11 +160,29 @@ case class SparkListenerBlockUpdated(blockUpdatedInfo: BlockUpdatedInfo) extends * Periodic updates from executors. * @param execId executor id * @param accumUpdates sequence of (taskId, stageId, stageAttemptId, accumUpdates) + * @param executorUpdates executor level metrics updates */ @DeveloperApi case class SparkListenerExecutorMetricsUpdate( execId: String, - accumUpdates: Seq[(Long, Int, Int, Seq[AccumulableInfo])]) + accumUpdates: Seq[(Long, Int, Int, Seq[AccumulableInfo])], + executorUpdates: Option[ExecutorMetrics] = None) + extends SparkListenerEvent + +/** + * Peak metric values for the executor for the stage, written to the history log at stage + * completion. 
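Removing numOutput from the two MapStatus implementations above also removes it from both writeExternal and readExternal: the write and read paths must stay field-for-field symmetric, or every subsequent field is deserialized from the wrong offset. A toy illustration of that Externalizable contract (names are made up):

```scala
import java.io.{Externalizable, ObjectInput, ObjectOutput}

// Toy Externalizable payload: writeExternal and readExternal must emit/consume
// fields in the same order and width, so dropping a field means touching both.
class BlockSizes(private var loc: String, private var sizes: Array[Byte])
    extends Externalizable {

  def this() = this(null, null) // required no-arg constructor for deserialization

  override def writeExternal(out: ObjectOutput): Unit = {
    out.writeUTF(loc)
    out.writeInt(sizes.length)
    out.write(sizes)
  }

  override def readExternal(in: ObjectInput): Unit = {
    loc = in.readUTF()
    val len = in.readInt()
    sizes = new Array[Byte](len)
    in.readFully(sizes)
  }
}
```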
+ * @param execId executor id + * @param stageId stage id + * @param stageAttemptId stage attempt + * @param executorMetrics executor level metrics, indexed by MetricGetter.values + */ +@DeveloperApi +case class SparkListenerStageExecutorMetrics( + execId: String, + stageId: Int, + stageAttemptId: Int, + executorMetrics: ExecutorMetrics) extends SparkListenerEvent @DeveloperApi @@ -264,6 +282,13 @@ private[spark] trait SparkListenerInterface { */ def onExecutorMetricsUpdate(executorMetricsUpdate: SparkListenerExecutorMetricsUpdate): Unit + /** + * Called with the peak memory metrics for a given (executor, stage) combination. Note that this + * is only present when reading from the event log (as in the history server), and is never + * called in a live application. + */ + def onStageExecutorMetrics(executorMetrics: SparkListenerStageExecutorMetrics): Unit + /** * Called when the driver registers a new executor. */ @@ -361,6 +386,9 @@ abstract class SparkListener extends SparkListenerInterface { override def onExecutorMetricsUpdate( executorMetricsUpdate: SparkListenerExecutorMetricsUpdate): Unit = { } + override def onStageExecutorMetrics( + executorMetrics: SparkListenerStageExecutorMetrics): Unit = { } + override def onExecutorAdded(executorAdded: SparkListenerExecutorAdded): Unit = { } override def onExecutorRemoved(executorRemoved: SparkListenerExecutorRemoved): Unit = { } diff --git a/core/src/main/scala/org/apache/spark/scheduler/SparkListenerBus.scala b/core/src/main/scala/org/apache/spark/scheduler/SparkListenerBus.scala index ff19cc65552e..8f6b7ad30960 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/SparkListenerBus.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/SparkListenerBus.scala @@ -57,6 +57,8 @@ private[spark] trait SparkListenerBus listener.onApplicationEnd(applicationEnd) case metricsUpdate: SparkListenerExecutorMetricsUpdate => listener.onExecutorMetricsUpdate(metricsUpdate) + case stageExecutorMetrics: SparkListenerStageExecutorMetrics => + listener.onStageExecutorMetrics(stageExecutorMetrics) case executorAdded: SparkListenerExecutorAdded => listener.onExecutorAdded(executorAdded) case executorRemoved: SparkListenerExecutorRemoved => diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala index 95f7ae4fd39a..94221eb0d551 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala @@ -17,6 +17,7 @@ package org.apache.spark.scheduler +import org.apache.spark.executor.ExecutorMetrics import org.apache.spark.scheduler.SchedulingMode.SchedulingMode import org.apache.spark.storage.BlockManagerId import org.apache.spark.util.AccumulatorV2 @@ -74,14 +75,15 @@ private[spark] trait TaskScheduler { def defaultParallelism(): Int /** - * Update metrics for in-progress tasks and let the master know that the BlockManager is still - * alive. Return true if the driver knows about the given block manager. Otherwise, return false, - * indicating that the block manager should re-register. + * Update metrics for in-progress tasks and executor metrics, and let the master know that the + * BlockManager is still alive. Return true if the driver knows about the given block manager. + * Otherwise, return false, indicating that the block manager should re-register. 
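SparkListenerStageExecutorMetrics and the new executorUpdates field are ordinary listener events, so user code can consume them from a plain SparkListener. A sketch, assuming the ExecutorMetricType case objects and ExecutorMetrics.getMetricValue remain visible to listeners as declared above; per the interface doc, onStageExecutorMetrics only fires when replaying event logs, never in a live application.

```scala
import org.apache.spark.executor.ExecutorMetrics
import org.apache.spark.metrics.JVMHeapMemory
import org.apache.spark.scheduler.{SparkListener, SparkListenerStageExecutorMetrics}

// Reports per-stage executor heap peaks while an event log is being replayed.
class HeapPeakListener extends SparkListener {
  override def onStageExecutorMetrics(event: SparkListenerStageExecutorMetrics): Unit = {
    val metrics: ExecutorMetrics = event.executorMetrics
    if (metrics.isSet()) {
      val peakHeap = metrics.getMetricValue(JVMHeapMemory)
      println(s"stage ${event.stageId}.${event.stageAttemptId} " +
        s"executor ${event.execId}: peak JVM heap $peakHeap bytes")
    }
  }
}
```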
*/ def executorHeartbeatReceived( execId: String, accumUpdates: Array[(Long, Seq[AccumulatorV2[_, _]])], - blockManagerId: BlockManagerId): Boolean + blockManagerId: BlockManagerId, + executorUpdates: ExecutorMetrics): Boolean /** * Get an application ID associated with the job. diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala index 8b7117066863..4f870e85ad38 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala @@ -28,6 +28,7 @@ import scala.util.Random import org.apache.spark._ import org.apache.spark.TaskState.TaskState +import org.apache.spark.executor.ExecutorMetrics import org.apache.spark.internal.Logging import org.apache.spark.internal.config import org.apache.spark.rpc.RpcEndpoint @@ -508,14 +509,15 @@ private[spark] class TaskSchedulerImpl( } /** - * Update metrics for in-progress tasks and let the master know that the BlockManager is still - * alive. Return true if the driver knows about the given block manager. Otherwise, return false, - * indicating that the block manager should re-register. + * Update metrics for in-progress tasks and executor metrics, and let the master know that the + * BlockManager is still alive. Return true if the driver knows about the given block manager. + * Otherwise, return false, indicating that the block manager should re-register. */ override def executorHeartbeatReceived( execId: String, accumUpdates: Array[(Long, Seq[AccumulatorV2[_, _]])], - blockManagerId: BlockManagerId): Boolean = { + blockManagerId: BlockManagerId, + executorMetrics: ExecutorMetrics): Boolean = { // (taskId, stageId, stageAttemptId, accumUpdates) val accumUpdatesWithTaskIds: Array[(Long, Int, Int, Seq[AccumulableInfo])] = { accumUpdates.flatMap { case (id, updates) => @@ -525,7 +527,8 @@ private[spark] class TaskSchedulerImpl( } } } - dagScheduler.executorHeartbeatReceived(execId, accumUpdatesWithTaskIds, blockManagerId) + dagScheduler.executorHeartbeatReceived(execId, accumUpdatesWithTaskIds, blockManagerId, + executorMetrics) } def handleTaskGettingResult(taskSetManager: TaskSetManager, tid: Long): Unit = synchronized { diff --git a/core/src/main/scala/org/apache/spark/security/CryptoStreamUtils.scala b/core/src/main/scala/org/apache/spark/security/CryptoStreamUtils.scala index 00621976b77f..18b735b8035a 100644 --- a/core/src/main/scala/org/apache/spark/security/CryptoStreamUtils.scala +++ b/core/src/main/scala/org/apache/spark/security/CryptoStreamUtils.scala @@ -16,7 +16,7 @@ */ package org.apache.spark.security -import java.io.{InputStream, OutputStream} +import java.io.{Closeable, InputStream, IOException, OutputStream} import java.nio.ByteBuffer import java.nio.channels.{ReadableByteChannel, WritableByteChannel} import java.util.Properties @@ -54,8 +54,10 @@ private[spark] object CryptoStreamUtils extends Logging { val params = new CryptoParams(key, sparkConf) val iv = createInitializationVector(params.conf) os.write(iv) - new CryptoOutputStream(params.transformation, params.conf, os, params.keySpec, - new IvParameterSpec(iv)) + new ErrorHandlingOutputStream( + new CryptoOutputStream(params.transformation, params.conf, os, params.keySpec, + new IvParameterSpec(iv)), + os) } /** @@ -70,8 +72,10 @@ private[spark] object CryptoStreamUtils extends Logging { val helper = new CryptoHelperChannel(channel) helper.write(ByteBuffer.wrap(iv)) - new 
CryptoOutputStream(params.transformation, params.conf, helper, params.keySpec, - new IvParameterSpec(iv)) + new ErrorHandlingWritableChannel( + new CryptoOutputStream(params.transformation, params.conf, helper, params.keySpec, + new IvParameterSpec(iv)), + helper) } /** @@ -84,8 +88,10 @@ private[spark] object CryptoStreamUtils extends Logging { val iv = new Array[Byte](IV_LENGTH_IN_BYTES) ByteStreams.readFully(is, iv) val params = new CryptoParams(key, sparkConf) - new CryptoInputStream(params.transformation, params.conf, is, params.keySpec, - new IvParameterSpec(iv)) + new ErrorHandlingInputStream( + new CryptoInputStream(params.transformation, params.conf, is, params.keySpec, + new IvParameterSpec(iv)), + is) } /** @@ -100,8 +106,10 @@ private[spark] object CryptoStreamUtils extends Logging { JavaUtils.readFully(channel, buf) val params = new CryptoParams(key, sparkConf) - new CryptoInputStream(params.transformation, params.conf, channel, params.keySpec, - new IvParameterSpec(iv)) + new ErrorHandlingReadableChannel( + new CryptoInputStream(params.transformation, params.conf, channel, params.keySpec, + new IvParameterSpec(iv)), + channel) } def toCryptoConf(conf: SparkConf): Properties = { @@ -157,6 +165,117 @@ private[spark] object CryptoStreamUtils extends Logging { } + /** + * SPARK-25535. The commons-cryto library will throw InternalError if something goes + * wrong, and leave bad state behind in the Java wrappers, so it's not safe to use them + * afterwards. This wrapper detects that situation and avoids further calls into the + * commons-crypto code, while still allowing the underlying streams to be closed. + * + * This should be removed once CRYPTO-141 is fixed (and Spark upgrades its commons-crypto + * dependency). + */ + trait BaseErrorHandler extends Closeable { + + private var closed = false + + /** The encrypted stream that may get into an unhealthy state. */ + protected def cipherStream: Closeable + + /** + * The underlying stream that is being wrapped by the encrypted stream, so that it can be + * closed even if there's an error in the crypto layer. + */ + protected def original: Closeable + + protected def safeCall[T](fn: => T): T = { + if (closed) { + throw new IOException("Cipher stream is closed.") + } + try { + fn + } catch { + case ie: InternalError => + closed = true + original.close() + throw ie + } + } + + override def close(): Unit = { + if (!closed) { + cipherStream.close() + } + } + + } + + // Visible for testing. 
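BaseErrorHandler above remembers the first InternalError thrown by commons-crypto, closes the underlying stream so the raw resource is not leaked, and rejects further calls into the broken wrapper. The same "poison on fatal error" guard in standalone form, for a plain OutputStream and with toy class names:

```scala
import java.io.{IOException, OutputStream}

// Toy version of the SPARK-25535 guard: after the wrapped stream throws a fatal
// error once, refuse further writes but still let close() release the resource.
class PoisoningOutputStream(inner: OutputStream, underlying: OutputStream)
    extends OutputStream {

  private var poisoned = false

  private def guarded[T](fn: => T): T = {
    if (poisoned) throw new IOException("Stream is closed after a previous fatal error.")
    try fn catch {
      case e: InternalError =>
        poisoned = true
        underlying.close() // inner may be unusable; release the raw stream directly
        throw e
    }
  }

  override def write(b: Int): Unit = guarded(inner.write(b))
  override def write(b: Array[Byte], off: Int, len: Int): Unit = guarded(inner.write(b, off, len))
  override def flush(): Unit = guarded(inner.flush())
  override def close(): Unit = if (!poisoned) inner.close()
}
```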
+ class ErrorHandlingReadableChannel( + protected val cipherStream: ReadableByteChannel, + protected val original: ReadableByteChannel) + extends ReadableByteChannel with BaseErrorHandler { + + override def read(src: ByteBuffer): Int = safeCall { + cipherStream.read(src) + } + + override def isOpen(): Boolean = cipherStream.isOpen() + + } + + private class ErrorHandlingInputStream( + protected val cipherStream: InputStream, + protected val original: InputStream) + extends InputStream with BaseErrorHandler { + + override def read(b: Array[Byte]): Int = safeCall { + cipherStream.read(b) + } + + override def read(b: Array[Byte], off: Int, len: Int): Int = safeCall { + cipherStream.read(b, off, len) + } + + override def read(): Int = safeCall { + cipherStream.read() + } + } + + private class ErrorHandlingWritableChannel( + protected val cipherStream: WritableByteChannel, + protected val original: WritableByteChannel) + extends WritableByteChannel with BaseErrorHandler { + + override def write(src: ByteBuffer): Int = safeCall { + cipherStream.write(src) + } + + override def isOpen(): Boolean = cipherStream.isOpen() + + } + + private class ErrorHandlingOutputStream( + protected val cipherStream: OutputStream, + protected val original: OutputStream) + extends OutputStream with BaseErrorHandler { + + override def flush(): Unit = safeCall { + cipherStream.flush() + } + + override def write(b: Array[Byte]): Unit = safeCall { + cipherStream.write(b) + } + + override def write(b: Array[Byte], off: Int, len: Int): Unit = safeCall { + cipherStream.write(b, off, len) + } + + override def write(b: Int): Unit = safeCall { + cipherStream.write(b) + } + } + private class CryptoParams(key: Array[Byte], sparkConf: SparkConf) { val keySpec = new SecretKeySpec(key, "AES") diff --git a/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleWriter.scala b/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleWriter.scala index 91fc26762e53..274399b9cc1f 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleWriter.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleWriter.scala @@ -70,8 +70,7 @@ private[spark] class SortShuffleWriter[K, V, C]( val blockId = ShuffleBlockId(dep.shuffleId, mapId, IndexShuffleBlockResolver.NOOP_REDUCE_ID) val partitionLengths = sorter.writePartitionedFile(blockId, tmp) shuffleBlockResolver.writeIndexFileAndCommit(dep.shuffleId, mapId, partitionLengths, tmp) - mapStatus = MapStatus(blockManager.shuffleServerId, partitionLengths, - writeMetrics.recordsWritten) + mapStatus = MapStatus(blockManager.shuffleServerId, partitionLengths) } finally { if (tmp.exists() && !tmp.delete()) { logError(s"Error while deleting temp file ${tmp.getAbsolutePath}") diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala b/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala index 91b75e485299..36aaf67b5729 100644 --- a/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala +++ b/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala @@ -25,7 +25,7 @@ import scala.collection.JavaConverters._ import scala.collection.mutable.HashMap import org.apache.spark._ -import org.apache.spark.executor.TaskMetrics +import org.apache.spark.executor.{ExecutorMetrics, TaskMetrics} import org.apache.spark.internal.Logging import org.apache.spark.scheduler._ import org.apache.spark.status.api.v1 @@ -66,6 +66,7 @@ private[spark] class AppStatusListener( private val liveStages = new ConcurrentHashMap[(Int, 
Int), LiveStage]() private val liveJobs = new HashMap[Int, LiveJob]() private val liveExecutors = new HashMap[String, LiveExecutor]() + private val deadExecutors = new HashMap[String, LiveExecutor]() private val liveTasks = new HashMap[Long, LiveTask]() private val liveRDDs = new HashMap[Int, LiveRDD]() private val pools = new HashMap[String, SchedulerPool]() @@ -204,6 +205,19 @@ private[spark] class AppStatusListener( update(rdd, now) } } + if (isExecutorActiveForLiveStages(exec)) { + // the executor was running for a currently active stage, so save it for now in + // deadExecutors, and remove when there are no active stages overlapping with the + // executor. + deadExecutors.put(event.executorId, exec) + } + } + } + + /** Was the specified executor active for any currently live stages? */ + private def isExecutorActiveForLiveStages(exec: LiveExecutor): Boolean = { + liveStages.values.asScala.exists { stage => + stage.info.submissionTime.getOrElse(0L) < exec.removeTime.getTime } } @@ -374,10 +388,11 @@ private[spark] class AppStatusListener( job.completionTime = if (event.time > 0) Some(new Date(event.time)) else None update(job, now, last = true) + if (job.status == JobExecutionStatus.SUCCEEDED) { + appSummary = new AppSummary(appSummary.numCompletedJobs + 1, appSummary.numCompletedStages) + kvstore.write(appSummary) + } } - - appSummary = new AppSummary(appSummary.numCompletedJobs + 1, appSummary.numCompletedStages) - kvstore.write(appSummary) } override def onStageSubmitted(event: SparkListenerStageSubmitted): Unit = { @@ -639,10 +654,14 @@ private[spark] class AppStatusListener( if (removeStage) { liveStages.remove((event.stageInfo.stageId, event.stageInfo.attemptNumber)) } + if (stage.status == v1.StageStatus.COMPLETE) { + appSummary = new AppSummary(appSummary.numCompletedJobs, appSummary.numCompletedStages + 1) + kvstore.write(appSummary) + } } - appSummary = new AppSummary(appSummary.numCompletedJobs, appSummary.numCompletedStages + 1) - kvstore.write(appSummary) + // remove any dead executors that were not running for any currently active stages + deadExecutors.retain((execId, exec) => isExecutorActiveForLiveStages(exec)) } private def removeBlackListedStageFrom(exec: LiveExecutor, stageId: Int, now: Long) = { @@ -669,7 +688,37 @@ private[spark] class AppStatusListener( } override def onUnpersistRDD(event: SparkListenerUnpersistRDD): Unit = { - liveRDDs.remove(event.rddId) + liveRDDs.remove(event.rddId).foreach { liveRDD => + val storageLevel = liveRDD.info.storageLevel + + // Use RDD partition info to update executor block info. + liveRDD.getPartitions().foreach { case (_, part) => + part.executors.foreach { executorId => + liveExecutors.get(executorId).foreach { exec => + exec.rddBlocks = exec.rddBlocks - 1 + } + } + } + + val now = System.nanoTime() + + // Use RDD distribution to update executor memory and disk usage info. 
+ liveRDD.getDistributions().foreach { case (executorId, rddDist) => + liveExecutors.get(executorId).foreach { exec => + if (exec.hasMemoryInfo) { + if (storageLevel.useOffHeap) { + exec.usedOffHeap = addDeltaToValue(exec.usedOffHeap, -rddDist.offHeapUsed) + } else { + exec.usedOnHeap = addDeltaToValue(exec.usedOnHeap, -rddDist.onHeapUsed) + } + } + exec.memoryUsed = addDeltaToValue(exec.memoryUsed, -rddDist.memoryUsed) + exec.diskUsed = addDeltaToValue(exec.diskUsed, -rddDist.diskUsed) + maybeUpdate(exec, now) + } + } + } + kvstore.delete(classOf[RDDStorageInfoWrapper], event.rddId) } @@ -692,6 +741,31 @@ private[spark] class AppStatusListener( } } } + + // check if there is a new peak value for any of the executor level memory metrics + // for the live UI. SparkListenerExecutorMetricsUpdate events are only processed + // for the live UI. + event.executorUpdates.foreach { updates => + liveExecutors.get(event.execId).foreach { exec => + if (exec.peakExecutorMetrics.compareAndUpdatePeakValues(updates)) { + maybeUpdate(exec, now) + } + } + } + } + + override def onStageExecutorMetrics(executorMetrics: SparkListenerStageExecutorMetrics): Unit = { + val now = System.nanoTime() + + // check if there is a new peak value for any of the executor level memory metrics, + // while reading from the log. SparkListenerStageExecutorMetrics are only processed + // when reading logs. + liveExecutors.get(executorMetrics.execId) + .orElse(deadExecutors.get(executorMetrics.execId)).map { exec => + if (exec.peakExecutorMetrics.compareAndUpdatePeakValues(executorMetrics.executorMetrics)) { + update(exec, now) + } + } } override def onBlockUpdated(event: SparkListenerBlockUpdated): Unit = { @@ -728,6 +802,11 @@ private[spark] class AppStatusListener( .sortBy(_.stageId) } + /** + * Apply a delta to a value, but ensure that it doesn't go negative. + */ + private def addDeltaToValue(old: Long, delta: Long): Long = math.max(0, old + delta) + private def updateRDDBlock(event: SparkListenerBlockUpdated, block: RDDBlockId): Unit = { val now = System.nanoTime() val executorId = event.blockUpdatedInfo.blockManagerId.executorId @@ -737,9 +816,6 @@ private[spark] class AppStatusListener( val diskDelta = event.blockUpdatedInfo.diskSize * (if (storageLevel.useDisk) 1 else -1) val memoryDelta = event.blockUpdatedInfo.memSize * (if (storageLevel.useMemory) 1 else -1) - // Function to apply a delta to a value, but ensure that it doesn't go negative. - def newValue(old: Long, delta: Long): Long = math.max(0, old + delta) - val updatedStorageLevel = if (storageLevel.isValid) { Some(storageLevel.description) } else { @@ -756,13 +832,13 @@ private[spark] class AppStatusListener( maybeExec.foreach { exec => if (exec.hasMemoryInfo) { if (storageLevel.useOffHeap) { - exec.usedOffHeap = newValue(exec.usedOffHeap, memoryDelta) + exec.usedOffHeap = addDeltaToValue(exec.usedOffHeap, memoryDelta) } else { - exec.usedOnHeap = newValue(exec.usedOnHeap, memoryDelta) + exec.usedOnHeap = addDeltaToValue(exec.usedOnHeap, memoryDelta) } } - exec.memoryUsed = newValue(exec.memoryUsed, memoryDelta) - exec.diskUsed = newValue(exec.diskUsed, diskDelta) + exec.memoryUsed = addDeltaToValue(exec.memoryUsed, memoryDelta) + exec.diskUsed = addDeltaToValue(exec.diskUsed, diskDelta) } // Update the block entry in the RDD info, keeping track of the deltas above so that we @@ -790,8 +866,8 @@ private[spark] class AppStatusListener( // Only update the partition if it's still stored in some executor, otherwise get rid of it. 
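The new deadExecutors map above keeps a removed executor around while any live stage's submission time precedes its removal, so peak metrics replayed from the log can still be attributed to it, and the map is pruned with retain as stages complete. A simplified model of that retention rule:

```scala
import scala.collection.mutable

// Simplified model of the deadExecutors bookkeeping: keep a removed executor
// around while any live stage was submitted before the executor went away.
class ExecutorRetention {
  case class GoneExecutor(id: String, removeTimeMs: Long)

  private val liveStageSubmissionTimes = mutable.Map.empty[(Int, Int), Long]
  private val dead = mutable.Map.empty[String, GoneExecutor]

  private def overlapsLiveStage(exec: GoneExecutor): Boolean =
    liveStageSubmissionTimes.values.exists(_ < exec.removeTimeMs)

  def onStageSubmitted(stageId: Int, attempt: Int, submittedMs: Long): Unit =
    liveStageSubmissionTimes.put((stageId, attempt), submittedMs)

  def onExecutorRemoved(exec: GoneExecutor): Unit =
    if (overlapsLiveStage(exec)) dead.put(exec.id, exec)

  def onStageCompleted(stageId: Int, attempt: Int): Unit = {
    liveStageSubmissionTimes.remove((stageId, attempt))
    // Drop executors that no remaining live stage overlaps with.
    dead.retain((_, exec) => overlapsLiveStage(exec))
  }
}
```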
if (executors.nonEmpty) { partition.update(executors, rdd.storageLevel, - newValue(partition.memoryUsed, memoryDelta), - newValue(partition.diskUsed, diskDelta)) + addDeltaToValue(partition.memoryUsed, memoryDelta), + addDeltaToValue(partition.diskUsed, diskDelta)) } else { rdd.removePartition(block.name) } @@ -799,14 +875,14 @@ private[spark] class AppStatusListener( maybeExec.foreach { exec => if (exec.rddBlocks + rddBlocksDelta > 0) { val dist = rdd.distribution(exec) - dist.memoryUsed = newValue(dist.memoryUsed, memoryDelta) - dist.diskUsed = newValue(dist.diskUsed, diskDelta) + dist.memoryUsed = addDeltaToValue(dist.memoryUsed, memoryDelta) + dist.diskUsed = addDeltaToValue(dist.diskUsed, diskDelta) if (exec.hasMemoryInfo) { if (storageLevel.useOffHeap) { - dist.offHeapUsed = newValue(dist.offHeapUsed, memoryDelta) + dist.offHeapUsed = addDeltaToValue(dist.offHeapUsed, memoryDelta) } else { - dist.onHeapUsed = newValue(dist.onHeapUsed, memoryDelta) + dist.onHeapUsed = addDeltaToValue(dist.onHeapUsed, memoryDelta) } } dist.lastUpdate = null @@ -825,8 +901,8 @@ private[spark] class AppStatusListener( } } - rdd.memoryUsed = newValue(rdd.memoryUsed, memoryDelta) - rdd.diskUsed = newValue(rdd.diskUsed, diskDelta) + rdd.memoryUsed = addDeltaToValue(rdd.memoryUsed, memoryDelta) + rdd.diskUsed = addDeltaToValue(rdd.diskUsed, diskDelta) update(rdd, now) } diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala b/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala index e237281c552b..9839cbb99f86 100644 --- a/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala +++ b/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala @@ -112,10 +112,12 @@ private[spark] class AppStatusStore( } } - def stageAttempt(stageId: Int, stageAttemptId: Int, details: Boolean = false): v1.StageData = { + def stageAttempt(stageId: Int, stageAttemptId: Int, + details: Boolean = false): (v1.StageData, Seq[Int]) = { val stageKey = Array(stageId, stageAttemptId) - val stage = store.read(classOf[StageDataWrapper], stageKey).info - if (details) stageWithDetails(stage) else stage + val stageDataWrapper = store.read(classOf[StageDataWrapper], stageKey) + val stage = if (details) stageWithDetails(stageDataWrapper.info) else stageDataWrapper.info + (stage, stageDataWrapper.jobIds.toSeq) } def taskCount(stageId: Int, stageAttemptId: Int): Long = { diff --git a/core/src/main/scala/org/apache/spark/status/KVUtils.scala b/core/src/main/scala/org/apache/spark/status/KVUtils.scala index 99b1843d8e1c..45348be5c98b 100644 --- a/core/src/main/scala/org/apache/spark/status/KVUtils.scala +++ b/core/src/main/scala/org/apache/spark/status/KVUtils.scala @@ -42,7 +42,7 @@ private[spark] object KVUtils extends Logging { private[spark] class KVStoreScalaSerializer extends KVStoreSerializer { mapper.registerModule(DefaultScalaModule) - mapper.setSerializationInclusion(JsonInclude.Include.NON_NULL) + mapper.setSerializationInclusion(JsonInclude.Include.NON_ABSENT) } diff --git a/core/src/main/scala/org/apache/spark/status/LiveEntity.scala b/core/src/main/scala/org/apache/spark/status/LiveEntity.scala index 79e3f13b826c..8708e64db3c1 100644 --- a/core/src/main/scala/org/apache/spark/status/LiveEntity.scala +++ b/core/src/main/scala/org/apache/spark/status/LiveEntity.scala @@ -26,14 +26,13 @@ import scala.collection.mutable.HashMap import com.google.common.collect.Interners import org.apache.spark.JobExecutionStatus -import org.apache.spark.executor.TaskMetrics +import 
org.apache.spark.executor.{ExecutorMetrics, TaskMetrics} import org.apache.spark.scheduler.{AccumulableInfo, StageInfo, TaskInfo} import org.apache.spark.status.api.v1 import org.apache.spark.storage.RDDInfo import org.apache.spark.ui.SparkUI import org.apache.spark.util.AccumulatorContext import org.apache.spark.util.collection.OpenHashSet -import org.apache.spark.util.kvstore.KVStore /** * A mutable representation of a live entity in Spark (jobs, stages, tasks, et al). Every live @@ -268,6 +267,9 @@ private class LiveExecutor(val executorId: String, _addTime: Long) extends LiveE def hasMemoryInfo: Boolean = totalOnHeap >= 0L + // peak values for executor level metrics + val peakExecutorMetrics = new ExecutorMetrics() + def hostname: String = if (host != null) host else hostPort.split(":")(0) override protected def doUpdate(): Any = { @@ -302,10 +304,10 @@ private class LiveExecutor(val executorId: String, _addTime: Long) extends LiveE Option(removeReason), executorLogs, memoryMetrics, - blacklistedInStages) + blacklistedInStages, + Some(peakExecutorMetrics).filter(_.isSet)) new ExecutorSummaryWrapper(info) } - } private class LiveExecutorStageSummary( @@ -538,6 +540,10 @@ private class LiveRDD(val info: RDDInfo) extends LiveEntity { distributions.get(exec.executorId) } + def getPartitions(): scala.collection.Map[String, LiveRDDPartition] = partitions + + def getDistributions(): scala.collection.Map[String, LiveRDDDistribution] = distributions + override protected def doUpdate(): Any = { val dists = if (distributions.nonEmpty) { Some(distributions.values.map(_.toApi()).toSeq) @@ -581,8 +587,7 @@ private object LiveEntityHelpers { .filter { acc => // We don't need to store internal or SQL accumulables as their values will be shown in // other places, so drop them to reduce the memory usage. 
- !acc.internal && (!acc.metadata.isDefined || - acc.metadata.get != Some(AccumulatorContext.SQL_ACCUM_IDENTIFIER)) + !acc.internal && acc.metadata != Some(AccumulatorContext.SQL_ACCUM_IDENTIFIER) } .map { acc => new v1.AccumulableInfo( diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/JacksonMessageWriter.scala b/core/src/main/scala/org/apache/spark/status/api/v1/JacksonMessageWriter.scala index 4560d300cb0c..50a286d0d3b0 100644 --- a/core/src/main/scala/org/apache/spark/status/api/v1/JacksonMessageWriter.scala +++ b/core/src/main/scala/org/apache/spark/status/api/v1/JacksonMessageWriter.scala @@ -49,7 +49,7 @@ private[v1] class JacksonMessageWriter extends MessageBodyWriter[Object]{ } mapper.registerModule(com.fasterxml.jackson.module.scala.DefaultScalaModule) mapper.enable(SerializationFeature.INDENT_OUTPUT) - mapper.setSerializationInclusion(JsonInclude.Include.NON_NULL) + mapper.setSerializationInclusion(JsonInclude.Include.NON_ABSENT) mapper.setDateFormat(JacksonMessageWriter.makeISODateFormat) override def isWriteable( diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/OneApplicationResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/OneApplicationResource.scala index 32100c570453..1f4082cac8f7 100644 --- a/core/src/main/scala/org/apache/spark/status/api/v1/OneApplicationResource.scala +++ b/core/src/main/scala/org/apache/spark/status/api/v1/OneApplicationResource.scala @@ -175,7 +175,7 @@ private[v1] class OneApplicationAttemptResource extends AbstractApplicationResou def getAttempt(): ApplicationAttemptInfo = { uiRoot.getApplicationInfo(appId) .flatMap { app => - app.attempts.filter(_.attemptId == attemptId).headOption + app.attempts.find(_.attemptId.contains(attemptId)) } .getOrElse { throw new NotFoundException(s"unknown app $appId, attempt $attemptId") diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/StagesResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/StagesResource.scala index 96249e4bfd5f..30d52b97833e 100644 --- a/core/src/main/scala/org/apache/spark/status/api/v1/StagesResource.scala +++ b/core/src/main/scala/org/apache/spark/status/api/v1/StagesResource.scala @@ -56,7 +56,7 @@ private[v1] class StagesResource extends BaseAppResource { @PathParam("stageAttemptId") stageAttemptId: Int, @QueryParam("details") @DefaultValue("true") details: Boolean): StageData = withUI { ui => try { - ui.store.stageAttempt(stageId, stageAttemptId, details = details) + ui.store.stageAttempt(stageId, stageAttemptId, details = details)._1 } catch { case _: NoSuchElementException => // Change the message depending on whether there are any attempts for the requested stage. 
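For context on the two Include.NON_NULL to Include.NON_ABSENT switches above (KVUtils and JacksonMessageWriter), here is a minimal standalone sketch of what the setting means for Scala Option fields; the Example case class and NonAbsentDemo object are illustrative names only, not part of the patch:

import com.fasterxml.jackson.annotation.JsonInclude
import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.module.scala.DefaultScalaModule

// Illustrative shape only: an optional field similar to the new ExecutorSummary.peakMemoryMetrics.
case class Example(id: String, peak: Option[Long])

object NonAbsentDemo {
  def main(args: Array[String]): Unit = {
    val mapper = new ObjectMapper()
    mapper.registerModule(DefaultScalaModule)
    // NON_ABSENT omits fields that are null or an empty Option/Optional, so an executor
    // without recorded peak metrics simply has no such key in the stored or served JSON.
    mapper.setSerializationInclusion(JsonInclude.Include.NON_ABSENT)
    println(mapper.writeValueAsString(Example("driver", None)))      // expected: {"id":"driver"}
    println(mapper.writeValueAsString(Example("exec-1", Some(42L)))) // expected: {"id":"exec-1","peak":42}
  }
}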
diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/api.scala b/core/src/main/scala/org/apache/spark/status/api/v1/api.scala index 971d7e90fa7b..30afd8b76972 100644 --- a/core/src/main/scala/org/apache/spark/status/api/v1/api.scala +++ b/core/src/main/scala/org/apache/spark/status/api/v1/api.scala @@ -22,9 +22,14 @@ import java.util.Date import scala.xml.{NodeSeq, Text} import com.fasterxml.jackson.annotation.JsonIgnoreProperties -import com.fasterxml.jackson.databind.annotation.JsonDeserialize +import com.fasterxml.jackson.core.{JsonGenerator, JsonParser} +import com.fasterxml.jackson.core.`type`.TypeReference +import com.fasterxml.jackson.databind.{DeserializationContext, JsonDeserializer, JsonSerializer, SerializerProvider} +import com.fasterxml.jackson.databind.annotation.{JsonDeserialize, JsonSerialize} import org.apache.spark.JobExecutionStatus +import org.apache.spark.executor.ExecutorMetrics +import org.apache.spark.metrics.ExecutorMetricType case class ApplicationInfo private[spark]( id: String, @@ -98,7 +103,10 @@ class ExecutorSummary private[spark]( val removeReason: Option[String], val executorLogs: Map[String, String], val memoryMetrics: Option[MemoryMetrics], - val blacklistedInStages: Set[Int]) + val blacklistedInStages: Set[Int], + @JsonSerialize(using = classOf[ExecutorMetricsJsonSerializer]) + @JsonDeserialize(using = classOf[ExecutorMetricsJsonDeserializer]) + val peakMemoryMetrics: Option[ExecutorMetrics]) class MemoryMetrics private[spark]( val usedOnHeapStorageMemory: Long, @@ -106,6 +114,36 @@ class MemoryMetrics private[spark]( val totalOnHeapStorageMemory: Long, val totalOffHeapStorageMemory: Long) +/** deserializer for peakMemoryMetrics: convert map to ExecutorMetrics */ +private[spark] class ExecutorMetricsJsonDeserializer + extends JsonDeserializer[Option[ExecutorMetrics]] { + override def deserialize( + jsonParser: JsonParser, + deserializationContext: DeserializationContext): Option[ExecutorMetrics] = { + val metricsMap = jsonParser.readValueAs[Option[Map[String, Long]]]( + new TypeReference[Option[Map[String, java.lang.Long]]] {}) + metricsMap.map(metrics => new ExecutorMetrics(metrics)) + } +} +/** serializer for peakMemoryMetrics: convert ExecutorMetrics to map with metric name as key */ +private[spark] class ExecutorMetricsJsonSerializer + extends JsonSerializer[Option[ExecutorMetrics]] { + override def serialize( + metrics: Option[ExecutorMetrics], + jsonGenerator: JsonGenerator, + serializerProvider: SerializerProvider): Unit = { + metrics.foreach { m: ExecutorMetrics => + val metricsMap = ExecutorMetricType.values.map { metricType => + metricType.name -> m.getMetricValue(metricType) + }.toMap + jsonGenerator.writeObject(metricsMap) + } + } + + override def isEmpty(provider: SerializerProvider, value: Option[ExecutorMetrics]): Boolean = + value.isEmpty +} + class JobData private[spark]( val jobId: Int, val name: String, diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala index f5c69ad241e3..0fe82ac0cedc 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala @@ -33,6 +33,7 @@ import scala.util.Random import scala.util.control.NonFatal import com.codahale.metrics.{MetricRegistry, MetricSet} +import com.google.common.io.CountingOutputStream import org.apache.spark._ import org.apache.spark.executor.{DataReadMethod, ShuffleWriteMetrics} @@ -43,8 +44,9 @@ import 
org.apache.spark.network._ import org.apache.spark.network.buffer.ManagedBuffer import org.apache.spark.network.client.StreamCallbackWithID import org.apache.spark.network.netty.SparkTransportConf -import org.apache.spark.network.shuffle.{ExternalShuffleClient, TempFileManager} +import org.apache.spark.network.shuffle._ import org.apache.spark.network.shuffle.protocol.ExecutorShuffleInfo +import org.apache.spark.network.util.TransportConf import org.apache.spark.rpc.RpcEnv import org.apache.spark.scheduler.ExecutorCacheTaskLocation import org.apache.spark.serializer.{SerializerInstance, SerializerManager} @@ -213,11 +215,11 @@ private[spark] class BlockManager( private var blockReplicationPolicy: BlockReplicationPolicy = _ - // A TempFileManager used to track all the files of remote blocks which above the + // A DownloadFileManager used to track all the files of remote blocks which are above the // specified memory threshold. Files will be deleted automatically based on weak reference. // Exposed for test private[storage] val remoteBlockTempFileManager = - new BlockManager.RemoteBlockTempFileManager(this) + new BlockManager.RemoteBlockDownloadFileManager(this) private val maxRemoteBlockToMem = conf.get(config.MAX_REMOTE_BLOCK_SIZE_FETCH_TO_MEM) /** @@ -436,10 +438,8 @@ private[spark] class BlockManager( // stream. channel.close() // TODO SPARK-25035 Even if we're only going to write the data to disk after this, we end up - // using a lot of memory here. With encryption, we'll read the whole file into a regular - // byte buffer and OOM. Without encryption, we'll memory map the file and won't get a jvm - // OOM, but might get killed by the OS / cluster manager. We could at least read the tmp - // file as a stream in both cases. + // using a lot of memory here. We'll read the whole file into a regular + // byte buffer and OOM. We could at least read the tmp file as a stream. val buffer = securityManager.getIOEncryptionKey() match { case Some(key) => // we need to pass in the size of the unencrypted block @@ -451,7 +451,7 @@ private[spark] class BlockManager( new EncryptedBlockData(tmpFile, blockSize, conf, key).toChunkedByteBuffer(allocator) case None => - ChunkedByteBuffer.map(tmpFile, conf.get(config.MEMORY_MAP_LIMIT_FOR_TESTS).toInt) + ChunkedByteBuffer.fromFile(tmpFile, conf.get(config.MEMORY_MAP_LIMIT_FOR_TESTS).toInt) } putBytes(blockId, buffer, level)(classTag) tmpFile.delete() @@ -724,10 +724,9 @@ private[spark] class BlockManager( */ def getRemoteBytes(blockId: BlockId): Option[ChunkedByteBuffer] = { // TODO if we change this method to return the ManagedBuffer, then getRemoteValues - // could just use the inputStream on the temp file, rather than memory-mapping the file. + // could just use the inputStream on the temp file, rather than reading the file into memory. // Until then, replication can cause the process to use too much memory and get killed - // by the OS / cluster manager (not a java OOM, since it's a memory-mapped file) even though - // we've read the data to disk. + // even though we've read the data to disk. 
logDebug(s"Getting remote block $blockId") require(blockId != null, "BlockId is null") var runningFailureCount = 0 @@ -1664,23 +1663,28 @@ private[spark] object BlockManager { metricRegistry.registerAll(metricSet) } - class RemoteBlockTempFileManager(blockManager: BlockManager) - extends TempFileManager with Logging { + class RemoteBlockDownloadFileManager(blockManager: BlockManager) + extends DownloadFileManager with Logging { + // lazy because SparkEnv is set after this + lazy val encryptionKey = SparkEnv.get.securityManager.getIOEncryptionKey() - private class ReferenceWithCleanup(file: File, referenceQueue: JReferenceQueue[File]) - extends WeakReference[File](file, referenceQueue) { - private val filePath = file.getAbsolutePath + private class ReferenceWithCleanup( + file: DownloadFile, + referenceQueue: JReferenceQueue[DownloadFile] + ) extends WeakReference[DownloadFile](file, referenceQueue) { + + val filePath = file.path() def cleanUp(): Unit = { logDebug(s"Clean up file $filePath") - if (!new File(filePath).delete()) { + if (!file.delete()) { logDebug(s"Fail to delete file $filePath") } } } - private val referenceQueue = new JReferenceQueue[File] + private val referenceQueue = new JReferenceQueue[DownloadFile] private val referenceBuffer = Collections.newSetFromMap[ReferenceWithCleanup]( new ConcurrentHashMap) @@ -1692,11 +1696,21 @@ private[spark] object BlockManager { cleaningThread.setName("RemoteBlock-temp-file-clean-thread") cleaningThread.start() - override def createTempFile(): File = { - blockManager.diskBlockManager.createTempLocalBlock()._2 + override def createTempFile(transportConf: TransportConf): DownloadFile = { + val file = blockManager.diskBlockManager.createTempLocalBlock()._2 + encryptionKey match { + case Some(key) => + // encryption is enabled, so when we read the decrypted data off the network, we need to + // encrypt it when writing to disk. Note that the data may have been encrypted when it + // was cached on disk on the remote side, but it was already decrypted by now (see + // EncryptedBlockData). + new EncryptedDownloadFile(file, key) + case None => + new SimpleDownloadFile(file, transportConf) + } } - override def registerTempFileToClean(file: File): Boolean = { + override def registerTempFileToClean(file: DownloadFile): Boolean = { referenceBuffer.add(new ReferenceWithCleanup(file, referenceQueue)) } @@ -1724,4 +1738,39 @@ private[spark] object BlockManager { } } } + + /** + * A DownloadFile that encrypts data when it is written, and decrypts when it's read. 
+ */ + private class EncryptedDownloadFile( + file: File, + key: Array[Byte]) extends DownloadFile { + + private val env = SparkEnv.get + + override def delete(): Boolean = file.delete() + + override def openForWriting(): DownloadFileWritableChannel = { + new EncryptedDownloadWritableChannel() + } + + override def path(): String = file.getAbsolutePath + + private class EncryptedDownloadWritableChannel extends DownloadFileWritableChannel { + private val countingOutput: CountingWritableChannel = new CountingWritableChannel( + Channels.newChannel(env.serializerManager.wrapForEncryption(new FileOutputStream(file)))) + + override def closeAndRead(): ManagedBuffer = { + countingOutput.close() + val size = countingOutput.getCount + new EncryptedManagedBuffer(new EncryptedBlockData(file, size, env.conf, key)) + } + + override def write(src: ByteBuffer): Int = countingOutput.write(src) + + override def isOpen: Boolean = countingOutput.isOpen() + + override def close(): Unit = countingOutput.close() + } + } } diff --git a/core/src/main/scala/org/apache/spark/storage/DiskStore.scala b/core/src/main/scala/org/apache/spark/storage/DiskStore.scala index a820bc70b33b..d88bd710d1ea 100644 --- a/core/src/main/scala/org/apache/spark/storage/DiskStore.scala +++ b/core/src/main/scala/org/apache/spark/storage/DiskStore.scala @@ -30,6 +30,7 @@ import io.netty.channel.DefaultFileRegion import org.apache.spark.{SecurityManager, SparkConf} import org.apache.spark.internal.{config, Logging} +import org.apache.spark.network.buffer.ManagedBuffer import org.apache.spark.network.util.{AbstractFileRegion, JavaUtils} import org.apache.spark.security.CryptoStreamUtils import org.apache.spark.util.Utils @@ -260,7 +261,22 @@ private class EncryptedBlockData( throw e } } +} + +private class EncryptedManagedBuffer(val blockData: EncryptedBlockData) extends ManagedBuffer { + + // This is the size of the decrypted data + override def size(): Long = blockData.size + + override def nioByteBuffer(): ByteBuffer = blockData.toByteBuffer() + + override def convertToNetty(): AnyRef = blockData.toNetty() + + override def createInputStream(): InputStream = blockData.toInputStream() + + override def retain(): ManagedBuffer = this + override def release(): ManagedBuffer = this } private class ReadableChannelFileRegion(source: ReadableByteChannel, blockSize: Long) diff --git a/core/src/main/scala/org/apache/spark/storage/RDDInfo.scala b/core/src/main/scala/org/apache/spark/storage/RDDInfo.scala index 9ccc8f9cc585..917cfab1c699 100644 --- a/core/src/main/scala/org/apache/spark/storage/RDDInfo.scala +++ b/core/src/main/scala/org/apache/spark/storage/RDDInfo.scala @@ -55,14 +55,17 @@ class RDDInfo( } private[spark] object RDDInfo { - private val callsiteForm = SparkEnv.get.conf.get(EVENT_LOG_CALLSITE_FORM) - def fromRdd(rdd: RDD[_]): RDDInfo = { val rddName = Option(rdd.name).getOrElse(Utils.getFormattedClassName(rdd)) val parentIds = rdd.dependencies.map(_.rdd.id) - val callSite = callsiteForm match { - case "short" => rdd.creationSite.shortForm - case "long" => rdd.creationSite.longForm + val callsiteLongForm = Option(SparkEnv.get) + .map(_.conf.get(EVENT_LOG_CALLSITE_LONG_FORM)) + .getOrElse(false) + + val callSite = if (callsiteLongForm) { + rdd.creationSite.longForm + } else { + rdd.creationSite.shortForm } new RDDInfo(rdd.id, rddName, rdd.partitions.length, rdd.getStorageLevel, parentIds, callSite, rdd.scope) diff --git a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala 
b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala index 00d01dd28afb..aecc2284a958 100644 --- a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala +++ b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala @@ -17,7 +17,7 @@ package org.apache.spark.storage -import java.io.{File, InputStream, IOException} +import java.io.{InputStream, IOException} import java.nio.ByteBuffer import java.util.concurrent.LinkedBlockingQueue import javax.annotation.concurrent.GuardedBy @@ -28,7 +28,8 @@ import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet, Queue} import org.apache.spark.{SparkException, TaskContext} import org.apache.spark.internal.Logging import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer} -import org.apache.spark.network.shuffle.{BlockFetchingListener, ShuffleClient, TempFileManager} +import org.apache.spark.network.shuffle._ +import org.apache.spark.network.util.TransportConf import org.apache.spark.shuffle.FetchFailedException import org.apache.spark.util.Utils import org.apache.spark.util.io.ChunkedByteBufferOutputStream @@ -71,7 +72,7 @@ final class ShuffleBlockFetcherIterator( maxBlocksInFlightPerAddress: Int, maxReqSizeShuffleToMem: Long, detectCorrupt: Boolean) - extends Iterator[(BlockId, InputStream)] with TempFileManager with Logging { + extends Iterator[(BlockId, InputStream)] with DownloadFileManager with Logging { import ShuffleBlockFetcherIterator._ @@ -150,7 +151,7 @@ final class ShuffleBlockFetcherIterator( * deleted when cleanup. This is a layer of defensiveness against disk file leaks. */ @GuardedBy("this") - private[this] val shuffleFilesSet = mutable.HashSet[File]() + private[this] val shuffleFilesSet = mutable.HashSet[DownloadFile]() initialize() @@ -164,11 +165,15 @@ final class ShuffleBlockFetcherIterator( currentResult = null } - override def createTempFile(): File = { - blockManager.diskBlockManager.createTempLocalBlock()._2 + override def createTempFile(transportConf: TransportConf): DownloadFile = { + // we never need to do any encryption or decryption here, regardless of configs, because that + // is handled at another layer in the code. When encryption is enabled, shuffle data is written + // to disk encrypted in the first place, and sent over the network still encrypted. 
+ new SimpleDownloadFile( + blockManager.diskBlockManager.createTempLocalBlock()._2, transportConf) } - override def registerTempFileToClean(file: File): Boolean = synchronized { + override def registerTempFileToClean(file: DownloadFile): Boolean = synchronized { if (isZombie) { false } else { @@ -204,7 +209,7 @@ final class ShuffleBlockFetcherIterator( } shuffleFilesSet.foreach { file => if (!file.delete()) { - logWarning("Failed to cleanup shuffle fetch temp file " + file.getAbsolutePath()) + logWarning("Failed to cleanup shuffle fetch temp file " + file.path()) } } } @@ -443,35 +448,35 @@ final class ShuffleBlockFetcherIterator( buf.release() throwFetchFailedException(blockId, address, e) } - - input = streamWrapper(blockId, in) - // Only copy the stream if it's wrapped by compression or encryption, also the size of - // block is small (the decompressed block is smaller than maxBytesInFlight) - if (detectCorrupt && !input.eq(in) && size < maxBytesInFlight / 3) { - val originalInput = input - val out = new ChunkedByteBufferOutputStream(64 * 1024, ByteBuffer.allocate) - try { + var isStreamCopied: Boolean = false + try { + input = streamWrapper(blockId, in) + // Only copy the stream if it's wrapped by compression or encryption, also the size of + // block is small (the decompressed block is smaller than maxBytesInFlight) + if (detectCorrupt && !input.eq(in) && size < maxBytesInFlight / 3) { + isStreamCopied = true + val out = new ChunkedByteBufferOutputStream(64 * 1024, ByteBuffer.allocate) // Decompress the whole block at once to detect any corruption, which could increase // the memory usage tne potential increase the chance of OOM. // TODO: manage the memory used here, and spill it into disk in case of OOM. - Utils.copyStream(input, out) - out.close() + Utils.copyStream(input, out, closeStreams = true) input = out.toChunkedByteBuffer.toInputStream(dispose = true) - } catch { - case e: IOException => - buf.release() - if (buf.isInstanceOf[FileSegmentManagedBuffer] - || corruptedBlocks.contains(blockId)) { - throwFetchFailedException(blockId, address, e) - } else { - logWarning(s"got an corrupted block $blockId from $address, fetch again", e) - corruptedBlocks += blockId - fetchRequests += FetchRequest(address, Array((blockId, size))) - result = null - } - } finally { - // TODO: release the buf here to free memory earlier - originalInput.close() + } + } catch { + case e: IOException => + buf.release() + if (buf.isInstanceOf[FileSegmentManagedBuffer] + || corruptedBlocks.contains(blockId)) { + throwFetchFailedException(blockId, address, e) + } else { + logWarning(s"got an corrupted block $blockId from $address, fetch again", e) + corruptedBlocks += blockId + fetchRequests += FetchRequest(address, Array((blockId, size))) + result = null + } + } finally { + // TODO: release the buf here to free memory earlier + if (isStreamCopied) { in.close() } } diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala index 55eb98996266..0f74b07a6265 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala @@ -105,7 +105,7 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We val stageAttemptId = parameterAttempt.toInt val stageHeader = s"Details for Stage $stageId (Attempt $stageAttemptId)" - val stageData = parent.store + val (stageData, stageJobIds) = parent.store .asOption(parent.store.stageAttempt(stageId, 
stageAttemptId, details = false)) .getOrElse { val content = @@ -117,7 +117,8 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We val localitySummary = store.localitySummary(stageData.stageId, stageData.attemptId) - val totalTasks = taskCount(stageData) + val totalTasks = stageData.numActiveTasks + stageData.numCompleteTasks + + stageData.numFailedTasks + stageData.numKilledTasks if (totalTasks == 0) { val content =
@@ -132,7 +133,7 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We val totalTasksNumStr = if (totalTasks == storedTasks) { s"$totalTasks" } else { - s"$storedTasks, showing ${totalTasks}" + s"$totalTasks, showing $storedTasks" } val summary = @@ -182,6 +183,15 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We {Utils.bytesToString(stageData.diskBytesSpilled)} }} + {if (!stageJobIds.isEmpty) { +
<li> + <strong>Associated Job Ids: </strong> + {stageJobIds.map(jobId => {val detailUrl = "%s/jobs/job/?id=%s".format( + UIUtils.prependBaseUri(request, parent.basePath), jobId) + <a href={detailUrl}>{s"${jobId}"}</a>&nbsp;&nbsp; + })} +
</li> + }}
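Since stageAttempt now returns the stage data together with the job ids kept in StageDataWrapper, callers either keep both halves of the tuple (as the stage page above does) or drop the job ids (as StagesResource does with ._1). A minimal sketch of the two call patterns; describeStage is a hypothetical helper, not part of the patch:

import org.apache.spark.status.AppStatusStore

// Hypothetical helper showing both usages of the changed stageAttempt signature.
def describeStage(store: AppStatusStore, stageId: Int, stageAttemptId: Int): String = {
  // UI path: keep the job ids so the page can link back to the associated jobs.
  val (stageData, stageJobIds) = store.stageAttempt(stageId, stageAttemptId, details = false)
  // REST path: only the stage data is exposed, so the job ids are simply discarded.
  val withDetails = store.stageAttempt(stageId, stageAttemptId, details = true)._1
  s"${stageData.name}: jobs=${stageJobIds.mkString(",")}, completed tasks=${withDetails.numCompleteTasks}"
}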
    @@ -685,7 +695,7 @@ private[ui] class TaskDataSource( private var _tasksToShow: Seq[TaskData] = null - override def dataSize: Int = taskCount(stage) + override def dataSize: Int = store.taskCount(stage.stageId, stage.attemptId).toInt override def sliceData(from: Int, to: Int): Seq[TaskData] = { if (_tasksToShow == null) { @@ -1047,13 +1057,8 @@ private[ui] object ApiHelper { } def lastStageNameAndDescription(store: AppStatusStore, job: JobData): (String, String) = { - val stage = store.asOption(store.stageAttempt(job.stageIds.max, 0)) + val stage = store.asOption(store.stageAttempt(job.stageIds.max, 0)._1) (stage.map(_.name).getOrElse(""), stage.flatMap(_.description).getOrElse(job.name)) } - def taskCount(stageData: StageData): Int = { - stageData.numActiveTasks + stageData.numCompleteTasks + stageData.numFailedTasks + - stageData.numKilledTasks - } - } diff --git a/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala b/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala index b6c300c4778b..6c4740c00210 100644 --- a/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala +++ b/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala @@ -175,7 +175,7 @@ private[spark] object ClosureCleaner extends Logging { closure.getClass.isSynthetic && closure .getClass - .getInterfaces.exists(_.getName.equals("scala.Serializable")) + .getInterfaces.exists(_.getName == "scala.Serializable") if (isClosureCandidate) { try { @@ -285,8 +285,6 @@ private[spark] object ClosureCleaner extends Logging { innerClasses.foreach { c => logDebug(s" ${c.getName}") } logDebug(s" + outer classes: ${outerClasses.size}" ) outerClasses.foreach { c => logDebug(s" ${c.getName}") } - logDebug(s" + outer objects: ${outerObjects.size}") - outerObjects.foreach { o => logDebug(s" $o") } } // Fail fast if we detect return statements in closures @@ -318,19 +316,20 @@ private[spark] object ClosureCleaner extends Logging { if (outerPairs.nonEmpty) { val (outermostClass, outermostObject) = outerPairs.head if (isClosure(outermostClass)) { - logDebug(s" + outermost object is a closure, so we clone it: ${outerPairs.head}") + logDebug(s" + outermost object is a closure, so we clone it: ${outermostClass}") } else if (outermostClass.getName.startsWith("$line")) { // SPARK-14558: if the outermost object is a REPL line object, we should clone // and clean it as it may carray a lot of unnecessary information, // e.g. hadoop conf, spark conf, etc. - logDebug(s" + outermost object is a REPL line object, so we clone it: ${outerPairs.head}") + logDebug(s" + outermost object is a REPL line object, so we clone it:" + + s" ${outermostClass}") } else { // The closure is ultimately nested inside a class; keep the object of that // class without cloning it since we don't want to clone the user's objects. // Note that we still need to keep around the outermost object itself because // we need it to clone its child closure later (see below). - logDebug(" + outermost object is not a closure or REPL line object," + - "so do not clone it: " + outerPairs.head) + logDebug(s" + outermost object is not a closure or REPL line object," + + s" so do not clone it: ${outermostClass}") parent = outermostObject // e.g. SparkContext outerPairs = outerPairs.tail } @@ -341,7 +340,7 @@ private[spark] object ClosureCleaner extends Logging { // Clone the closure objects themselves, nulling out any fields that are not // used in the closure we're working on or any of its inner closures. 
for ((cls, obj) <- outerPairs) { - logDebug(s" + cloning the object $obj of class ${cls.getName}") + logDebug(s" + cloning instance of class ${cls.getName}") // We null out these unused references by cloning each object and then filling in all // required fields from the original object. We need the parent here because the Java // language specification requires the first constructor parameter of any closure to be @@ -351,7 +350,7 @@ private[spark] object ClosureCleaner extends Logging { // If transitive cleaning is enabled, we recursively clean any enclosing closure using // the already populated accessed fields map of the starting closure if (cleanTransitively && isClosure(clone.getClass)) { - logDebug(s" + cleaning cloned closure $clone recursively (${cls.getName})") + logDebug(s" + cleaning cloned closure recursively (${cls.getName})") // No need to check serializable here for the outer closures because we're // only interested in the serializability of the starting closure clean(clone, checkSerializable = false, cleanTransitively, accessedFields) diff --git a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala index 50c6461373de..0cd8612b8fd1 100644 --- a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala +++ b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala @@ -31,6 +31,7 @@ import org.json4s.jackson.JsonMethods._ import org.apache.spark._ import org.apache.spark.executor._ +import org.apache.spark.metrics.ExecutorMetricType import org.apache.spark.rdd.RDDOperationScope import org.apache.spark.scheduler._ import org.apache.spark.scheduler.cluster.ExecutorInfo @@ -98,6 +99,8 @@ private[spark] object JsonProtocol { logStartToJson(logStart) case metricsUpdate: SparkListenerExecutorMetricsUpdate => executorMetricsUpdateToJson(metricsUpdate) + case stageExecutorMetrics: SparkListenerStageExecutorMetrics => + stageExecutorMetricsToJson(stageExecutorMetrics) case blockUpdate: SparkListenerBlockUpdated => blockUpdateToJson(blockUpdate) case _ => parse(mapper.writeValueAsString(event)) @@ -236,6 +239,7 @@ private[spark] object JsonProtocol { def executorMetricsUpdateToJson(metricsUpdate: SparkListenerExecutorMetricsUpdate): JValue = { val execId = metricsUpdate.execId val accumUpdates = metricsUpdate.accumUpdates + val executorMetrics = metricsUpdate.executorUpdates.map(executorMetricsToJson(_)) ("Event" -> SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.metricsUpdate) ~ ("Executor ID" -> execId) ~ ("Metrics Updated" -> accumUpdates.map { case (taskId, stageId, stageAttemptId, updates) => @@ -243,7 +247,16 @@ private[spark] object JsonProtocol { ("Stage ID" -> stageId) ~ ("Stage Attempt ID" -> stageAttemptId) ~ ("Accumulator Updates" -> JArray(updates.map(accumulableInfoToJson).toList)) - }) + }) ~ + ("Executor Metrics Updated" -> executorMetrics) + } + + def stageExecutorMetricsToJson(metrics: SparkListenerStageExecutorMetrics): JValue = { + ("Event" -> SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.stageExecutorMetrics) ~ + ("Executor ID" -> metrics.execId) ~ + ("Stage ID" -> metrics.stageId) ~ + ("Stage Attempt ID" -> metrics.stageAttemptId) ~ + ("Executor Metrics" -> executorMetricsToJson(metrics.executorMetrics)) } def blockUpdateToJson(blockUpdate: SparkListenerBlockUpdated): JValue = { @@ -379,6 +392,14 @@ private[spark] object JsonProtocol { ("Updated Blocks" -> updatedBlocks) } + /** Convert executor metrics to JSON. 
*/ + def executorMetricsToJson(executorMetrics: ExecutorMetrics): JValue = { + val metrics = ExecutorMetricType.values.map{ metricType => + JField(metricType.name, executorMetrics.getMetricValue(metricType)) + } + JObject(metrics: _*) + } + def taskEndReasonToJson(taskEndReason: TaskEndReason): JValue = { val reason = Utils.getFormattedClassName(taskEndReason) val json: JObject = taskEndReason match { @@ -531,6 +552,7 @@ private[spark] object JsonProtocol { val executorRemoved = Utils.getFormattedClassName(SparkListenerExecutorRemoved) val logStart = Utils.getFormattedClassName(SparkListenerLogStart) val metricsUpdate = Utils.getFormattedClassName(SparkListenerExecutorMetricsUpdate) + val stageExecutorMetrics = Utils.getFormattedClassName(SparkListenerStageExecutorMetrics) val blockUpdate = Utils.getFormattedClassName(SparkListenerBlockUpdated) } @@ -555,6 +577,7 @@ private[spark] object JsonProtocol { case `executorRemoved` => executorRemovedFromJson(json) case `logStart` => logStartFromJson(json) case `metricsUpdate` => executorMetricsUpdateFromJson(json) + case `stageExecutorMetrics` => stageExecutorMetricsFromJson(json) case `blockUpdate` => blockUpdateFromJson(json) case other => mapper.readValue(compact(render(json)), Utils.classForName(other)) .asInstanceOf[SparkListenerEvent] @@ -585,6 +608,15 @@ private[spark] object JsonProtocol { SparkListenerTaskGettingResult(taskInfo) } + /** Extract the executor metrics from JSON. */ + def executorMetricsFromJson(json: JValue): ExecutorMetrics = { + val metrics = + ExecutorMetricType.values.map { metric => + metric.name -> jsonOption(json \ metric.name).map(_.extract[Long]).getOrElse(0L) + }.toMap + new ExecutorMetrics(metrics) + } + def taskEndFromJson(json: JValue): SparkListenerTaskEnd = { val stageId = (json \ "Stage ID").extract[Int] val stageAttemptId = @@ -691,7 +723,18 @@ private[spark] object JsonProtocol { (json \ "Accumulator Updates").extract[List[JValue]].map(accumulableInfoFromJson) (taskId, stageId, stageAttemptId, updates) } - SparkListenerExecutorMetricsUpdate(execInfo, accumUpdates) + val executorUpdates = jsonOption(json \ "Executor Metrics Updated").map { + executorUpdate => executorMetricsFromJson(executorUpdate) + } + SparkListenerExecutorMetricsUpdate(execInfo, accumUpdates, executorUpdates) + } + + def stageExecutorMetricsFromJson(json: JValue): SparkListenerStageExecutorMetrics = { + val execId = (json \ "Executor ID").extract[String] + val stageId = (json \ "Stage ID").extract[Int] + val stageAttemptId = (json \ "Stage Attempt ID").extract[Int] + val executorMetrics = executorMetricsFromJson(json \ "Executor Metrics") + SparkListenerStageExecutorMetrics(execId, stageId, stageAttemptId, executorMetrics) } def blockUpdateFromJson(json: JValue): SparkListenerBlockUpdated = { diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 15c958d3f511..93b5826f8a74 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -19,7 +19,6 @@ package org.apache.spark.util import java.io._ import java.lang.{Byte => JByte} -import java.lang.InternalError import java.lang.management.{LockInfo, ManagementFactory, MonitorInfo, ThreadInfo} import java.lang.reflect.InvocationTargetException import java.math.{MathContext, RoundingMode} @@ -240,6 +239,19 @@ private[spark] object Utils extends Logging { // scalastyle:on classforname } + /** + * Run a segment of code using a different context class 
loader in the current thread + */ + def withContextClassLoader[T](ctxClassLoader: ClassLoader)(fn: => T): T = { + val oldClassLoader = Thread.currentThread().getContextClassLoader() + try { + Thread.currentThread().setContextClassLoader(ctxClassLoader) + fn + } finally { + Thread.currentThread().setContextClassLoader(oldClassLoader) + } + } + /** * Primitive often used when writing [[java.nio.ByteBuffer]] to [[java.io.DataOutput]] */ @@ -2052,6 +2064,30 @@ private[spark] object Utils extends Logging { } } + /** + * Implements the same logic as JDK `java.lang.String#trim` by removing leading and trailing + * non-printable characters less than or equal to '\u0020' (SPACE) but preserves natural line + * delimiters according to the [[java.util.Properties]] load method. The natural line delimiters + * are removed by the JDK during load. Therefore any remaining ones have been specifically + * provided and escaped by the user, and must not be ignored. + * + * @param str the string to trim + * @return the trimmed value of str + */ + private[util] def trimExceptCRLF(str: String): String = { + val nonSpaceOrNaturalLineDelimiter: Char => Boolean = { ch => + ch > ' ' || ch == '\r' || ch == '\n' + } + + val firstPos = str.indexWhere(nonSpaceOrNaturalLineDelimiter) + val lastPos = str.lastIndexWhere(nonSpaceOrNaturalLineDelimiter) + if (firstPos >= 0 && lastPos >= 0) { + str.substring(firstPos, lastPos + 1) + } else { + "" + } + } + /** Load properties present in the given file. */ def getPropertiesFromFile(filename: String): Map[String, String] = { val file = new File(filename) @@ -2062,8 +2098,10 @@ private[spark] object Utils extends Logging { try { val properties = new Properties() properties.load(inReader) - properties.stringPropertyNames().asScala.map( - k => (k, properties.getProperty(k).trim)).toMap + properties.stringPropertyNames().asScala + .map { k => (k, trimExceptCRLF(properties.getProperty(k))) } + .toMap + } catch { case e: IOException => throw new SparkException(s"Failed when loading Spark properties from $filename", e) @@ -2698,7 +2736,7 @@ private[spark] object Utils extends Logging { } val masterScheme = new URI(masterWithoutK8sPrefix).getScheme - val resolvedURL = masterScheme.toLowerCase match { + val resolvedURL = masterScheme.toLowerCase(Locale.ROOT) match { case "https" => masterWithoutK8sPrefix case "http" => @@ -2795,6 +2833,36 @@ private[spark] object Utils extends Logging { } } } + + /** + * Regular expression matching full width characters. + * + * Derived by looking at all the 0x0000-0xFFFF (Unicode) characters, displaying them under + * Xshell, and collecting the full width ones into the ranges below. + */ + private val fullWidthRegex = ("""[""" + + // scalastyle:off nonascii + """\u1100-\u115F""" + + """\u2E80-\uA4CF""" + + """\uAC00-\uD7A3""" + + """\uF900-\uFAFF""" + + """\uFE10-\uFE19""" + + """\uFE30-\uFE6F""" + + """\uFF00-\uFF60""" + + """\uFFE0-\uFFE6""" + + // scalastyle:on nonascii + """]""").r + + /** + * Return the number of half widths in a given string. Note that a full width character + * occupies two half widths. + * + * For a string consisting of 1 million characters, the execution of this method requires + * about 50ms.
+ */ + def stringHalfWidth(str: String): Int = { + if (str == null) 0 else str.length + fullWidthRegex.findAllIn(str).size + } } private[util] object CallerContext extends Logging { diff --git a/core/src/main/scala/org/apache/spark/util/io/ChunkedByteBuffer.scala b/core/src/main/scala/org/apache/spark/util/io/ChunkedByteBuffer.scala index 39f050f6ca5a..4aa8d45ec740 100644 --- a/core/src/main/scala/org/apache/spark/util/io/ChunkedByteBuffer.scala +++ b/core/src/main/scala/org/apache/spark/util/io/ChunkedByteBuffer.scala @@ -19,17 +19,16 @@ package org.apache.spark.util.io import java.io.{File, FileInputStream, InputStream} import java.nio.ByteBuffer -import java.nio.channels.{FileChannel, WritableByteChannel} -import java.nio.file.StandardOpenOption - -import scala.collection.mutable.ListBuffer +import java.nio.channels.WritableByteChannel +import com.google.common.io.ByteStreams import com.google.common.primitives.UnsignedBytes +import org.apache.commons.io.IOUtils import org.apache.spark.SparkEnv import org.apache.spark.internal.config import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer} -import org.apache.spark.network.util.ByteArrayWritableChannel +import org.apache.spark.network.util.{ByteArrayWritableChannel, LimitedInputStream} import org.apache.spark.storage.StorageUtils import org.apache.spark.util.Utils @@ -175,30 +174,36 @@ object ChunkedByteBuffer { def fromManagedBuffer(data: ManagedBuffer, maxChunkSize: Int): ChunkedByteBuffer = { data match { case f: FileSegmentManagedBuffer => - map(f.getFile, maxChunkSize, f.getOffset, f.getLength) + fromFile(f.getFile, maxChunkSize, f.getOffset, f.getLength) case other => new ChunkedByteBuffer(other.nioByteBuffer()) } } - def map(file: File, maxChunkSize: Int): ChunkedByteBuffer = { - map(file, maxChunkSize, 0, file.length()) + def fromFile(file: File, maxChunkSize: Int): ChunkedByteBuffer = { + fromFile(file, maxChunkSize, 0, file.length()) } - def map(file: File, maxChunkSize: Int, offset: Long, length: Long): ChunkedByteBuffer = { - Utils.tryWithResource(FileChannel.open(file.toPath, StandardOpenOption.READ)) { channel => - var remaining = length - var pos = offset - val chunks = new ListBuffer[ByteBuffer]() - while (remaining > 0) { - val chunkSize = math.min(remaining, maxChunkSize) - val chunk = channel.map(FileChannel.MapMode.READ_ONLY, pos, chunkSize) - pos += chunkSize - remaining -= chunkSize - chunks += chunk - } - new ChunkedByteBuffer(chunks.toArray) + private def fromFile( + file: File, + maxChunkSize: Int, + offset: Long, + length: Long): ChunkedByteBuffer = { + // We do *not* memory map the file, because we may end up putting this into the memory store, + // and Spark currently does not expect memory-mapped buffers in the memory store; that conflicts + // with other parts that manage the lifecycle of buffers and dispose of them. See SPARK-25422.
+ val is = new FileInputStream(file) + ByteStreams.skipFully(is, offset) + val in = new LimitedInputStream(is, length) + val chunkSize = math.min(maxChunkSize, length).toInt + val out = new ChunkedByteBufferOutputStream(chunkSize, ByteBuffer.allocate _) + Utils.tryWithSafeFinally { + IOUtils.copy(in, out) + } { + in.close() + out.close() } + out.toChunkedByteBuffer } } diff --git a/core/src/test/java/org/apache/spark/ExecutorPluginSuite.java b/core/src/test/java/org/apache/spark/ExecutorPluginSuite.java new file mode 100644 index 000000000000..686eb28010c6 --- /dev/null +++ b/core/src/test/java/org/apache/spark/ExecutorPluginSuite.java @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark; + +import org.apache.spark.api.java.JavaSparkContext; + +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import static org.junit.Assert.*; + +public class ExecutorPluginSuite { + private static final String EXECUTOR_PLUGIN_CONF_NAME = "spark.executor.plugins"; + private static final String testBadPluginName = TestBadShutdownPlugin.class.getName(); + private static final String testPluginName = TestExecutorPlugin.class.getName(); + private static final String testSecondPluginName = TestSecondPlugin.class.getName(); + + // Static value modified by testing plugins to ensure plugins loaded correctly. + public static int numSuccessfulPlugins = 0; + + // Static value modified by testing plugins to verify plugins shut down properly. 
+ public static int numSuccessfulTerminations = 0; + + private JavaSparkContext sc; + + @Before + public void setUp() { + sc = null; + numSuccessfulPlugins = 0; + numSuccessfulTerminations = 0; + } + + @After + public void tearDown() { + if (sc != null) { + sc.stop(); + sc = null; + } + } + + private SparkConf initializeSparkConf(String pluginNames) { + return new SparkConf() + .setMaster("local") + .setAppName("test") + .set(EXECUTOR_PLUGIN_CONF_NAME, pluginNames); + } + + @Test + public void testPluginClassDoesNotExist() { + SparkConf conf = initializeSparkConf("nonexistant.plugin"); + try { + sc = new JavaSparkContext(conf); + fail("No exception thrown for nonexistant plugin"); + } catch (Exception e) { + // We cannot catch ClassNotFoundException directly because Java doesn't think it'll be thrown + assertTrue(e.toString().startsWith("java.lang.ClassNotFoundException")); + } + } + + @Test + public void testAddPlugin() throws InterruptedException { + // Load the sample TestExecutorPlugin, which will change the value of numSuccessfulPlugins + SparkConf conf = initializeSparkConf(testPluginName); + sc = new JavaSparkContext(conf); + assertEquals(1, numSuccessfulPlugins); + sc.stop(); + sc = null; + assertEquals(1, numSuccessfulTerminations); + } + + @Test + public void testAddMultiplePlugins() throws InterruptedException { + // Load two plugins and verify they both execute. + SparkConf conf = initializeSparkConf(testPluginName + "," + testSecondPluginName); + sc = new JavaSparkContext(conf); + assertEquals(2, numSuccessfulPlugins); + sc.stop(); + sc = null; + assertEquals(2, numSuccessfulTerminations); + } + + @Test + public void testPluginShutdownWithException() { + // Verify an exception in one plugin shutdown does not affect the others + String pluginNames = testPluginName + "," + testBadPluginName + "," + testPluginName; + SparkConf conf = initializeSparkConf(pluginNames); + sc = new JavaSparkContext(conf); + assertEquals(3, numSuccessfulPlugins); + sc.stop(); + sc = null; + assertEquals(2, numSuccessfulTerminations); + } + + public static class TestExecutorPlugin implements ExecutorPlugin { + public void init() { + ExecutorPluginSuite.numSuccessfulPlugins++; + } + + public void shutdown() { + ExecutorPluginSuite.numSuccessfulTerminations++; + } + } + + public static class TestSecondPlugin implements ExecutorPlugin { + public void init() { + ExecutorPluginSuite.numSuccessfulPlugins++; + } + + public void shutdown() { + ExecutorPluginSuite.numSuccessfulTerminations++; + } + } + + public static class TestBadShutdownPlugin implements ExecutorPlugin { + public void init() { + ExecutorPluginSuite.numSuccessfulPlugins++; + } + + public void shutdown() { + throw new RuntimeException("This plugin will fail to cleanly shut down"); + } + } +} diff --git a/core/src/test/java/org/apache/spark/JavaJdbcRDDSuite.java b/core/src/test/java/org/apache/spark/JavaJdbcRDDSuite.java index a6589d289814..40a7c9486ae5 100644 --- a/core/src/test/java/org/apache/spark/JavaJdbcRDDSuite.java +++ b/core/src/test/java/org/apache/spark/JavaJdbcRDDSuite.java @@ -39,30 +39,28 @@ public void setUp() throws ClassNotFoundException, SQLException { sc = new JavaSparkContext("local", "JavaAPISuite"); Class.forName("org.apache.derby.jdbc.EmbeddedDriver"); - Connection connection = - DriverManager.getConnection("jdbc:derby:target/JavaJdbcRDDSuiteDb;create=true"); - try { - Statement create = connection.createStatement(); - create.execute( - "CREATE TABLE FOO(" + - "ID INTEGER NOT NULL GENERATED ALWAYS AS IDENTITY (START WITH 
1, INCREMENT BY 1)," + - "DATA INTEGER)"); - create.close(); + try (Connection connection = DriverManager.getConnection( + "jdbc:derby:target/JavaJdbcRDDSuiteDb;create=true")) { + + try (Statement create = connection.createStatement()) { + create.execute( + "CREATE TABLE FOO(ID INTEGER NOT NULL GENERATED ALWAYS AS IDENTITY" + + " (START WITH 1, INCREMENT BY 1), DATA INTEGER)"); + } - PreparedStatement insert = connection.prepareStatement("INSERT INTO FOO(DATA) VALUES(?)"); - for (int i = 1; i <= 100; i++) { - insert.setInt(1, i * 2); - insert.executeUpdate(); + try (PreparedStatement insert = connection.prepareStatement( + "INSERT INTO FOO(DATA) VALUES(?)")) { + for (int i = 1; i <= 100; i++) { + insert.setInt(1, i * 2); + insert.executeUpdate(); + } } - insert.close(); } catch (SQLException e) { // If table doesn't exist... if (e.getSQLState().compareTo("X0Y32") != 0) { throw e; } - } finally { - connection.close(); } } diff --git a/core/src/test/java/org/apache/spark/memory/TaskMemoryManagerSuite.java b/core/src/test/java/org/apache/spark/memory/TaskMemoryManagerSuite.java index d7d2d0b012bd..a0664b30d6cc 100644 --- a/core/src/test/java/org/apache/spark/memory/TaskMemoryManagerSuite.java +++ b/core/src/test/java/org/apache/spark/memory/TaskMemoryManagerSuite.java @@ -76,7 +76,7 @@ public void freeingPageSetsPageNumberToSpecialConstant() { final MemoryConsumer c = new TestMemoryConsumer(manager, MemoryMode.ON_HEAP); final MemoryBlock dataPage = manager.allocatePage(256, c); c.freePage(dataPage); - Assert.assertEquals(MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER, dataPage.getPageNumber()); + Assert.assertEquals(MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER, dataPage.pageNumber); } @Test(expected = AssertionError.class) diff --git a/core/src/test/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriterSuite.java b/core/src/test/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriterSuite.java index faa70f23b0ac..a07d0e84ea85 100644 --- a/core/src/test/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriterSuite.java +++ b/core/src/test/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriterSuite.java @@ -186,14 +186,14 @@ private List> readRecordsFromFile() throws IOException { if (conf.getBoolean("spark.shuffle.compress", true)) { in = CompressionCodec$.MODULE$.createCodec(conf).compressedInputStream(in); } - DeserializationStream recordsStream = serializer.newInstance().deserializeStream(in); - Iterator> records = recordsStream.asKeyValueIterator(); - while (records.hasNext()) { - Tuple2 record = records.next(); - assertEquals(i, hashPartitioner.getPartition(record._1())); - recordsList.add(record); + try (DeserializationStream recordsStream = serializer.newInstance().deserializeStream(in)) { + Iterator> records = recordsStream.asKeyValueIterator(); + while (records.hasNext()) { + Tuple2 record = records.next(); + assertEquals(i, hashPartitioner.getPartition(record._1())); + recordsList.add(record); + } } - recordsStream.close(); startOffset += partitionSize; } } @@ -233,7 +233,6 @@ public void writeEmptyIterator() throws Exception { writer.write(Iterators.emptyIterator()); final Option mapStatus = writer.stop(true); assertTrue(mapStatus.isDefined()); - assertEquals(0, mapStatus.get().numberOfOutput()); assertTrue(mergedOutputFile.exists()); assertArrayEquals(new long[NUM_PARTITITONS], partitionSizesInMergedFile); assertEquals(0, taskMetrics.shuffleWriteMetrics().recordsWritten()); @@ -253,7 +252,6 @@ public void writeWithoutSpilling() throws Exception { writer.write(dataToWrite.iterator()); final 
Option mapStatus = writer.stop(true); assertTrue(mapStatus.isDefined()); - assertEquals(NUM_PARTITITONS, mapStatus.get().numberOfOutput()); assertTrue(mergedOutputFile.exists()); long sumOfPartitionSizes = 0; diff --git a/core/src/test/java/test/org/apache/spark/JavaAPISuite.java b/core/src/test/java/test/org/apache/spark/JavaAPISuite.java index 01b5fb7b4668..3992ab7049bd 100644 --- a/core/src/test/java/test/org/apache/spark/JavaAPISuite.java +++ b/core/src/test/java/test/org/apache/spark/JavaAPISuite.java @@ -997,10 +997,10 @@ public void binaryFiles() throws Exception { FileOutputStream fos1 = new FileOutputStream(file1); - FileChannel channel1 = fos1.getChannel(); - ByteBuffer bbuf = ByteBuffer.wrap(content1); - channel1.write(bbuf); - channel1.close(); + try (FileChannel channel1 = fos1.getChannel()) { + ByteBuffer bbuf = ByteBuffer.wrap(content1); + channel1.write(bbuf); + } JavaPairRDD readRDD = sc.binaryFiles(tempDirName, 3); List> result = readRDD.collect(); for (Tuple2 res : result) { @@ -1018,10 +1018,10 @@ public void binaryFilesCaching() throws Exception { FileOutputStream fos1 = new FileOutputStream(file1); - FileChannel channel1 = fos1.getChannel(); - ByteBuffer bbuf = ByteBuffer.wrap(content1); - channel1.write(bbuf); - channel1.close(); + try (FileChannel channel1 = fos1.getChannel()) { + ByteBuffer bbuf = ByteBuffer.wrap(content1); + channel1.write(bbuf); + } JavaPairRDD readRDD = sc.binaryFiles(tempDirName).cache(); readRDD.foreach(pair -> pair._2().toArray()); // force the file to read @@ -1042,13 +1042,12 @@ public void binaryRecords() throws Exception { FileOutputStream fos1 = new FileOutputStream(file1); - FileChannel channel1 = fos1.getChannel(); - - for (int i = 0; i < numOfCopies; i++) { - ByteBuffer bbuf = ByteBuffer.wrap(content1); - channel1.write(bbuf); + try (FileChannel channel1 = fos1.getChannel()) { + for (int i = 0; i < numOfCopies; i++) { + ByteBuffer bbuf = ByteBuffer.wrap(content1); + channel1.write(bbuf); + } } - channel1.close(); JavaRDD readRDD = sc.binaryRecords(tempDirName, content1.length); assertEquals(numOfCopies,readRDD.count()); diff --git a/core/src/test/resources/HistoryServerExpectations/application_list_json_expectation.json b/core/src/test/resources/HistoryServerExpectations/application_list_json_expectation.json index 4fecf84db65a..eea6f595efd2 100644 --- a/core/src/test/resources/HistoryServerExpectations/application_list_json_expectation.json +++ b/core/src/test/resources/HistoryServerExpectations/application_list_json_expectation.json @@ -1,4 +1,19 @@ [ { + "id" : "application_1506645932520_24630151", + "name" : "Spark shell", + "attempts" : [ { + "startTime" : "2018-04-19T23:54:42.734GMT", + "endTime" : "2018-04-19T23:56:29.134GMT", + "lastUpdated" : "", + "duration" : 106400, + "sparkUser" : "edlu", + "completed" : true, + "appSparkVersion" : "2.4.0-SNAPSHOT", + "lastUpdatedEpoch" : 0, + "startTimeEpoch" : 1524182082734, + "endTimeEpoch" : 1524182189134 + } ] +}, { "id" : "application_1516285256255_0012", "name" : "Spark shell", "attempts" : [ { diff --git a/core/src/test/resources/HistoryServerExpectations/completed_app_list_json_expectation.json b/core/src/test/resources/HistoryServerExpectations/completed_app_list_json_expectation.json index 4fecf84db65a..7bc7f31be097 100644 --- a/core/src/test/resources/HistoryServerExpectations/completed_app_list_json_expectation.json +++ b/core/src/test/resources/HistoryServerExpectations/completed_app_list_json_expectation.json @@ -1,4 +1,19 @@ [ { + "id" : 
"application_1506645932520_24630151", + "name" : "Spark shell", + "attempts" : [ { + "startTime" : "2018-04-19T23:54:42.734GMT", + "endTime" : "2018-04-19T23:56:29.134GMT", + "lastUpdated" : "", + "duration" : 106400, + "sparkUser" : "edlu", + "completed" : true, + "appSparkVersion" : "2.4.0-SNAPSHOT", + "startTimeEpoch" : 1524182082734, + "endTimeEpoch" : 1524182189134, + "lastUpdatedEpoch" : 0 + } ] +}, { "id" : "application_1516285256255_0012", "name" : "Spark shell", "attempts" : [ { diff --git a/core/src/test/resources/HistoryServerExpectations/executor_list_with_executor_metrics_json_expectation.json b/core/src/test/resources/HistoryServerExpectations/executor_list_with_executor_metrics_json_expectation.json new file mode 100644 index 000000000000..9bf2086cc8e7 --- /dev/null +++ b/core/src/test/resources/HistoryServerExpectations/executor_list_with_executor_metrics_json_expectation.json @@ -0,0 +1,314 @@ +[ { + "id" : "driver", + "hostPort" : "node0033.grid.company.com:60749", + "isActive" : true, + "rddBlocks" : 0, + "memoryUsed" : 0, + "diskUsed" : 0, + "totalCores" : 0, + "maxTasks" : 0, + "activeTasks" : 0, + "failedTasks" : 0, + "completedTasks" : 0, + "totalTasks" : 0, + "totalDuration" : 0, + "totalGCTime" : 0, + "totalInputBytes" : 0, + "totalShuffleRead" : 0, + "totalShuffleWrite" : 0, + "isBlacklisted" : false, + "maxMemory" : 1043437977, + "addTime" : "2018-04-19T23:55:05.107GMT", + "executorLogs" : { }, + "memoryMetrics" : { + "usedOnHeapStorageMemory" : 0, + "usedOffHeapStorageMemory" : 0, + "totalOnHeapStorageMemory" : 1043437977, + "totalOffHeapStorageMemory" : 0 + }, + "blacklistedInStages" : [ ], + "peakMemoryMetrics" : { + "OnHeapStorageMemory" : 905801, + "JVMOffHeapMemory" : 205304696, + "OffHeapExecutionMemory" : 0, + "OnHeapUnifiedMemory" : 905801, + "OnHeapExecutionMemory" : 0, + "OffHeapUnifiedMemory" : 0, + "DirectPoolMemory" : 397602, + "MappedPoolMemory" : 0, + "JVMHeapMemory" : 629553808, + "OffHeapStorageMemory" : 0 + } +}, { + "id" : "7", + "hostPort" : "node6340.grid.company.com:5933", + "isActive" : true, + "rddBlocks" : 0, + "memoryUsed" : 0, + "diskUsed" : 0, + "totalCores" : 1, + "maxTasks" : 1, + "activeTasks" : 0, + "failedTasks" : 0, + "completedTasks" : 0, + "totalTasks" : 0, + "totalDuration" : 0, + "totalGCTime" : 0, + "totalInputBytes" : 0, + "totalShuffleRead" : 0, + "totalShuffleWrite" : 0, + "isBlacklisted" : false, + "maxMemory" : 956615884, + "addTime" : "2018-04-19T23:55:49.826GMT", + "executorLogs" : { + "stdout" : "http://node6340.grid.company.com:8042/node/containerlogs/container_e05_1523494505172_1552404_01_000009/edlu/stdout?start=-4096", + "stderr" : "http://node6340.grid.company.com:8042/node/containerlogs/container_e05_1523494505172_1552404_01_000009/edlu/stderr?start=-4096" + }, + "memoryMetrics" : { + "usedOnHeapStorageMemory" : 0, + "usedOffHeapStorageMemory" : 0, + "totalOnHeapStorageMemory" : 956615884, + "totalOffHeapStorageMemory" : 0 + }, + "blacklistedInStages" : [ ] +}, { + "id" : "6", + "hostPort" : "node6644.grid.company.com:8445", + "isActive" : true, + "rddBlocks" : 0, + "memoryUsed" : 0, + "diskUsed" : 0, + "totalCores" : 1, + "maxTasks" : 1, + "activeTasks" : 0, + "failedTasks" : 0, + "completedTasks" : 0, + "totalTasks" : 0, + "totalDuration" : 0, + "totalGCTime" : 0, + "totalInputBytes" : 0, + "totalShuffleRead" : 0, + "totalShuffleWrite" : 0, + "isBlacklisted" : false, + "maxMemory" : 956615884, + "addTime" : "2018-04-19T23:55:47.549GMT", + "executorLogs" : { + "stdout" : 
"http://node6644.grid.company.com:8042/node/containerlogs/container_e05_1523494505172_1552404_01_000008/edlu/stdout?start=-4096", + "stderr" : "http://node6644.grid.company.com:8042/node/containerlogs/container_e05_1523494505172_1552404_01_000008/edlu/stderr?start=-4096" + }, + "memoryMetrics" : { + "usedOnHeapStorageMemory" : 0, + "usedOffHeapStorageMemory" : 0, + "totalOnHeapStorageMemory" : 956615884, + "totalOffHeapStorageMemory" : 0 + }, + "blacklistedInStages" : [ ] +}, { + "id" : "5", + "hostPort" : "node2477.grid.company.com:20123", + "isActive" : true, + "rddBlocks" : 0, + "memoryUsed" : 0, + "diskUsed" : 0, + "totalCores" : 1, + "maxTasks" : 1, + "activeTasks" : 0, + "failedTasks" : 0, + "completedTasks" : 1, + "totalTasks" : 1, + "totalDuration" : 9252, + "totalGCTime" : 920, + "totalInputBytes" : 36838295, + "totalShuffleRead" : 0, + "totalShuffleWrite" : 355051, + "isBlacklisted" : false, + "maxMemory" : 956615884, + "addTime" : "2018-04-19T23:55:43.160GMT", + "executorLogs" : { + "stdout" : "http://node2477.grid.company.com:8042/node/containerlogs/container_e05_1523494505172_1552404_01_000007/edlu/stdout?start=-4096", + "stderr" : "http://node2477.grid.company.com:8042/node/containerlogs/container_e05_1523494505172_1552404_01_000007/edlu/stderr?start=-4096" + }, + "memoryMetrics" : { + "usedOnHeapStorageMemory" : 0, + "usedOffHeapStorageMemory" : 0, + "totalOnHeapStorageMemory" : 956615884, + "totalOffHeapStorageMemory" : 0 + }, + "blacklistedInStages" : [ ] +}, { + "id" : "4", + "hostPort" : "node4243.grid.company.com:16084", + "isActive" : true, + "rddBlocks" : 0, + "memoryUsed" : 0, + "diskUsed" : 0, + "totalCores" : 1, + "maxTasks" : 1, + "activeTasks" : 0, + "failedTasks" : 0, + "completedTasks" : 3, + "totalTasks" : 3, + "totalDuration" : 15645, + "totalGCTime" : 405, + "totalInputBytes" : 87272855, + "totalShuffleRead" : 438675, + "totalShuffleWrite" : 26773039, + "isBlacklisted" : false, + "maxMemory" : 956615884, + "addTime" : "2018-04-19T23:55:12.278GMT", + "executorLogs" : { + "stdout" : "http://node4243.grid.company.com:8042/node/containerlogs/container_e05_1523494505172_1552404_01_000006/edlu/stdout?start=-4096", + "stderr" : "http://node4243.grid.company.com:8042/node/containerlogs/container_e05_1523494505172_1552404_01_000006/edlu/stderr?start=-4096" + }, + "memoryMetrics" : { + "usedOnHeapStorageMemory" : 0, + "usedOffHeapStorageMemory" : 0, + "totalOnHeapStorageMemory" : 956615884, + "totalOffHeapStorageMemory" : 0 + }, + "blacklistedInStages" : [ ], + "peakMemoryMetrics" : { + "OnHeapStorageMemory" : 63104457, + "JVMOffHeapMemory" : 95657456, + "OffHeapExecutionMemory" : 0, + "OnHeapUnifiedMemory" : 100853193, + "OnHeapExecutionMemory" : 37748736, + "OffHeapUnifiedMemory" : 0, + "DirectPoolMemory" : 126261, + "MappedPoolMemory" : 0, + "JVMHeapMemory" : 518613056, + "OffHeapStorageMemory" : 0 + } +}, { + "id" : "3", + "hostPort" : "node0998.grid.company.com:45265", + "isActive" : true, + "rddBlocks" : 0, + "memoryUsed" : 0, + "diskUsed" : 0, + "totalCores" : 1, + "maxTasks" : 1, + "activeTasks" : 0, + "failedTasks" : 0, + "completedTasks" : 1, + "totalTasks" : 1, + "totalDuration" : 14491, + "totalGCTime" : 342, + "totalInputBytes" : 50409514, + "totalShuffleRead" : 0, + "totalShuffleWrite" : 31362123, + "isBlacklisted" : false, + "maxMemory" : 956615884, + "addTime" : "2018-04-19T23:55:12.088GMT", + "executorLogs" : { + "stdout" : "http://node0998.grid.company.com:8042/node/containerlogs/container_e05_1523494505172_1552404_01_000005/edlu/stdout?start=-4096", 
+ "stderr" : "http://node0998.grid.company.com:8042/node/containerlogs/container_e05_1523494505172_1552404_01_000005/edlu/stderr?start=-4096" + }, + "memoryMetrics" : { + "usedOnHeapStorageMemory" : 0, + "usedOffHeapStorageMemory" : 0, + "totalOnHeapStorageMemory" : 956615884, + "totalOffHeapStorageMemory" : 0 + }, + "blacklistedInStages" : [ ], + "peakMemoryMetrics" : { + "OnHeapStorageMemory" : 69535048, + "JVMOffHeapMemory" : 90709624, + "OffHeapExecutionMemory" : 0, + "OnHeapUnifiedMemory" : 69535048, + "OnHeapExecutionMemory" : 0, + "OffHeapUnifiedMemory" : 0, + "DirectPoolMemory" : 87796, + "MappedPoolMemory" : 0, + "JVMHeapMemory" : 726805712, + "OffHeapStorageMemory" : 0 + } +}, { + "id" : "2", + "hostPort" : "node4045.grid.company.com:29262", + "isActive" : true, + "rddBlocks" : 0, + "memoryUsed" : 0, + "diskUsed" : 0, + "totalCores" : 1, + "maxTasks" : 1, + "activeTasks" : 0, + "failedTasks" : 0, + "completedTasks" : 1, + "totalTasks" : 1, + "totalDuration" : 14113, + "totalGCTime" : 326, + "totalInputBytes" : 50423423, + "totalShuffleRead" : 0, + "totalShuffleWrite" : 22950296, + "isBlacklisted" : false, + "maxMemory" : 956615884, + "addTime" : "2018-04-19T23:55:12.471GMT", + "executorLogs" : { + "stdout" : "http://node4045.grid.company.com:8042/node/containerlogs/container_e05_1523494505172_1552404_01_000004/edlu/stdout?start=-4096", + "stderr" : "http://node4045.grid.company.com:8042/node/containerlogs/container_e05_1523494505172_1552404_01_000004/edlu/stderr?start=-4096" + }, + "memoryMetrics" : { + "usedOnHeapStorageMemory" : 0, + "usedOffHeapStorageMemory" : 0, + "totalOnHeapStorageMemory" : 956615884, + "totalOffHeapStorageMemory" : 0 + }, + "blacklistedInStages" : [ ], + "peakMemoryMetrics" : { + "OnHeapStorageMemory" : 58468944, + "JVMOffHeapMemory" : 91208368, + "OffHeapExecutionMemory" : 0, + "OnHeapUnifiedMemory" : 58468944, + "OnHeapExecutionMemory" : 0, + "OffHeapUnifiedMemory" : 0, + "DirectPoolMemory" : 87796, + "MappedPoolMemory" : 0, + "JVMHeapMemory" : 595946552, + "OffHeapStorageMemory" : 0 + } +}, { + "id" : "1", + "hostPort" : "node1404.grid.company.com:34043", + "isActive" : true, + "rddBlocks" : 0, + "memoryUsed" : 0, + "diskUsed" : 0, + "totalCores" : 1, + "maxTasks" : 1, + "activeTasks" : 0, + "failedTasks" : 0, + "completedTasks" : 3, + "totalTasks" : 3, + "totalDuration" : 15665, + "totalGCTime" : 471, + "totalInputBytes" : 98905018, + "totalShuffleRead" : 0, + "totalShuffleWrite" : 20594744, + "isBlacklisted" : false, + "maxMemory" : 956615884, + "addTime" : "2018-04-19T23:55:11.695GMT", + "executorLogs" : { + "stdout" : "http://node1404.grid.company.com:8042/node/containerlogs/container_e05_1523494505172_1552404_01_000002/edlu/stdout?start=-4096", + "stderr" : "http://node1404.grid.company.com:8042/node/containerlogs/container_e05_1523494505172_1552404_01_000002/edlu/stderr?start=-4096" + }, + "memoryMetrics" : { + "usedOnHeapStorageMemory" : 0, + "usedOffHeapStorageMemory" : 0, + "totalOnHeapStorageMemory" : 956615884, + "totalOffHeapStorageMemory" : 0 + }, + "blacklistedInStages" : [ ], + "peakMemoryMetrics" : { + "OnHeapStorageMemory" : 47962185, + "JVMOffHeapMemory" : 100519936, + "OffHeapExecutionMemory" : 0, + "OnHeapUnifiedMemory" : 47962185, + "OnHeapExecutionMemory" : 0, + "OffHeapUnifiedMemory" : 0, + "DirectPoolMemory" : 98230, + "MappedPoolMemory" : 0, + "JVMHeapMemory" : 755008624, + "OffHeapStorageMemory" : 0 + } +} ] diff --git a/core/src/test/resources/HistoryServerExpectations/limit_app_list_json_expectation.json 
b/core/src/test/resources/HistoryServerExpectations/limit_app_list_json_expectation.json index 79950b0dc648..9e1e65a35881 100644 --- a/core/src/test/resources/HistoryServerExpectations/limit_app_list_json_expectation.json +++ b/core/src/test/resources/HistoryServerExpectations/limit_app_list_json_expectation.json @@ -1,4 +1,19 @@ [ { + "id" : "application_1506645932520_24630151", + "name" : "Spark shell", + "attempts" : [ { + "startTime" : "2018-04-19T23:54:42.734GMT", + "endTime" : "2018-04-19T23:56:29.134GMT", + "lastUpdated" : "", + "duration" : 106400, + "sparkUser" : "edlu", + "completed" : true, + "appSparkVersion" : "2.4.0-SNAPSHOT", + "lastUpdatedEpoch" : 0, + "startTimeEpoch" : 1524182082734, + "endTimeEpoch" : 1524182189134 + } ] +}, { "id" : "application_1516285256255_0012", "name" : "Spark shell", "attempts" : [ { @@ -28,19 +43,4 @@ "startTimeEpoch" : 1515492942372, "endTimeEpoch" : 1515493477606 } ] -}, { - "id" : "app-20161116163331-0000", - "name" : "Spark shell", - "attempts" : [ { - "startTime" : "2016-11-16T22:33:29.916GMT", - "endTime" : "2016-11-16T22:33:40.587GMT", - "lastUpdated" : "", - "duration" : 10671, - "sparkUser" : "jose", - "completed" : true, - "appSparkVersion" : "2.1.0-SNAPSHOT", - "lastUpdatedEpoch" : 0, - "startTimeEpoch" : 1479335609916, - "endTimeEpoch" : 1479335620587 - } ] } ] diff --git a/core/src/test/resources/HistoryServerExpectations/minDate_app_list_json_expectation.json b/core/src/test/resources/HistoryServerExpectations/minDate_app_list_json_expectation.json index 7d60977dcd4f..28c6bf1b3e01 100644 --- a/core/src/test/resources/HistoryServerExpectations/minDate_app_list_json_expectation.json +++ b/core/src/test/resources/HistoryServerExpectations/minDate_app_list_json_expectation.json @@ -1,4 +1,19 @@ [ { + "id" : "application_1506645932520_24630151", + "name" : "Spark shell", + "attempts" : [ { + "startTime" : "2018-04-19T23:54:42.734GMT", + "endTime" : "2018-04-19T23:56:29.134GMT", + "lastUpdated" : "", + "duration" : 106400, + "sparkUser" : "edlu", + "completed" : true, + "appSparkVersion" : "2.4.0-SNAPSHOT", + "lastUpdatedEpoch" : 0, + "startTimeEpoch" : 1524182082734, + "endTimeEpoch" : 1524182189134 + } ] +}, { "id" : "application_1516285256255_0012", "name" : "Spark shell", "attempts" : [ { diff --git a/core/src/test/resources/HistoryServerExpectations/minEndDate_app_list_json_expectation.json b/core/src/test/resources/HistoryServerExpectations/minEndDate_app_list_json_expectation.json index dfbfd8aedcc2..f547b79f47e1 100644 --- a/core/src/test/resources/HistoryServerExpectations/minEndDate_app_list_json_expectation.json +++ b/core/src/test/resources/HistoryServerExpectations/minEndDate_app_list_json_expectation.json @@ -1,4 +1,19 @@ [ { + "id" : "application_1506645932520_24630151", + "name" : "Spark shell", + "attempts" : [ { + "startTime" : "2018-04-19T23:54:42.734GMT", + "endTime" : "2018-04-19T23:56:29.134GMT", + "lastUpdated" : "", + "duration" : 106400, + "sparkUser" : "edlu", + "completed" : true, + "appSparkVersion" : "2.4.0-SNAPSHOT", + "lastUpdatedEpoch" : 0, + "startTimeEpoch" : 1524182082734, + "endTimeEpoch" : 1524182189134 + } ] +}, { "id" : "application_1516285256255_0012", "name" : "Spark shell", "attempts" : [ { @@ -101,4 +116,4 @@ "startTimeEpoch" : 1430917380880, "endTimeEpoch" : 1430917380890 } ] -} ] \ No newline at end of file +} ] diff --git a/core/src/test/resources/spark-events/application_1506645932520_24630151 b/core/src/test/resources/spark-events/application_1506645932520_24630151 new file mode 100644 index 
000000000000..c48ed741c56e --- /dev/null +++ b/core/src/test/resources/spark-events/application_1506645932520_24630151 @@ -0,0 +1,63 @@ +{"Event":"SparkListenerLogStart","Spark Version":"2.4.0-SNAPSHOT"} +{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"driver","Host":"node0033.grid.company.com","Port":60749},"Maximum Memory":1043437977,"Timestamp":1524182105107,"Maximum Onheap Memory":1043437977,"Maximum Offheap Memory":0} +{"Event":"SparkListenerEnvironmentUpdate","JVM Information":{"Java Home":"/usr/java/jdk1.8.0_31/jre","Java Version":"1.8.0_31 (Oracle Corporation)","Scala Version":"version 2.11.8"},"Spark Properties":{"spark.jars.ivySettings":"/export/apps/spark/commonconf/ivysettings.xml","spark.serializer":"org.apache.spark.serializer.KryoSerializer","spark.driver.host":"node0033.grid.company.com","spark.dynamicAllocation.sustainedSchedulerBacklogTimeout":"5","spark.eventLog.enabled":"true","spark.ui.port":"0","spark.driver.port":"57705","spark.shuffle.service.enabled":"true","spark.ui.acls.enable":"true","spark.reducer.maxSizeInFlight":"48m","spark.yarn.queue":"spark_default","spark.repl.class.uri":"spark://node0033.grid.company.com:57705/classes","spark.jars":"","spark.yarn.historyServer.address":"clustersh01.grid.company.com:18080","spark.memoryOverhead.multiplier.percent":"10","spark.repl.class.outputDir":"/grid/a/mapred/tmp/spark-21b68b4b-c1db-460e-a228-b87545d870f1/repl-58778a76-04c1-434d-bfb7-9a9b83afe718","spark.dynamicAllocation.cachedExecutorIdleTimeout":"1200","spark.yarn.access.namenodes":"hdfs://clusternn02.grid.company.com:9000","spark.app.name":"Spark shell","spark.dynamicAllocation.schedulerBacklogTimeout":"5","spark.yarn.security.credentials.hive.enabled":"false","spark.yarn.am.cores":"1","spark.memoryOverhead.min":"384","spark.scheduler.mode":"FIFO","spark.driver.memory":"2G","spark.executor.instances":"4","spark.isolated.classloader.additional.classes.prefix":"com_company_","spark.logConf":"true","spark.ui.showConsoleProgress":"true","spark.user.priority.jars":"*********(redacted)","spark.isolated.classloader":"true","spark.sql.sources.schemaStringLengthThreshold":"40000","spark.yarn.secondary.jars":"spark-avro_2.11-3.2.0.21.jar,grid-topology-1.0.jar","spark.reducer.maxBlocksInFlightPerAddress":"100","spark.dynamicAllocation.maxExecutors":"900","spark.yarn.appMasterEnv.LD_LIBRARY_PATH":"/export/apps/hadoop/latest/lib/native","spark.executor.id":"driver","spark.yarn.am.memory":"2G","spark.driver.cores":"1","spark.search.packages":"com.company.dali:dali-data-spark,com.company.spark-common:spark-common","spark.min.mem.vore.ratio":"5","spark.sql.sources.partitionOverwriteMode":"DYNAMIC","spark.submit.deployMode":"client","spark.yarn.maxAppAttempts":"1","spark.master":"yarn","spark.default.packages":"com.company.dali:dali-data-spark:8.+?classifier=all,com.company.spark-common:spark-common_2.10:0.+?","spark.isolated.classloader.default.jar":"*dali-data-spark*","spark.authenticate":"true","spark.eventLog.usexattr":"true","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.executor.memory":"2G","spark.home":"/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51","spark.reducer.maxReqsInFlight":"10","spark.eventLog.dir":"hdfs://clusternn02.grid.company.com:9000/system/spark-history","spark.dynamicAllocation.enabled":"true","spark.sql.catalogImplementation":"hive","spark.isolated.classes":"org.apache.hadoop.hive.ql.io.CombineHiveInputFormat$CombineHiveInputSplit","spark.eventLog.compress":"true","spark.executor.cor
es":"1","spark.version":"2.1.0","spark.driver.appUIAddress":"http://node0033.grid.company.com:8364","spark.repl.local.jars":"file:///export/home/edlu/spark-avro_2.11-3.2.0.21.jar,file:///export/apps/hadoop/site/lib/grid-topology-1.0.jar","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"clusterwp01.grid.company.com","spark.min.memory-gb.size":"10","spark.dynamicAllocation.minExecutors":"1","spark.dynamicAllocation.initialExecutors":"3","spark.expressionencoder.org.apache.avro.specific.SpecificRecord":"com.databricks.spark.avro.AvroEncoder$","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://clusterwp01.grid.company.com:8080/proxy/application_1506645932520_24630151","spark.executorEnv.LD_LIBRARY_PATH":"/export/apps/hadoop/latest/lib/native","spark.dynamicAllocation.executorIdleTimeout":"150","spark.shell.auto.node.labeling":"true","spark.yarn.dist.jars":"file:///export/home/edlu/spark-avro_2.11-3.2.0.21.jar,file:///export/apps/hadoop/site/lib/grid-topology-1.0.jar","spark.app.id":"application_1506645932520_24630151","spark.ui.view.acls":"*"},"System Properties":{"java.io.tmpdir":"/tmp","line.separator":"\n","path.separator":":","sun.management.compiler":"HotSpot 64-Bit Tiered Compilers","SPARK_SUBMIT":"true","sun.cpu.endian":"little","java.specification.version":"1.8","java.vm.specification.name":"Java Virtual Machine Specification","java.vendor":"Oracle Corporation","java.vm.specification.version":"1.8","user.home":"*********(redacted)","file.encoding.pkg":"sun.io","sun.nio.ch.bugLevel":"","sun.arch.data.model":"64","sun.boot.library.path":"/usr/java/jdk1.8.0_31/jre/lib/amd64","user.dir":"*********(redacted)","java.library.path":"/usr/java/packages/lib/amd64:/usr/lib64:/lib64:/lib:/usr/lib","sun.cpu.isalist":"","os.arch":"amd64","java.vm.version":"25.31-b07","java.endorsed.dirs":"/usr/java/jdk1.8.0_31/jre/lib/endorsed","java.runtime.version":"1.8.0_31-b13","java.vm.info":"mixed mode","java.ext.dirs":"/usr/java/jdk1.8.0_31/jre/lib/ext:/usr/java/packages/lib/ext","java.runtime.name":"Java(TM) SE Runtime Environment","file.separator":"/","java.class.version":"52.0","scala.usejavacp":"true","java.specification.name":"Java Platform API Specification","sun.boot.class.path":"/usr/java/jdk1.8.0_31/jre/lib/resources.jar:/usr/java/jdk1.8.0_31/jre/lib/rt.jar:/usr/java/jdk1.8.0_31/jre/lib/sunrsasign.jar:/usr/java/jdk1.8.0_31/jre/lib/jsse.jar:/usr/java/jdk1.8.0_31/jre/lib/jce.jar:/usr/java/jdk1.8.0_31/jre/lib/charsets.jar:/usr/java/jdk1.8.0_31/jre/lib/jfr.jar:/usr/java/jdk1.8.0_31/jre/classes","file.encoding":"UTF-8","user.timezone":"*********(redacted)","java.specification.vendor":"Oracle Corporation","sun.java.launcher":"SUN_STANDARD","os.version":"2.6.32-504.16.2.el6.x86_64","sun.os.patch.level":"unknown","java.vm.specification.vendor":"Oracle Corporation","user.country":"*********(redacted)","sun.jnu.encoding":"UTF-8","user.language":"*********(redacted)","java.vendor.url":"*********(redacted)","java.awt.printerjob":"sun.print.PSPrinterJob","java.awt.graphicsenv":"sun.awt.X11GraphicsEnvironment","awt.toolkit":"sun.awt.X11.XToolkit","os.name":"Linux","java.vm.vendor":"Oracle Corporation","java.vendor.url.bug":"*********(redacted)","user.name":"*********(redacted)","java.vm.name":"Java HotSpot(TM) 64-Bit Server VM","sun.java.command":"org.apache.spark.deploy.SparkSubmit --master yarn --deploy-mode client --class org.apache.spark.repl.Main --name Spark shell --jars 
/export/home/edlu/spark-avro_2.11-3.2.0.21.jar,/export/apps/hadoop/site/lib/grid-topology-1.0.jar --num-executors 4 spark-shell","java.home":"/usr/java/jdk1.8.0_31/jre","java.version":"1.8.0_31","sun.io.unicode.encoding":"UnicodeLittle"},"Classpath Entries":{"/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/guice-servlet-3.0.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/jackson-mapper-asl-1.9.13.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/derby-10.12.1.1.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/htrace-core-3.0.4.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/api-asn1-api-1.0.0-M20.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/scala-reflect-2.11.8.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/datanucleus-rdbms-3.2.9.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/spark-graphx_2.11-2.4.0-SNAPSHOT.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/api-util-1.0.0-M20.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/hadoop-yarn-client-2.7.4.51.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/base64-2.3.8.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/hadoop-auth-2.7.4.51.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/validation-api-1.1.0.Final.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/hk2-utils-2.4.0-b34.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/zstd-jni-1.3.2-2.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/hadoop-yarn-api-2.7.4.51.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/objenesis-2.1.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/conf/":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/httpclient-4.5.4.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/kryo-shaded-3.0.3.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/scala-library-2.11.8.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/commons-net-3.1.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/xz-1.0.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/json4s-jackson_2.11-3.5.3.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/javax.servlet-api-3.1.0.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/jersey-server-1.9.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/jackson-annotations-2.6.7.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/parquet-hadoop-1.8.2.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/activation-1.1.1.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/spire_2.11-0.13.0.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/arpack_combined_all-0.1.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/libthrift-0.9.3.jar":"System 
Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/aircompressor-0.8.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/parquet-jackson-1.8.2.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/hk2-api-2.4.0-b34.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/asm-3.1.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/apacheds-kerberos-codec-2.0.0-M15.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/spark-hive_2.11-2.4.0-SNAPSHOT.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/ivy-2.4.0.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/javax.inject-2.4.0-b34.jar":"System Classpath","/export/apps/hadoop/site/etc/hadoop/":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/snappy-java-1.1.7.1.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/arrow-format-0.8.0.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/netty-all-4.1.17.Final.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/avro-ipc-1.7.7.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/xmlenc-0.52.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/jdo-api-3.0.1.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/curator-client-2.7.1.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/antlr-runtime-3.4.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/pyrolite-4.13.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/scala-xml_2.11-1.0.5.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/spark-catalyst_2.11-2.4.0-SNAPSHOT.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/commons-collections-3.2.2.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/slf4j-api-1.7.16.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/stream-2.7.0.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/parquet-format-2.3.1.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/arrow-vector-0.8.0.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/hadoop-yarn-server-web-proxy-2.7.4.51.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/htrace-core-3.1.0-incubating.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/spark-sketch_2.11-2.4.0-SNAPSHOT.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/jersey-common-2.22.2.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/hppc-0.7.2.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/jackson-core-asl-1.9.13.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/spark-sql_2.11-2.4.0-SNAPSHOT.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/univocity-parsers-2.5.9.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/commons-math3-3.4.1.jar":"System 
Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/commons-compiler-3.0.8.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/commons-beanutils-1.7.0.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/java-xmlbuilder-1.1.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/javax.inject-1.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/hadoop-annotations-2.7.4.51.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/netty-3.9.9.Final.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/aopalliance-repackaged-2.4.0-b34.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/zookeeper-3.4.6.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/guice-3.0.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/scala-compiler-2.11.8.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/eigenbase-properties-1.1.5.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/aopalliance-1.0.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/spark-yarn_2.11-2.4.0-SNAPSHOT.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/JavaEWAH-0.3.2.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/jsr305-1.3.9.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/libfb303-0.9.3.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/javax.annotation-api-1.2.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/hadoop-yarn-server-common-2.7.4.51.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/commons-digester-1.8.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/metrics-jvm-3.1.5.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/curator-framework-2.7.1.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/javax.ws.rs-api-2.0.1.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/paranamer-2.8.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/janino-3.0.8.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/hadoop-mapreduce-client-core-2.7.4.51.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/jersey-server-2.22.2.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/orc-core-1.4.3-nohive.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/jsch-0.1.42.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/calcite-linq4j-1.2.0-incubating.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/spark-unsafe_2.11-2.4.0-SNAPSHOT.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/commons-codec-1.10.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/jtransforms-2.4.0.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/lz4-java-1.4.0.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/datanucleus-core-3.2.10.jar":"System 
Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/flatbuffers-1.2.0-3f79e055.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/hive-exec-1.2.1.spark2.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/avro-mapred-1.7.7-hadoop2.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/stax-api-1.0.1.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/core-1.1.2.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/leveldbjni-all-1.8.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/datanucleus-api-jdo-3.2.6.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/jackson-databind-2.6.7.1.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/commons-dbcp-1.4.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/jackson-module-scala_2.11-2.6.7.1.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/commons-lang3-3.5.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/spire-macros_2.11-0.13.0.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/jackson-module-paranamer-2.7.9.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/slf4j-log4j12-1.7.16.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/chill-java-0.8.4.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/jodd-core-3.5.2.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/commons-pool-1.5.4.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/osgi-resource-locator-1.0.1.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/minlog-1.3.0.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/hadoop-mapreduce-client-common-2.7.4.51.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/gson-2.2.4.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/py4j-0.10.6.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/spark-streaming_2.11-2.4.0-SNAPSHOT.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/jackson-core-2.6.7.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/calcite-avatica-1.2.0-incubating.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/machinist_2.11-0.6.1.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/avro-1.7.7.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/commons-beanutils-core-1.8.0.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/apacheds-i18n-2.0.0-M15.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/jersey-media-jaxb-2.22.2.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/snappy-0.2.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/hadoop-mapreduce-client-app-2.7.4.51.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/parquet-hadoop-bundle-1.6.0.jar":"System 
Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/jul-to-slf4j-1.7.16.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/metrics-graphite-3.1.5.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/jcl-over-slf4j-1.7.16.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/metrics-core-3.1.5.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/spark-mllib-local_2.11-2.4.0-SNAPSHOT.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/arrow-memory-0.8.0.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/breeze_2.11-0.13.2.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/jersey-guava-2.22.2.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/hadoop-client-2.7.4.51.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/xercesImpl-2.9.1.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/spark-tags_2.11-2.4.0-SNAPSHOT.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/javolution-5.5.1.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/jetty-6.1.26.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/joda-time-2.9.3.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/antlr-2.7.7.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/hadoop-mapreduce-client-jobclient-2.7.4.51.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/commons-lang-2.6.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/compress-lzf-1.0.3.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/commons-crypto-1.0.0.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/jersey-core-1.9.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/curator-recipes-2.7.1.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/hk2-locator-2.4.0-b34.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/guava-14.0.1.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/jackson-jaxrs-1.9.13.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/spark-core_2.11-2.4.0-SNAPSHOT.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/jetty-sslengine-6.1.26.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/spark-network-common_2.11-2.4.0-SNAPSHOT.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/spark-launcher_2.11-2.4.0-SNAPSHOT.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/json4s-ast_2.11-3.5.3.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/antlr4-runtime-4.7.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/jetty-util-6.1.26.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/jaxb-api-2.2.2.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/commons-io-2.4.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/parquet-encoding-1.8.2.jar":"System 
Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/httpcore-4.4.8.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/macro-compat_2.11-1.1.1.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/jackson-xc-1.9.13.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/xbean-asm5-shaded-4.4.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/breeze-macros_2.11-0.13.2.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/protobuf-java-2.5.0.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/json4s-scalap_2.11-3.5.3.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/spark-mllib_2.11-2.4.0-SNAPSHOT.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/commons-configuration-1.6.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/commons-compress-1.4.1.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/json4s-core_2.11-3.5.3.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/orc-mapreduce-1.4.3-nohive.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/ST4-4.0.4.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/calcite-core-1.2.0-incubating.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/hadoop-mapreduce-client-shuffle-2.7.4.51.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/hadoop-common-2.7.4.51.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/spark-repl_2.11-2.4.0-SNAPSHOT.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/jersey-container-servlet-2.22.2.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/opencsv-2.3.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/commons-logging-1.1.3.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/shapeless_2.11-2.3.2.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/commons-cli-1.2.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/jersey-client-2.22.2.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/hadoop-yarn-common-2.7.4.51.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/hadoop-hdfs-2.7.4.51.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/log4j-1.2.17.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/parquet-column-1.8.2.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/hive-metastore-1.2.1.spark2.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/RoaringBitmap-0.5.11.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/chill_2.11-0.8.4.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/jersey-container-servlet-core-2.22.2.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/stringtemplate-3.2.1.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/parquet-common-1.8.2.jar":"System 
Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/spark-network-shuffle_2.11-2.4.0-SNAPSHOT.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/spark-kvstore_2.11-2.4.0-SNAPSHOT.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/stax-api-1.0-2.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/jta-1.1.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/javassist-3.18.1-GA.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/commons-httpclient-3.1.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/jets3t-0.9.4.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/apache-log4j-extras-1.2.17.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/metrics-json-3.1.5.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/bcprov-jdk15on-1.58.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/oro-2.0.8.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/bonecp-0.8.0.RELEASE.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/jsp-api-2.1.jar":"System Classpath","/export/home/edlu/spark-2.4.0-SNAPSHOT-bin-2.7.4.51/jars/scala-parser-combinators_2.11-1.0.4.jar":"System Classpath"}} +{"Event":"SparkListenerApplicationStart","App Name":"Spark shell","App ID":"application_1506645932520_24630151","Timestamp":1524182082734,"User":"edlu"} +{"Event":"SparkListenerExecutorAdded","Timestamp":1524182111695,"Executor ID":"1","Executor Info":{"Host":"node1404.grid.company.com","Total Cores":1,"Log Urls":{"stdout":"http://node1404.grid.company.com:8042/node/containerlogs/container_e05_1523494505172_1552404_01_000002/edlu/stdout?start=-4096","stderr":"http://node1404.grid.company.com:8042/node/containerlogs/container_e05_1523494505172_1552404_01_000002/edlu/stderr?start=-4096"}}} +{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"1","Host":"node1404.grid.company.com","Port":34043},"Maximum Memory":956615884,"Timestamp":1524182111795,"Maximum Onheap Memory":956615884,"Maximum Offheap Memory":0} +{"Event":"SparkListenerExecutorAdded","Timestamp":1524182112088,"Executor ID":"3","Executor Info":{"Host":"node0998.grid.company.com","Total Cores":1,"Log Urls":{"stdout":"http://node0998.grid.company.com:8042/node/containerlogs/container_e05_1523494505172_1552404_01_000005/edlu/stdout?start=-4096","stderr":"http://node0998.grid.company.com:8042/node/containerlogs/container_e05_1523494505172_1552404_01_000005/edlu/stderr?start=-4096"}}} +{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"3","Host":"node0998.grid.company.com","Port":45265},"Maximum Memory":956615884,"Timestamp":1524182112208,"Maximum Onheap Memory":956615884,"Maximum Offheap Memory":0} +{"Event":"SparkListenerExecutorAdded","Timestamp":1524182112278,"Executor ID":"4","Executor Info":{"Host":"node4243.grid.company.com","Total Cores":1,"Log Urls":{"stdout":"http://node4243.grid.company.com:8042/node/containerlogs/container_e05_1523494505172_1552404_01_000006/edlu/stdout?start=-4096","stderr":"http://node4243.grid.company.com:8042/node/containerlogs/container_e05_1523494505172_1552404_01_000006/edlu/stderr?start=-4096"}}} +{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor 
ID":"4","Host":"node4243.grid.company.com","Port":16084},"Maximum Memory":956615884,"Timestamp":1524182112408,"Maximum Onheap Memory":956615884,"Maximum Offheap Memory":0} +{"Event":"SparkListenerExecutorAdded","Timestamp":1524182112471,"Executor ID":"2","Executor Info":{"Host":"node4045.grid.company.com","Total Cores":1,"Log Urls":{"stdout":"http://node4045.grid.company.com:8042/node/containerlogs/container_e05_1523494505172_1552404_01_000004/edlu/stdout?start=-4096","stderr":"http://node4045.grid.company.com:8042/node/containerlogs/container_e05_1523494505172_1552404_01_000004/edlu/stderr?start=-4096"}}} +{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"2","Host":"node4045.grid.company.com","Port":29262},"Maximum Memory":956615884,"Timestamp":1524182112578,"Maximum Onheap Memory":956615884,"Maximum Offheap Memory":0} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":0,"description":"createOrReplaceTempView at :40","details":"org.apache.spark.sql.Dataset.createOrReplaceTempView(Dataset.scala:3033)\n$line44.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:40)\n$line44.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:45)\n$line44.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:47)\n$line44.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:49)\n$line44.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:51)\n$line44.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:53)\n$line44.$read$$iw$$iw$$iw$$iw$$iw$$iw.(:55)\n$line44.$read$$iw$$iw$$iw$$iw$$iw.(:57)\n$line44.$read$$iw$$iw$$iw$$iw.(:59)\n$line44.$read$$iw$$iw$$iw.(:61)\n$line44.$read$$iw$$iw.(:63)\n$line44.$read$$iw.(:65)\n$line44.$read.(:67)\n$line44.$read$.(:71)\n$line44.$read$.()\n$line44.$eval$.$print$lzycompute(:7)\n$line44.$eval$.$print(:6)\n$line44.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)","physicalPlanDescription":"== Parsed Logical Plan ==\nCreateViewCommand `apps`, false, true, LocalTempView\n +- AnalysisBarrier\n +- Project [appId#0, attemptId#1, name#2, mode#3, completed#4, duration#5L, endTime#6, endTimeEpoch#7L, lastUpdated#8, lastUpdatedEpoch#9L, sparkUser#10, startTime#11, startTimeEpoch#12L, appSparkVersion#13, cast(endTime#6 as date) AS endDate#28]\n +- Relation[appId#0,attemptId#1,name#2,mode#3,completed#4,duration#5L,endTime#6,endTimeEpoch#7L,lastUpdated#8,lastUpdatedEpoch#9L,sparkUser#10,startTime#11,startTimeEpoch#12L,appSparkVersion#13] avro\n\n== Analyzed Logical Plan ==\nCreateViewCommand `apps`, false, true, LocalTempView\n +- AnalysisBarrier\n +- Project [appId#0, attemptId#1, name#2, mode#3, completed#4, duration#5L, endTime#6, endTimeEpoch#7L, lastUpdated#8, lastUpdatedEpoch#9L, sparkUser#10, startTime#11, startTimeEpoch#12L, appSparkVersion#13, cast(endTime#6 as date) AS endDate#28]\n +- Relation[appId#0,attemptId#1,name#2,mode#3,completed#4,duration#5L,endTime#6,endTimeEpoch#7L,lastUpdated#8,lastUpdatedEpoch#9L,sparkUser#10,startTime#11,startTimeEpoch#12L,appSparkVersion#13] avro\n\n== Optimized Logical Plan ==\nCreateViewCommand `apps`, false, true, LocalTempView\n +- AnalysisBarrier\n +- Project [appId#0, attemptId#1, name#2, mode#3, completed#4, duration#5L, endTime#6, endTimeEpoch#7L, lastUpdated#8, lastUpdatedEpoch#9L, sparkUser#10, startTime#11, startTimeEpoch#12L, appSparkVersion#13, cast(endTime#6 as date) AS endDate#28]\n +- Relation[appId#0,attemptId#1,name#2,mode#3,completed#4,duration#5L,endTime#6,endTimeEpoch#7L,lastUpdated#8,lastUpdatedEpoch#9L,sparkUser#10,startTime#11,startTimeEpoch#12L,appSparkVersion#13] 
avro\n\n== Physical Plan ==\nExecute CreateViewCommand\n +- CreateViewCommand `apps`, false, true, LocalTempView\n +- AnalysisBarrier\n +- Project [appId#0, attemptId#1, name#2, mode#3, completed#4, duration#5L, endTime#6, endTimeEpoch#7L, lastUpdated#8, lastUpdatedEpoch#9L, sparkUser#10, startTime#11, startTimeEpoch#12L, appSparkVersion#13, cast(endTime#6 as date) AS endDate#28]\n +- Relation[appId#0,attemptId#1,name#2,mode#3,completed#4,duration#5L,endTime#6,endTimeEpoch#7L,lastUpdated#8,lastUpdatedEpoch#9L,sparkUser#10,startTime#11,startTimeEpoch#12L,appSparkVersion#13] avro","sparkPlanInfo":{"nodeName":"Execute CreateViewCommand","simpleString":"Execute CreateViewCommand","children":[],"metrics":[]},"time":1524182125829} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":0,"time":1524182125832} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":1,"description":"createOrReplaceTempView at :40","details":"org.apache.spark.sql.Dataset.createOrReplaceTempView(Dataset.scala:3033)\n$line48.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:40)\n$line48.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:45)\n$line48.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:47)\n$line48.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:49)\n$line48.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:51)\n$line48.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:53)\n$line48.$read$$iw$$iw$$iw$$iw$$iw$$iw.(:55)\n$line48.$read$$iw$$iw$$iw$$iw$$iw.(:57)\n$line48.$read$$iw$$iw$$iw$$iw.(:59)\n$line48.$read$$iw$$iw$$iw.(:61)\n$line48.$read$$iw$$iw.(:63)\n$line48.$read$$iw.(:65)\n$line48.$read.(:67)\n$line48.$read$.(:71)\n$line48.$read$.()\n$line48.$eval$.$print$lzycompute(:7)\n$line48.$eval$.$print(:6)\n$line48.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)","physicalPlanDescription":"== Parsed Logical Plan ==\nCreateViewCommand `sys_props`, false, true, LocalTempView\n +- AnalysisBarrier\n +- Aggregate [appId#137], [appId#137, first(if ((key#148 <=> azkaban.link.workflow.url)) value#149 else cast(null as string), true) AS azkaban.link.workflow.url#159, first(if ((key#148 <=> azkaban.link.execution.url)) value#149 else cast(null as string), true) AS azkaban.link.execution.url#161, first(if ((key#148 <=> azkaban.link.job.url)) value#149 else cast(null as string), true) AS azkaban.link.job.url#163, first(if ((key#148 <=> user.name)) value#149 else cast(null as string), true) AS user.name#165]\n +- Project [appId#137, col#145.key AS key#148, col#145.value AS value#149]\n +- Project [appId#137, col#145]\n +- Generate explode(systemProperties#135), false, [col#145]\n +- Relation[runtime#133,sparkProperties#134,systemProperties#135,classpathEntries#136,appId#137,attemptId#138] avro\n\n== Analyzed Logical Plan ==\nCreateViewCommand `sys_props`, false, true, LocalTempView\n +- AnalysisBarrier\n +- Aggregate [appId#137], [appId#137, first(if ((key#148 <=> azkaban.link.workflow.url)) value#149 else cast(null as string), true) AS azkaban.link.workflow.url#159, first(if ((key#148 <=> azkaban.link.execution.url)) value#149 else cast(null as string), true) AS azkaban.link.execution.url#161, first(if ((key#148 <=> azkaban.link.job.url)) value#149 else cast(null as string), true) AS azkaban.link.job.url#163, first(if ((key#148 <=> user.name)) value#149 else cast(null as string), true) AS user.name#165]\n +- Project [appId#137, col#145.key AS key#148, col#145.value AS value#149]\n +- Project [appId#137, col#145]\n +- Generate 
explode(systemProperties#135), false, [col#145]\n +- Relation[runtime#133,sparkProperties#134,systemProperties#135,classpathEntries#136,appId#137,attemptId#138] avro\n\n== Optimized Logical Plan ==\nCreateViewCommand `sys_props`, false, true, LocalTempView\n +- AnalysisBarrier\n +- Aggregate [appId#137], [appId#137, first(if ((key#148 <=> azkaban.link.workflow.url)) value#149 else cast(null as string), true) AS azkaban.link.workflow.url#159, first(if ((key#148 <=> azkaban.link.execution.url)) value#149 else cast(null as string), true) AS azkaban.link.execution.url#161, first(if ((key#148 <=> azkaban.link.job.url)) value#149 else cast(null as string), true) AS azkaban.link.job.url#163, first(if ((key#148 <=> user.name)) value#149 else cast(null as string), true) AS user.name#165]\n +- Project [appId#137, col#145.key AS key#148, col#145.value AS value#149]\n +- Project [appId#137, col#145]\n +- Generate explode(systemProperties#135), false, [col#145]\n +- Relation[runtime#133,sparkProperties#134,systemProperties#135,classpathEntries#136,appId#137,attemptId#138] avro\n\n== Physical Plan ==\nExecute CreateViewCommand\n +- CreateViewCommand `sys_props`, false, true, LocalTempView\n +- AnalysisBarrier\n +- Aggregate [appId#137], [appId#137, first(if ((key#148 <=> azkaban.link.workflow.url)) value#149 else cast(null as string), true) AS azkaban.link.workflow.url#159, first(if ((key#148 <=> azkaban.link.execution.url)) value#149 else cast(null as string), true) AS azkaban.link.execution.url#161, first(if ((key#148 <=> azkaban.link.job.url)) value#149 else cast(null as string), true) AS azkaban.link.job.url#163, first(if ((key#148 <=> user.name)) value#149 else cast(null as string), true) AS user.name#165]\n +- Project [appId#137, col#145.key AS key#148, col#145.value AS value#149]\n +- Project [appId#137, col#145]\n +- Generate explode(systemProperties#135), false, [col#145]\n +- Relation[runtime#133,sparkProperties#134,systemProperties#135,classpathEntries#136,appId#137,attemptId#138] avro","sparkPlanInfo":{"nodeName":"Execute CreateViewCommand","simpleString":"Execute CreateViewCommand","children":[],"metrics":[]},"time":1524182128463} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":1,"time":1524182128463} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":2,"description":"show at :40","details":"org.apache.spark.sql.Dataset.show(Dataset.scala:691)\n$line50.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:40)\n$line50.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:45)\n$line50.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:47)\n$line50.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:49)\n$line50.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:51)\n$line50.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:53)\n$line50.$read$$iw$$iw$$iw$$iw$$iw$$iw.(:55)\n$line50.$read$$iw$$iw$$iw$$iw$$iw.(:57)\n$line50.$read$$iw$$iw$$iw$$iw.(:59)\n$line50.$read$$iw$$iw$$iw.(:61)\n$line50.$read$$iw$$iw.(:63)\n$line50.$read$$iw.(:65)\n$line50.$read.(:67)\n$line50.$read$.(:71)\n$line50.$read$.()\n$line50.$eval$.$print$lzycompute(:7)\n$line50.$eval$.$print(:6)\n$line50.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)","physicalPlanDescription":"== Parsed Logical Plan ==\nGlobalLimit 21\n+- LocalLimit 21\n +- AnalysisBarrier\n +- Project [cast(appId#0 as string) AS appId#397, cast(attemptId#1 as string) AS attemptId#398, cast(name#2 as string) AS name#399, cast(mode#3 as string) AS mode#400, cast(completed#4 as string) AS 
completed#401, cast(duration#5L as string) AS duration#402, cast(endTime#6 as string) AS endTime#403, cast(endTimeEpoch#7L as string) AS endTimeEpoch#404, cast(lastUpdated#8 as string) AS lastUpdated#405, cast(lastUpdatedEpoch#9L as string) AS lastUpdatedEpoch#406, cast(sparkUser#10 as string) AS sparkUser#407, cast(startTime#11 as string) AS startTime#408, cast(startTimeEpoch#12L as string) AS startTimeEpoch#409, cast(appSparkVersion#13 as string) AS appSparkVersion#410, cast(endDate#28 as string) AS endDate#411, cast(azkaban.link.workflow.url#159 as string) AS azkaban.link.workflow.url#412, cast(azkaban.link.execution.url#161 as string) AS azkaban.link.execution.url#413, cast(azkaban.link.job.url#163 as string) AS azkaban.link.job.url#414, cast(user.name#165 as string) AS user.name#415]\n +- Project [appId#0, attemptId#1, name#2, mode#3, completed#4, duration#5L, endTime#6, endTimeEpoch#7L, lastUpdated#8, lastUpdatedEpoch#9L, sparkUser#10, startTime#11, startTimeEpoch#12L, appSparkVersion#13, endDate#28, azkaban.link.workflow.url#159, azkaban.link.execution.url#161, azkaban.link.job.url#163, user.name#165]\n +- Join LeftOuter, (appId#0 = appId#137)\n :- Project [appId#0, attemptId#1, name#2, mode#3, completed#4, duration#5L, endTime#6, endTimeEpoch#7L, lastUpdated#8, lastUpdatedEpoch#9L, sparkUser#10, startTime#11, startTimeEpoch#12L, appSparkVersion#13, cast(endTime#6 as date) AS endDate#28]\n : +- Relation[appId#0,attemptId#1,name#2,mode#3,completed#4,duration#5L,endTime#6,endTimeEpoch#7L,lastUpdated#8,lastUpdatedEpoch#9L,sparkUser#10,startTime#11,startTimeEpoch#12L,appSparkVersion#13] avro\n +- Aggregate [appId#137], [appId#137, first(if ((key#148 <=> azkaban.link.workflow.url)) value#149 else cast(null as string), true) AS azkaban.link.workflow.url#159, first(if ((key#148 <=> azkaban.link.execution.url)) value#149 else cast(null as string), true) AS azkaban.link.execution.url#161, first(if ((key#148 <=> azkaban.link.job.url)) value#149 else cast(null as string), true) AS azkaban.link.job.url#163, first(if ((key#148 <=> user.name)) value#149 else cast(null as string), true) AS user.name#165]\n +- Project [appId#137, col#145.key AS key#148, col#145.value AS value#149]\n +- Project [appId#137, col#145]\n +- Generate explode(systemProperties#135), false, [col#145]\n +- Relation[runtime#133,sparkProperties#134,systemProperties#135,classpathEntries#136,appId#137,attemptId#138] avro\n\n== Analyzed Logical Plan ==\nappId: string, attemptId: string, name: string, mode: string, completed: string, duration: string, endTime: string, endTimeEpoch: string, lastUpdated: string, lastUpdatedEpoch: string, sparkUser: string, startTime: string, startTimeEpoch: string, appSparkVersion: string, endDate: string, azkaban.link.workflow.url: string, azkaban.link.execution.url: string, azkaban.link.job.url: string, user.name: string\nGlobalLimit 21\n+- LocalLimit 21\n +- Project [cast(appId#0 as string) AS appId#397, cast(attemptId#1 as string) AS attemptId#398, cast(name#2 as string) AS name#399, cast(mode#3 as string) AS mode#400, cast(completed#4 as string) AS completed#401, cast(duration#5L as string) AS duration#402, cast(endTime#6 as string) AS endTime#403, cast(endTimeEpoch#7L as string) AS endTimeEpoch#404, cast(lastUpdated#8 as string) AS lastUpdated#405, cast(lastUpdatedEpoch#9L as string) AS lastUpdatedEpoch#406, cast(sparkUser#10 as string) AS sparkUser#407, cast(startTime#11 as string) AS startTime#408, cast(startTimeEpoch#12L as string) AS startTimeEpoch#409, cast(appSparkVersion#13 as string) 
AS appSparkVersion#410, cast(endDate#28 as string) AS endDate#411, cast(azkaban.link.workflow.url#159 as string) AS azkaban.link.workflow.url#412, cast(azkaban.link.execution.url#161 as string) AS azkaban.link.execution.url#413, cast(azkaban.link.job.url#163 as string) AS azkaban.link.job.url#414, cast(user.name#165 as string) AS user.name#415]\n +- Project [appId#0, attemptId#1, name#2, mode#3, completed#4, duration#5L, endTime#6, endTimeEpoch#7L, lastUpdated#8, lastUpdatedEpoch#9L, sparkUser#10, startTime#11, startTimeEpoch#12L, appSparkVersion#13, endDate#28, azkaban.link.workflow.url#159, azkaban.link.execution.url#161, azkaban.link.job.url#163, user.name#165]\n +- Join LeftOuter, (appId#0 = appId#137)\n :- Project [appId#0, attemptId#1, name#2, mode#3, completed#4, duration#5L, endTime#6, endTimeEpoch#7L, lastUpdated#8, lastUpdatedEpoch#9L, sparkUser#10, startTime#11, startTimeEpoch#12L, appSparkVersion#13, cast(endTime#6 as date) AS endDate#28]\n : +- Relation[appId#0,attemptId#1,name#2,mode#3,completed#4,duration#5L,endTime#6,endTimeEpoch#7L,lastUpdated#8,lastUpdatedEpoch#9L,sparkUser#10,startTime#11,startTimeEpoch#12L,appSparkVersion#13] avro\n +- Aggregate [appId#137], [appId#137, first(if ((key#148 <=> azkaban.link.workflow.url)) value#149 else cast(null as string), true) AS azkaban.link.workflow.url#159, first(if ((key#148 <=> azkaban.link.execution.url)) value#149 else cast(null as string), true) AS azkaban.link.execution.url#161, first(if ((key#148 <=> azkaban.link.job.url)) value#149 else cast(null as string), true) AS azkaban.link.job.url#163, first(if ((key#148 <=> user.name)) value#149 else cast(null as string), true) AS user.name#165]\n +- Project [appId#137, col#145.key AS key#148, col#145.value AS value#149]\n +- Project [appId#137, col#145]\n +- Generate explode(systemProperties#135), false, [col#145]\n +- Relation[runtime#133,sparkProperties#134,systemProperties#135,classpathEntries#136,appId#137,attemptId#138] avro\n\n== Optimized Logical Plan ==\nGlobalLimit 21\n+- LocalLimit 21\n +- Project [appId#0, attemptId#1, name#2, mode#3, cast(completed#4 as string) AS completed#401, cast(duration#5L as string) AS duration#402, endTime#6, cast(endTimeEpoch#7L as string) AS endTimeEpoch#404, lastUpdated#8, cast(lastUpdatedEpoch#9L as string) AS lastUpdatedEpoch#406, sparkUser#10, startTime#11, cast(startTimeEpoch#12L as string) AS startTimeEpoch#409, appSparkVersion#13, cast(endDate#28 as string) AS endDate#411, azkaban.link.workflow.url#159, azkaban.link.execution.url#161, azkaban.link.job.url#163, user.name#165]\n +- InMemoryRelation [appId#0, attemptId#1, name#2, mode#3, completed#4, duration#5L, endTime#6, endTimeEpoch#7L, lastUpdated#8, lastUpdatedEpoch#9L, sparkUser#10, startTime#11, startTimeEpoch#12L, appSparkVersion#13, endDate#28, azkaban.link.workflow.url#159, azkaban.link.execution.url#161, azkaban.link.job.url#163, user.name#165], true, 10000, StorageLevel(disk, memory, deserialized, 1 replicas)\n +- *(5) Project [appId#0, attemptId#1, name#2, mode#3, completed#4, duration#5L, endTime#6, endTimeEpoch#7L, lastUpdated#8, lastUpdatedEpoch#9L, sparkUser#10, startTime#11, startTimeEpoch#12L, appSparkVersion#13, endDate#28, azkaban.link.workflow.url#159, azkaban.link.execution.url#161, azkaban.link.job.url#163, user.name#165]\n +- SortMergeJoin [appId#0], [appId#137], LeftOuter\n :- *(1) Sort [appId#0 ASC NULLS FIRST], false, 0\n : +- Exchange hashpartitioning(appId#0, 200)\n : +- InMemoryTableScan [appId#0, attemptId#1, name#2, mode#3, completed#4, duration#5L, 
endTime#6, endTimeEpoch#7L, lastUpdated#8, lastUpdatedEpoch#9L, sparkUser#10, startTime#11, startTimeEpoch#12L, appSparkVersion#13, endDate#28]\n : +- InMemoryRelation [appId#0, attemptId#1, name#2, mode#3, completed#4, duration#5L, endTime#6, endTimeEpoch#7L, lastUpdated#8, lastUpdatedEpoch#9L, sparkUser#10, startTime#11, startTimeEpoch#12L, appSparkVersion#13, endDate#28], true, 10000, StorageLevel(disk, memory, deserialized, 1 replicas)\n : +- *(1) Project [appId#0, attemptId#1, name#2, mode#3, completed#4, duration#5L, endTime#6, endTimeEpoch#7L, lastUpdated#8, lastUpdatedEpoch#9L, sparkUser#10, startTime#11, startTimeEpoch#12L, appSparkVersion#13, cast(endTime#6 as date) AS endDate#28]\n : +- *(1) FileScan avro [appId#0,attemptId#1,name#2,mode#3,completed#4,duration#5L,endTime#6,endTimeEpoch#7L,lastUpdated#8,lastUpdatedEpoch#9L,sparkUser#10,startTime#11,startTimeEpoch#12L,appSparkVersion#13] Batched: false, Format: com.databricks.spark.avro.DefaultSource@7006b304, Location: InMemoryFileIndex[hdfs://clusternn01.grid.company.com:9000/data/hadoopdev/sparkmetrics/ltx1-..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct azkaban.link.workflow.url)) value#149 else null, true), first(if ((key#148 <=> azkaban.link.execution.url)) value#149 else null, true), first(if ((key#148 <=> azkaban.link.job.url)) value#149 else null, true), first(if ((key#148 <=> user.name)) value#149 else null, true)], output=[appId#137, azkaban.link.workflow.url#159, azkaban.link.execution.url#161, azkaban.link.job.url#163, user.name#165])\n +- *(4) Sort [appId#137 ASC NULLS FIRST], false, 0\n +- Exchange hashpartitioning(appId#137, 200)\n +- SortAggregate(key=[appId#137], functions=[partial_first(if ((key#148 <=> azkaban.link.workflow.url)) value#149 else null, true), partial_first(if ((key#148 <=> azkaban.link.execution.url)) value#149 else null, true), partial_first(if ((key#148 <=> azkaban.link.job.url)) value#149 else null, true), partial_first(if ((key#148 <=> user.name)) value#149 else null, true)], output=[appId#137, first#273, valueSet#274, first#275, valueSet#276, first#277, valueSet#278, first#279, valueSet#280])\n +- *(3) Sort [appId#137 ASC NULLS FIRST], false, 0\n +- *(3) Project [appId#137, col#145.key AS key#148, col#145.value AS value#149]\n +- Generate explode(systemProperties#135), [appId#137], false, [col#145]\n +- *(2) FileScan avro [systemProperties#135,appId#137] Batched: false, Format: com.databricks.spark.avro.DefaultSource@485d3d1, Location: InMemoryFileIndex[hdfs://clusternn01.grid.company.com:9000/data/hadoopdev/sparkmetrics/ltx1-..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct>,appId:string>\n\n== Physical Plan ==\nCollectLimit 21\n+- *(1) LocalLimit 21\n +- *(1) Project [appId#0, attemptId#1, name#2, mode#3, cast(completed#4 as string) AS completed#401, cast(duration#5L as string) AS duration#402, endTime#6, cast(endTimeEpoch#7L as string) AS endTimeEpoch#404, lastUpdated#8, cast(lastUpdatedEpoch#9L as string) AS lastUpdatedEpoch#406, sparkUser#10, startTime#11, cast(startTimeEpoch#12L as string) AS startTimeEpoch#409, appSparkVersion#13, cast(endDate#28 as string) AS endDate#411, azkaban.link.workflow.url#159, azkaban.link.execution.url#161, azkaban.link.job.url#163, user.name#165]\n +- InMemoryTableScan [appId#0, appSparkVersion#13, attemptId#1, azkaban.link.execution.url#161, azkaban.link.job.url#163, azkaban.link.workflow.url#159, completed#4, duration#5L, endDate#28, endTime#6, endTimeEpoch#7L, lastUpdated#8, lastUpdatedEpoch#9L, mode#3, name#2, 
sparkUser#10, startTime#11, startTimeEpoch#12L, user.name#165]\n +- InMemoryRelation [appId#0, attemptId#1, name#2, mode#3, completed#4, duration#5L, endTime#6, endTimeEpoch#7L, lastUpdated#8, lastUpdatedEpoch#9L, sparkUser#10, startTime#11, startTimeEpoch#12L, appSparkVersion#13, endDate#28, azkaban.link.workflow.url#159, azkaban.link.execution.url#161, azkaban.link.job.url#163, user.name#165], true, 10000, StorageLevel(disk, memory, deserialized, 1 replicas)\n +- *(5) Project [appId#0, attemptId#1, name#2, mode#3, completed#4, duration#5L, endTime#6, endTimeEpoch#7L, lastUpdated#8, lastUpdatedEpoch#9L, sparkUser#10, startTime#11, startTimeEpoch#12L, appSparkVersion#13, endDate#28, azkaban.link.workflow.url#159, azkaban.link.execution.url#161, azkaban.link.job.url#163, user.name#165]\n +- SortMergeJoin [appId#0], [appId#137], LeftOuter\n :- *(1) Sort [appId#0 ASC NULLS FIRST], false, 0\n : +- Exchange hashpartitioning(appId#0, 200)\n : +- InMemoryTableScan [appId#0, attemptId#1, name#2, mode#3, completed#4, duration#5L, endTime#6, endTimeEpoch#7L, lastUpdated#8, lastUpdatedEpoch#9L, sparkUser#10, startTime#11, startTimeEpoch#12L, appSparkVersion#13, endDate#28]\n : +- InMemoryRelation [appId#0, attemptId#1, name#2, mode#3, completed#4, duration#5L, endTime#6, endTimeEpoch#7L, lastUpdated#8, lastUpdatedEpoch#9L, sparkUser#10, startTime#11, startTimeEpoch#12L, appSparkVersion#13, endDate#28], true, 10000, StorageLevel(disk, memory, deserialized, 1 replicas)\n : +- *(1) Project [appId#0, attemptId#1, name#2, mode#3, completed#4, duration#5L, endTime#6, endTimeEpoch#7L, lastUpdated#8, lastUpdatedEpoch#9L, sparkUser#10, startTime#11, startTimeEpoch#12L, appSparkVersion#13, cast(endTime#6 as date) AS endDate#28]\n : +- *(1) FileScan avro [appId#0,attemptId#1,name#2,mode#3,completed#4,duration#5L,endTime#6,endTimeEpoch#7L,lastUpdated#8,lastUpdatedEpoch#9L,sparkUser#10,startTime#11,startTimeEpoch#12L,appSparkVersion#13] Batched: false, Format: com.databricks.spark.avro.DefaultSource@7006b304, Location: InMemoryFileIndex[hdfs://clusternn01.grid.company.com:9000/data/hadoopdev/sparkmetrics/ltx1-..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct azkaban.link.workflow.url)) value#149 else null, true), first(if ((key#148 <=> azkaban.link.execution.url)) value#149 else null, true), first(if ((key#148 <=> azkaban.link.job.url)) value#149 else null, true), first(if ((key#148 <=> user.name)) value#149 else null, true)], output=[appId#137, azkaban.link.workflow.url#159, azkaban.link.execution.url#161, azkaban.link.job.url#163, user.name#165])\n +- *(4) Sort [appId#137 ASC NULLS FIRST], false, 0\n +- Exchange hashpartitioning(appId#137, 200)\n +- SortAggregate(key=[appId#137], functions=[partial_first(if ((key#148 <=> azkaban.link.workflow.url)) value#149 else null, true), partial_first(if ((key#148 <=> azkaban.link.execution.url)) value#149 else null, true), partial_first(if ((key#148 <=> azkaban.link.job.url)) value#149 else null, true), partial_first(if ((key#148 <=> user.name)) value#149 else null, true)], output=[appId#137, first#273, valueSet#274, first#275, valueSet#276, first#277, valueSet#278, first#279, valueSet#280])\n +- *(3) Sort [appId#137 ASC NULLS FIRST], false, 0\n +- *(3) Project [appId#137, col#145.key AS key#148, col#145.value AS value#149]\n +- Generate explode(systemProperties#135), [appId#137], false, [col#145]\n +- *(2) FileScan avro [systemProperties#135,appId#137] Batched: false, Format: com.databricks.spark.avro.DefaultSource@485d3d1, Location: 
InMemoryFileIndex[hdfs://clusternn01.grid.company.com:9000/data/hadoopdev/sparkmetrics/ltx1-..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct>,appId:string>","sparkPlanInfo":{"nodeName":"CollectLimit","simpleString":"CollectLimit 21","children":[{"nodeName":"WholeStageCodegen","simpleString":"WholeStageCodegen","children":[{"nodeName":"LocalLimit","simpleString":"LocalLimit 21","children":[{"nodeName":"Project","simpleString":"Project [appId#0, attemptId#1, name#2, mode#3, cast(completed#4 as string) AS completed#401, cast(duration#5L as string) AS duration#402, endTime#6, cast(endTimeEpoch#7L as string) AS endTimeEpoch#404, lastUpdated#8, cast(lastUpdatedEpoch#9L as string) AS lastUpdatedEpoch#406, sparkUser#10, startTime#11, cast(startTimeEpoch#12L as string) AS startTimeEpoch#409, appSparkVersion#13, cast(endDate#28 as string) AS endDate#411, azkaban.link.workflow.url#159, azkaban.link.execution.url#161, azkaban.link.job.url#163, user.name#165]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"InMemoryTableScan","simpleString":"InMemoryTableScan [appId#0, appSparkVersion#13, attemptId#1, azkaban.link.execution.url#161, azkaban.link.job.url#163, azkaban.link.workflow.url#159, completed#4, duration#5L, endDate#28, endTime#6, endTimeEpoch#7L, lastUpdated#8, lastUpdatedEpoch#9L, mode#3, name#2, sparkUser#10, startTime#11, startTimeEpoch#12L, user.name#165]","children":[],"metrics":[{"name":"number of output rows","accumulatorId":35,"metricType":"sum"},{"name":"scan time total (min, med, max)","accumulatorId":36,"metricType":"timing"}]}],"metrics":[]}],"metrics":[]}],"metrics":[]}],"metrics":[{"name":"duration total (min, med, max)","accumulatorId":34,"metricType":"timing"}]}],"metrics":[]},"time":1524182129952} +{"Event":"SparkListenerJobStart","Job ID":0,"Submission Time":1524182130194,"Stage Infos":[{"Stage ID":0,"Stage Attempt ID":0,"Stage Name":"cache at :41","Number of Tasks":4,"RDD Info":[{"RDD ID":6,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"11\",\"name\":\"Exchange\"}","Callsite":"cache at :41","Parent IDs":[5],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":4,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":0,"Name":"FileScanRDD","Scope":"{\"id\":\"0\",\"name\":\"WholeStageCodegen\"}","Callsite":"cache at :39","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":4,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"*(1) Project [appId#0, attemptId#1, name#2, mode#3, completed#4, duration#5L, endTime#6, endTimeEpoch#7L, lastUpdated#8, lastUpdatedEpoch#9L, sparkUser#10, startTime#11, startTimeEpoch#12L, appSparkVersion#13, cast(endTime#6 as date) AS endDate#28]\n+- *(1) FileScan avro [appId#0,attemptId#1,name#2,mode#3,completed#4,duration#5L,endTime#6,endTimeEpoch#7L,lastUpdated#8,lastUpdatedEpoch#9L,sparkUser#10,startTime#11,startTimeEpoch#12L,appSparkVersion#13] Batched: false, Format: com.databricks.spark.avro.DefaultSource@7006b304, Location: InMemoryFileIndex[hdfs://clusternn01.grid.company.com:9000/data/hadoopdev/sparkmetrics/ltx1-..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct:39","Parent IDs":[1],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":true,"Replication":1},"Number of Partitions":4,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD 
ID":1,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"0\",\"name\":\"WholeStageCodegen\"}","Callsite":"cache at :39","Parent IDs":[0],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":4,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"12\",\"name\":\"InMemoryTableScan\"}","Callsite":"cache at :41","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":4,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"12\",\"name\":\"InMemoryTableScan\"}","Callsite":"cache at :41","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":4,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.Dataset.cache(Dataset.scala:2912)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:41)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:46)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:48)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:50)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:52)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:54)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw.(:56)\n$line49.$read$$iw$$iw$$iw$$iw$$iw.(:58)\n$line49.$read$$iw$$iw$$iw$$iw.(:60)\n$line49.$read$$iw$$iw$$iw.(:62)\n$line49.$read$$iw$$iw.(:64)\n$line49.$read$$iw.(:66)\n$line49.$read.(:68)\n$line49.$read$.(:72)\n$line49.$read$.()\n$line49.$eval$.$print$lzycompute(:7)\n$line49.$eval$.$print(:6)\n$line49.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)","Accumulables":[]},{"Stage ID":1,"Stage Attempt ID":0,"Stage Name":"cache at :41","Number of Tasks":4,"RDD Info":[{"RDD ID":14,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"17\",\"name\":\"Exchange\"}","Callsite":"cache at :41","Parent IDs":[13],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":4,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":10,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"24\",\"name\":\"WholeStageCodegen\"}","Callsite":"cache at :41","Parent IDs":[9],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":4,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":9,"Name":"FileScanRDD","Scope":"{\"id\":\"24\",\"name\":\"WholeStageCodegen\"}","Callsite":"cache at :41","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":4,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":12,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"19\",\"name\":\"WholeStageCodegen\"}","Callsite":"cache at :41","Parent IDs":[11],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":4,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":11,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"23\",\"name\":\"Generate\"}","Callsite":"cache at :41","Parent IDs":[10],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":4,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD 
ID":13,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"18\",\"name\":\"SortAggregate\"}","Callsite":"cache at :41","Parent IDs":[12],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":4,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.Dataset.cache(Dataset.scala:2912)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:41)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:46)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:48)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:50)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:52)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:54)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw.(:56)\n$line49.$read$$iw$$iw$$iw$$iw$$iw.(:58)\n$line49.$read$$iw$$iw$$iw$$iw.(:60)\n$line49.$read$$iw$$iw$$iw.(:62)\n$line49.$read$$iw$$iw.(:64)\n$line49.$read$$iw.(:66)\n$line49.$read.(:68)\n$line49.$read$.(:72)\n$line49.$read$.()\n$line49.$eval$.$print$lzycompute(:7)\n$line49.$eval$.$print(:6)\n$line49.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)","Accumulables":[]},{"Stage ID":2,"Stage Attempt ID":0,"Stage Name":"show at :40","Number of Tasks":1,"RDD Info":[{"RDD ID":26,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"33\",\"name\":\"map\"}","Callsite":"show at :40","Parent IDs":[25],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":25,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"32\",\"name\":\"mapPartitionsInternal\"}","Callsite":"show at :40","Parent IDs":[24],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":8,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"8\",\"name\":\"WholeStageCodegen\"}","Callsite":"cache at :41","Parent IDs":[7],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":24,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"27\",\"name\":\"WholeStageCodegen\"}","Callsite":"show at :40","Parent IDs":[23],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":22,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"31\",\"name\":\"InMemoryTableScan\"}","Callsite":"show at :40","Parent IDs":[20],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":20,"Name":"*(5) Project [appId#0, attemptId#1, name#2, mode#3, completed#4, duration#5L, endTime#6, endTimeEpoch#7L, lastUpdated#8, lastUpdatedEpoch#9L, sparkUser#10, startTime#11, startTimeEpoch#12L, appSparkVersion#13, endDate#28, azkaban.link.workflow.url#159, azkaban.link.execution.url#161, azkaban.link.job.url#163, user.name#165]\n+- SortMergeJoin [appId#0], [appId#137], LeftOuter\n :- *(1) Sort [appId#0 ASC NULLS FIRST], false, 0\n : +- Exchange hashpartitioning(appId#0, 200)\n : +- InMemoryTableScan [appId#0, attemptId#1, name#2, mode#3, completed#4, duration#5L, endTime#6, endTimeEpoch#7L, lastUpdated#8, lastUpdatedEpoch#9L, sparkUser#10, startTime#11, 
startTimeEpoch#12L, appSparkVersion#13, endDate#28]\n : +- InMemoryRelation [appId#0, attemptId#1, name#2, mode#3, completed#4, duration#5L, endTime#6, endTimeEpoch#7L, lastUpdated#8, lastUpdatedEpoch#9L, sparkUser#10, startTime#11, startTimeEpoch#12L, appSparkVersion#13, endDate#28], true, 10000, StorageLevel(disk, memory, deserialized, 1 rep...","Scope":"{\"id\":\"26\",\"name\":\"mapPartitionsInternal\"}","Callsite":"cache at :41","Parent IDs":[19],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":true,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":23,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"31\",\"name\":\"InMemoryTableScan\"}","Callsite":"show at :40","Parent IDs":[22],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":18,"Name":"ZippedPartitionsRDD2","Scope":"{\"id\":\"7\",\"name\":\"SortMergeJoin\"}","Callsite":"cache at :41","Parent IDs":[8,17],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":17,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"13\",\"name\":\"SortAggregate\"}","Callsite":"cache at :41","Parent IDs":[16],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":7,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"11\",\"name\":\"Exchange\"}","Callsite":"cache at :41","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":16,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"14\",\"name\":\"WholeStageCodegen\"}","Callsite":"cache at :41","Parent IDs":[15],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":15,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"17\",\"name\":\"Exchange\"}","Callsite":"cache at :41","Parent IDs":[14],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":19,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"4\",\"name\":\"WholeStageCodegen\"}","Callsite":"cache at :41","Parent IDs":[18],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent 
IDs":[0,1],"Details":"org.apache.spark.sql.Dataset.show(Dataset.scala:691)\n$line50.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:40)\n$line50.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:45)\n$line50.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:47)\n$line50.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:49)\n$line50.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:51)\n$line50.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:53)\n$line50.$read$$iw$$iw$$iw$$iw$$iw$$iw.(:55)\n$line50.$read$$iw$$iw$$iw$$iw$$iw.(:57)\n$line50.$read$$iw$$iw$$iw$$iw.(:59)\n$line50.$read$$iw$$iw$$iw.(:61)\n$line50.$read$$iw$$iw.(:63)\n$line50.$read$$iw.(:65)\n$line50.$read.(:67)\n$line50.$read$.(:71)\n$line50.$read$.()\n$line50.$eval$.$print$lzycompute(:7)\n$line50.$eval$.$print(:6)\n$line50.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)","Accumulables":[]}],"Stage IDs":[0,1,2],"Properties":{"spark.sql.execution.id":"2"}} +{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":0,"Stage Attempt ID":0,"Stage Name":"cache at :41","Number of Tasks":4,"RDD Info":[{"RDD ID":6,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"11\",\"name\":\"Exchange\"}","Callsite":"cache at :41","Parent IDs":[5],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":4,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":0,"Name":"FileScanRDD","Scope":"{\"id\":\"0\",\"name\":\"WholeStageCodegen\"}","Callsite":"cache at :39","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":4,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"*(1) Project [appId#0, attemptId#1, name#2, mode#3, completed#4, duration#5L, endTime#6, endTimeEpoch#7L, lastUpdated#8, lastUpdatedEpoch#9L, sparkUser#10, startTime#11, startTimeEpoch#12L, appSparkVersion#13, cast(endTime#6 as date) AS endDate#28]\n+- *(1) FileScan avro [appId#0,attemptId#1,name#2,mode#3,completed#4,duration#5L,endTime#6,endTimeEpoch#7L,lastUpdated#8,lastUpdatedEpoch#9L,sparkUser#10,startTime#11,startTimeEpoch#12L,appSparkVersion#13] Batched: false, Format: com.databricks.spark.avro.DefaultSource@7006b304, Location: InMemoryFileIndex[hdfs://clusternn01.grid.company.com:9000/data/hadoopdev/sparkmetrics/ltx1-..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct:39","Parent IDs":[1],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":true,"Replication":1},"Number of Partitions":4,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":1,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"0\",\"name\":\"WholeStageCodegen\"}","Callsite":"cache at :39","Parent IDs":[0],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":4,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"12\",\"name\":\"InMemoryTableScan\"}","Callsite":"cache at :41","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":4,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"12\",\"name\":\"InMemoryTableScan\"}","Callsite":"cache at :41","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":4,"Number of Cached Partitions":0,"Memory 
Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.Dataset.cache(Dataset.scala:2912)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:41)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:46)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:48)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:50)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:52)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:54)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw.(:56)\n$line49.$read$$iw$$iw$$iw$$iw$$iw.(:58)\n$line49.$read$$iw$$iw$$iw$$iw.(:60)\n$line49.$read$$iw$$iw$$iw.(:62)\n$line49.$read$$iw$$iw.(:64)\n$line49.$read$$iw.(:66)\n$line49.$read.(:68)\n$line49.$read$.(:72)\n$line49.$read$.()\n$line49.$eval$.$print$lzycompute(:7)\n$line49.$eval$.$print(:6)\n$line49.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)","Submission Time":1524182130229,"Accumulables":[]},"Properties":{"spark.sql.execution.id":"2"}} +{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":1,"Stage Attempt ID":0,"Stage Name":"cache at :41","Number of Tasks":4,"RDD Info":[{"RDD ID":14,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"17\",\"name\":\"Exchange\"}","Callsite":"cache at :41","Parent IDs":[13],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":4,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":10,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"24\",\"name\":\"WholeStageCodegen\"}","Callsite":"cache at :41","Parent IDs":[9],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":4,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":9,"Name":"FileScanRDD","Scope":"{\"id\":\"24\",\"name\":\"WholeStageCodegen\"}","Callsite":"cache at :41","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":4,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":12,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"19\",\"name\":\"WholeStageCodegen\"}","Callsite":"cache at :41","Parent IDs":[11],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":4,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":11,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"23\",\"name\":\"Generate\"}","Callsite":"cache at :41","Parent IDs":[10],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":4,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":13,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"18\",\"name\":\"SortAggregate\"}","Callsite":"cache at :41","Parent IDs":[12],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":4,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent 
IDs":[],"Details":"org.apache.spark.sql.Dataset.cache(Dataset.scala:2912)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:41)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:46)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:48)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:50)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:52)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:54)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw.(:56)\n$line49.$read$$iw$$iw$$iw$$iw$$iw.(:58)\n$line49.$read$$iw$$iw$$iw$$iw.(:60)\n$line49.$read$$iw$$iw$$iw.(:62)\n$line49.$read$$iw$$iw.(:64)\n$line49.$read$$iw.(:66)\n$line49.$read.(:68)\n$line49.$read$.(:72)\n$line49.$read$.()\n$line49.$eval$.$print$lzycompute(:7)\n$line49.$eval$.$print(:6)\n$line49.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)","Submission Time":1524182130328,"Accumulables":[]},"Properties":{"spark.sql.execution.id":"2"}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":0,"Index":0,"Attempt":0,"Launch Time":1524182130331,"Executor ID":"2","Host":"node4045.grid.company.com","Locality":"ANY","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":1,"Index":1,"Attempt":0,"Launch Time":1524182130349,"Executor ID":"3","Host":"node0998.grid.company.com","Locality":"ANY","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":2,"Index":2,"Attempt":0,"Launch Time":1524182130350,"Executor ID":"4","Host":"node4243.grid.company.com","Locality":"ANY","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":3,"Index":3,"Attempt":0,"Launch Time":1524182130350,"Executor ID":"1","Host":"node1404.grid.company.com","Locality":"ANY","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":4,"Index":0,"Attempt":0,"Launch Time":1524182142251,"Executor ID":"1","Host":"node1404.grid.company.com","Locality":"ANY","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":3,"Index":3,"Attempt":0,"Launch Time":1524182130350,"Executor ID":"1","Host":"node1404.grid.company.com","Locality":"ANY","Speculative":false,"Getting Result Time":0,"Finish Time":1524182142286,"Failed":false,"Killed":false,"Accumulables":[{"ID":7,"Name":"data size total (min, med, max)","Update":"154334487","Value":"154334486","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":16,"Name":"number of output rows","Update":"466636","Value":"466636","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1,"Name":"number of output rows","Update":"466636","Value":"466636","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":5,"Name":"duration total (min, med, max)","Update":"19666","Value":"19665","Internal":true,"Count Failed 
Values":true,"Metadata":"sql"},{"ID":59,"Name":"internal.metrics.input.recordsRead","Update":466636,"Value":466636,"Internal":true,"Count Failed Values":true},{"ID":58,"Name":"internal.metrics.input.bytesRead","Update":37809697,"Value":37809697,"Internal":true,"Count Failed Values":true},{"ID":57,"Name":"internal.metrics.shuffle.write.writeTime","Update":91545212,"Value":91545212,"Internal":true,"Count Failed Values":true},{"ID":56,"Name":"internal.metrics.shuffle.write.recordsWritten","Update":466636,"Value":466636,"Internal":true,"Count Failed Values":true},{"ID":55,"Name":"internal.metrics.shuffle.write.bytesWritten","Update":20002743,"Value":20002743,"Internal":true,"Count Failed Values":true},{"ID":43,"Name":"internal.metrics.resultSerializationTime","Update":2,"Value":2,"Internal":true,"Count Failed Values":true},{"ID":42,"Name":"internal.metrics.jvmGCTime","Update":407,"Value":407,"Internal":true,"Count Failed Values":true},{"ID":41,"Name":"internal.metrics.resultSize","Update":1856,"Value":1856,"Internal":true,"Count Failed Values":true},{"ID":40,"Name":"internal.metrics.executorCpuTime","Update":9020410971,"Value":9020410971,"Internal":true,"Count Failed Values":true},{"ID":39,"Name":"internal.metrics.executorRunTime","Update":11146,"Value":11146,"Internal":true,"Count Failed Values":true},{"ID":38,"Name":"internal.metrics.executorDeserializeCpuTime","Update":574344183,"Value":574344183,"Internal":true,"Count Failed Values":true},{"ID":37,"Name":"internal.metrics.executorDeserializeTime","Update":714,"Value":714,"Internal":true,"Count Failed Values":true}]},"Task Metrics":{"Executor Deserialize Time":714,"Executor Deserialize CPU Time":574344183,"Executor Run Time":11146,"Executor CPU Time":9020410971,"Result Size":1856,"JVM GC Time":407,"Result Serialization Time":2,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":20002743,"Shuffle Write Time":91545212,"Shuffle Records Written":466636},"Input Metrics":{"Bytes Read":37809697,"Records Read":466636},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":5,"Index":1,"Attempt":0,"Launch Time":1524182142997,"Executor ID":"4","Host":"node4243.grid.company.com","Locality":"ANY","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":2,"Index":2,"Attempt":0,"Launch Time":1524182130350,"Executor ID":"4","Host":"node4243.grid.company.com","Locality":"ANY","Speculative":false,"Getting Result Time":0,"Finish Time":1524182143009,"Failed":false,"Killed":false,"Accumulables":[{"ID":7,"Name":"data size total (min, med, max)","Update":"206421303","Value":"360755789","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":16,"Name":"number of output rows","Update":"624246","Value":"1090882","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1,"Name":"number of output rows","Update":"624246","Value":"1090882","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":5,"Name":"duration total (min, med, 
max)","Update":"20604","Value":"40269","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":59,"Name":"internal.metrics.input.recordsRead","Update":624246,"Value":1090882,"Internal":true,"Count Failed Values":true},{"ID":58,"Name":"internal.metrics.input.bytesRead","Update":50423609,"Value":88233306,"Internal":true,"Count Failed Values":true},{"ID":57,"Name":"internal.metrics.shuffle.write.writeTime","Update":104125550,"Value":195670762,"Internal":true,"Count Failed Values":true},{"ID":56,"Name":"internal.metrics.shuffle.write.recordsWritten","Update":624246,"Value":1090882,"Internal":true,"Count Failed Values":true},{"ID":55,"Name":"internal.metrics.shuffle.write.bytesWritten","Update":26424033,"Value":46426776,"Internal":true,"Count Failed Values":true},{"ID":43,"Name":"internal.metrics.resultSerializationTime","Update":1,"Value":3,"Internal":true,"Count Failed Values":true},{"ID":42,"Name":"internal.metrics.jvmGCTime","Update":374,"Value":781,"Internal":true,"Count Failed Values":true},{"ID":41,"Name":"internal.metrics.resultSize","Update":1856,"Value":3712,"Internal":true,"Count Failed Values":true},{"ID":40,"Name":"internal.metrics.executorCpuTime","Update":11039226628,"Value":20059637599,"Internal":true,"Count Failed Values":true},{"ID":39,"Name":"internal.metrics.executorRunTime","Update":11978,"Value":23124,"Internal":true,"Count Failed Values":true},{"ID":38,"Name":"internal.metrics.executorDeserializeCpuTime","Update":526915936,"Value":1101260119,"Internal":true,"Count Failed Values":true},{"ID":37,"Name":"internal.metrics.executorDeserializeTime","Update":622,"Value":1336,"Internal":true,"Count Failed Values":true}]},"Task Metrics":{"Executor Deserialize Time":622,"Executor Deserialize CPU Time":526915936,"Executor Run Time":11978,"Executor CPU Time":11039226628,"Result Size":1856,"JVM GC Time":374,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":26424033,"Shuffle Write Time":104125550,"Shuffle Records Written":624246},"Input Metrics":{"Bytes Read":50423609,"Records Read":624246},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} +{"Event":"SparkListenerExecutorAdded","Timestamp":1524182143160,"Executor ID":"5","Executor Info":{"Host":"node2477.grid.company.com","Total Cores":1,"Log Urls":{"stdout":"http://node2477.grid.company.com:8042/node/containerlogs/container_e05_1523494505172_1552404_01_000007/edlu/stdout?start=-4096","stderr":"http://node2477.grid.company.com:8042/node/containerlogs/container_e05_1523494505172_1552404_01_000007/edlu/stderr?start=-4096"}}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":6,"Index":2,"Attempt":0,"Launch Time":1524182143166,"Executor ID":"5","Host":"node2477.grid.company.com","Locality":"ANY","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} +{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"5","Host":"node2477.grid.company.com","Port":20123},"Maximum Memory":956615884,"Timestamp":1524182143406,"Maximum Onheap Memory":956615884,"Maximum Offheap Memory":0} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":7,"Index":3,"Attempt":0,"Launch Time":1524182144237,"Executor 
ID":"1","Host":"node1404.grid.company.com","Locality":"ANY","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":4,"Index":0,"Attempt":0,"Launch Time":1524182142251,"Executor ID":"1","Host":"node1404.grid.company.com","Locality":"ANY","Speculative":false,"Getting Result Time":0,"Finish Time":1524182144246,"Failed":false,"Killed":false,"Accumulables":[{"ID":8,"Name":"data size total (min, med, max)","Update":"1920975","Value":"1920974","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":23,"Name":"number of output rows","Update":"3562","Value":"3562","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":25,"Name":"peak memory total (min, med, max)","Update":"41943039","Value":"41943038","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":24,"Name":"sort time total (min, med, max)","Update":"38","Value":"37","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":27,"Name":"duration total (min, med, max)","Update":"1813","Value":"1812","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":28,"Name":"number of output rows","Update":"195602","Value":"195602","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":29,"Name":"number of output rows","Update":"3563","Value":"3563","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":33,"Name":"duration total (min, med, max)","Update":"1558","Value":"1557","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":84,"Name":"internal.metrics.input.recordsRead","Update":3563,"Value":3563,"Internal":true,"Count Failed Values":true},{"ID":83,"Name":"internal.metrics.input.bytesRead","Update":36845111,"Value":36845111,"Internal":true,"Count Failed Values":true},{"ID":82,"Name":"internal.metrics.shuffle.write.writeTime","Update":27318908,"Value":27318908,"Internal":true,"Count Failed Values":true},{"ID":81,"Name":"internal.metrics.shuffle.write.recordsWritten","Update":3562,"Value":3562,"Internal":true,"Count Failed Values":true},{"ID":80,"Name":"internal.metrics.shuffle.write.bytesWritten","Update":349287,"Value":349287,"Internal":true,"Count Failed Values":true},{"ID":71,"Name":"internal.metrics.peakExecutionMemory","Update":41943040,"Value":41943040,"Internal":true,"Count Failed Values":true},{"ID":67,"Name":"internal.metrics.jvmGCTime","Update":33,"Value":33,"Internal":true,"Count Failed Values":true},{"ID":66,"Name":"internal.metrics.resultSize","Update":2394,"Value":2394,"Internal":true,"Count Failed Values":true},{"ID":65,"Name":"internal.metrics.executorCpuTime","Update":1498974375,"Value":1498974375,"Internal":true,"Count Failed Values":true},{"ID":64,"Name":"internal.metrics.executorRunTime","Update":1922,"Value":1922,"Internal":true,"Count Failed Values":true},{"ID":63,"Name":"internal.metrics.executorDeserializeCpuTime","Update":49547405,"Value":49547405,"Internal":true,"Count Failed Values":true},{"ID":62,"Name":"internal.metrics.executorDeserializeTime","Update":56,"Value":56,"Internal":true,"Count Failed Values":true}]},"Task Metrics":{"Executor Deserialize Time":56,"Executor Deserialize CPU Time":49547405,"Executor Run Time":1922,"Executor CPU Time":1498974375,"Result Size":2394,"JVM GC Time":33,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local 
Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":349287,"Shuffle Write Time":27318908,"Shuffle Records Written":3562},"Input Metrics":{"Bytes Read":36845111,"Records Read":3563},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":0,"Index":0,"Attempt":0,"Launch Time":1524182130331,"Executor ID":"2","Host":"node4045.grid.company.com","Locality":"ANY","Speculative":false,"Getting Result Time":0,"Finish Time":1524182144444,"Failed":false,"Killed":false,"Accumulables":[{"ID":7,"Name":"data size total (min, med, max)","Update":"204058975","Value":"564814764","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":16,"Name":"number of output rows","Update":"616897","Value":"1707779","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1,"Name":"number of output rows","Update":"616897","Value":"1707779","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":5,"Name":"duration total (min, med, max)","Update":"23365","Value":"63634","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":59,"Name":"internal.metrics.input.recordsRead","Update":616897,"Value":1707779,"Internal":true,"Count Failed Values":true},{"ID":58,"Name":"internal.metrics.input.bytesRead","Update":50423423,"Value":138656729,"Internal":true,"Count Failed Values":true},{"ID":57,"Name":"internal.metrics.shuffle.write.writeTime","Update":105575962,"Value":301246724,"Internal":true,"Count Failed Values":true},{"ID":56,"Name":"internal.metrics.shuffle.write.recordsWritten","Update":616897,"Value":1707779,"Internal":true,"Count Failed Values":true},{"ID":55,"Name":"internal.metrics.shuffle.write.bytesWritten","Update":22950296,"Value":69377072,"Internal":true,"Count Failed Values":true},{"ID":43,"Name":"internal.metrics.resultSerializationTime","Update":2,"Value":5,"Internal":true,"Count Failed Values":true},{"ID":42,"Name":"internal.metrics.jvmGCTime","Update":326,"Value":1107,"Internal":true,"Count Failed Values":true},{"ID":41,"Name":"internal.metrics.resultSize","Update":1856,"Value":5568,"Internal":true,"Count Failed Values":true},{"ID":40,"Name":"internal.metrics.executorCpuTime","Update":11931694025,"Value":31991331624,"Internal":true,"Count Failed Values":true},{"ID":39,"Name":"internal.metrics.executorRunTime","Update":13454,"Value":36578,"Internal":true,"Count Failed Values":true},{"ID":38,"Name":"internal.metrics.executorDeserializeCpuTime","Update":531799977,"Value":1633060096,"Internal":true,"Count Failed Values":true},{"ID":37,"Name":"internal.metrics.executorDeserializeTime","Update":594,"Value":1930,"Internal":true,"Count Failed Values":true}]},"Task Metrics":{"Executor Deserialize Time":594,"Executor Deserialize CPU Time":531799977,"Executor Run Time":13454,"Executor CPU Time":11931694025,"Result Size":1856,"JVM GC Time":326,"Result Serialization Time":2,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":22950296,"Shuffle Write Time":105575962,"Shuffle Records Written":616897},"Input Metrics":{"Bytes Read":50423423,"Records 
Read":616897},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":1,"Index":1,"Attempt":0,"Launch Time":1524182130349,"Executor ID":"3","Host":"node0998.grid.company.com","Locality":"ANY","Speculative":false,"Getting Result Time":0,"Finish Time":1524182144840,"Failed":false,"Killed":false,"Accumulables":[{"ID":7,"Name":"data size total (min, med, max)","Update":"207338935","Value":"772153699","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":16,"Name":"number of output rows","Update":"626277","Value":"2334056","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1,"Name":"number of output rows","Update":"626277","Value":"2334056","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":5,"Name":"duration total (min, med, max)","Update":"24254","Value":"87888","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":59,"Name":"internal.metrics.input.recordsRead","Update":626277,"Value":2334056,"Internal":true,"Count Failed Values":true},{"ID":58,"Name":"internal.metrics.input.bytesRead","Update":50409514,"Value":189066243,"Internal":true,"Count Failed Values":true},{"ID":57,"Name":"internal.metrics.shuffle.write.writeTime","Update":106963069,"Value":408209793,"Internal":true,"Count Failed Values":true},{"ID":56,"Name":"internal.metrics.shuffle.write.recordsWritten","Update":626277,"Value":2334056,"Internal":true,"Count Failed Values":true},{"ID":55,"Name":"internal.metrics.shuffle.write.bytesWritten","Update":31362123,"Value":100739195,"Internal":true,"Count Failed Values":true},{"ID":43,"Name":"internal.metrics.resultSerializationTime","Update":2,"Value":7,"Internal":true,"Count Failed Values":true},{"ID":42,"Name":"internal.metrics.jvmGCTime","Update":342,"Value":1449,"Internal":true,"Count Failed Values":true},{"ID":41,"Name":"internal.metrics.resultSize","Update":1856,"Value":7424,"Internal":true,"Count Failed Values":true},{"ID":40,"Name":"internal.metrics.executorCpuTime","Update":12267596062,"Value":44258927686,"Internal":true,"Count Failed Values":true},{"ID":39,"Name":"internal.metrics.executorRunTime","Update":13858,"Value":50436,"Internal":true,"Count Failed Values":true},{"ID":38,"Name":"internal.metrics.executorDeserializeCpuTime","Update":519573839,"Value":2152633935,"Internal":true,"Count Failed Values":true},{"ID":37,"Name":"internal.metrics.executorDeserializeTime","Update":573,"Value":2503,"Internal":true,"Count Failed Values":true}]},"Task Metrics":{"Executor Deserialize Time":573,"Executor Deserialize CPU Time":519573839,"Executor Run Time":13858,"Executor CPU Time":12267596062,"Result Size":1856,"JVM GC Time":342,"Result Serialization Time":2,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":31362123,"Shuffle Write Time":106963069,"Shuffle Records Written":626277},"Input Metrics":{"Bytes Read":50409514,"Records Read":626277},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} +{"Event":"SparkListenerStageExecutorMetrics","Executor ID":"driver","Stage ID":0,"Stage Attempt ID":0,"Executor 
Metrics":{"JVMHeapMemory":592412824,"JVMOffHeapMemory":202907152,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":905801,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":905801,"OffHeapUnifiedMemory":0,"DirectPoolMemory":355389,"MappedPoolMemory":0}} +{"Event":"SparkListenerStageExecutorMetrics","Executor ID":"2","Stage ID":0,"Stage Attempt ID":0,"Executor Metrics":{"JVMHeapMemory":523121272,"JVMOffHeapMemory":88280720,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":52050147,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":52050147,"OffHeapUnifiedMemory":0,"DirectPoolMemory":87796,"MappedPoolMemory":0}} +{"Event":"SparkListenerStageExecutorMetrics","Executor ID":"1","Stage ID":0,"Stage Attempt ID":0,"Executor Metrics":{"JVMHeapMemory":214174608,"JVMOffHeapMemory":91548704,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":47399168,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":47399168,"OffHeapUnifiedMemory":0,"DirectPoolMemory":87796,"MappedPoolMemory":0}} +{"Event":"SparkListenerStageExecutorMetrics","Executor ID":"4","Stage ID":0,"Stage Attempt ID":0,"Executor Metrics":{"JVMHeapMemory":518613056,"JVMOffHeapMemory":95657456,"OnHeapExecutionMemory":37748736,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":63104457,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":100853193,"OffHeapUnifiedMemory":0,"DirectPoolMemory":126261,"MappedPoolMemory":0}} +{"Event":"SparkListenerStageExecutorMetrics","Executor ID":"3","Stage ID":0,"Stage Attempt ID":0,"Executor Metrics":{"JVMHeapMemory":726805712,"JVMOffHeapMemory":90709624,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":69535048,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":69535048,"OffHeapUnifiedMemory":0,"DirectPoolMemory":87796,"MappedPoolMemory":0}} +{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":0,"Stage Attempt ID":0,"Stage Name":"cache at :41","Number of Tasks":4,"RDD Info":[{"RDD ID":6,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"11\",\"name\":\"Exchange\"}","Callsite":"cache at :41","Parent IDs":[5],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":4,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":0,"Name":"FileScanRDD","Scope":"{\"id\":\"0\",\"name\":\"WholeStageCodegen\"}","Callsite":"cache at :39","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":4,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"*(1) Project [appId#0, attemptId#1, name#2, mode#3, completed#4, duration#5L, endTime#6, endTimeEpoch#7L, lastUpdated#8, lastUpdatedEpoch#9L, sparkUser#10, startTime#11, startTimeEpoch#12L, appSparkVersion#13, cast(endTime#6 as date) AS endDate#28]\n+- *(1) FileScan avro [appId#0,attemptId#1,name#2,mode#3,completed#4,duration#5L,endTime#6,endTimeEpoch#7L,lastUpdated#8,lastUpdatedEpoch#9L,sparkUser#10,startTime#11,startTimeEpoch#12L,appSparkVersion#13] Batched: false, Format: com.databricks.spark.avro.DefaultSource@7006b304, Location: InMemoryFileIndex[hdfs://clusternn01.grid.company.com:9000/data/hadoopdev/sparkmetrics/ltx1-..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct:39","Parent IDs":[1],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":true,"Replication":1},"Number of Partitions":4,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD 
ID":1,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"0\",\"name\":\"WholeStageCodegen\"}","Callsite":"cache at :39","Parent IDs":[0],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":4,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"12\",\"name\":\"InMemoryTableScan\"}","Callsite":"cache at :41","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":4,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"12\",\"name\":\"InMemoryTableScan\"}","Callsite":"cache at :41","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":4,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.Dataset.cache(Dataset.scala:2912)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:41)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:46)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:48)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:50)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:52)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:54)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw.(:56)\n$line49.$read$$iw$$iw$$iw$$iw$$iw.(:58)\n$line49.$read$$iw$$iw$$iw$$iw.(:60)\n$line49.$read$$iw$$iw$$iw.(:62)\n$line49.$read$$iw$$iw.(:64)\n$line49.$read$$iw.(:66)\n$line49.$read.(:68)\n$line49.$read$.(:72)\n$line49.$read$.()\n$line49.$eval$.$print$lzycompute(:7)\n$line49.$eval$.$print(:6)\n$line49.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)","Submission Time":1524182130229,"Completion Time":1524182144852,"Accumulables":[{"ID":41,"Name":"internal.metrics.resultSize","Value":7424,"Internal":true,"Count Failed Values":true},{"ID":59,"Name":"internal.metrics.input.recordsRead","Value":2334056,"Internal":true,"Count Failed Values":true},{"ID":38,"Name":"internal.metrics.executorDeserializeCpuTime","Value":2152633935,"Internal":true,"Count Failed Values":true},{"ID":56,"Name":"internal.metrics.shuffle.write.recordsWritten","Value":2334056,"Internal":true,"Count Failed Values":true},{"ID":5,"Name":"duration total (min, med, max)","Value":"87888","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":55,"Name":"internal.metrics.shuffle.write.bytesWritten","Value":100739195,"Internal":true,"Count Failed Values":true},{"ID":40,"Name":"internal.metrics.executorCpuTime","Value":44258927686,"Internal":true,"Count Failed Values":true},{"ID":58,"Name":"internal.metrics.input.bytesRead","Value":189066243,"Internal":true,"Count Failed Values":true},{"ID":7,"Name":"data size total (min, med, max)","Value":"772153699","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":16,"Name":"number of output rows","Value":"2334056","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":43,"Name":"internal.metrics.resultSerializationTime","Value":7,"Internal":true,"Count Failed Values":true},{"ID":1,"Name":"number of output rows","Value":"2334056","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":37,"Name":"internal.metrics.executorDeserializeTime","Value":2503,"Internal":true,"Count Failed Values":true},{"ID":57,"Name":"internal.metrics.shuffle.write.writeTime","Value":408209793,"Internal":true,"Count Failed 
Values":true},{"ID":39,"Name":"internal.metrics.executorRunTime","Value":50436,"Internal":true,"Count Failed Values":true},{"ID":42,"Name":"internal.metrics.jvmGCTime","Value":1449,"Internal":true,"Count Failed Values":true}]}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":5,"Index":1,"Attempt":0,"Launch Time":1524182142997,"Executor ID":"4","Host":"node4243.grid.company.com","Locality":"ANY","Speculative":false,"Getting Result Time":0,"Finish Time":1524182145327,"Failed":false,"Killed":false,"Accumulables":[{"ID":8,"Name":"data size total (min, med, max)","Update":"1953295","Value":"3874269","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":23,"Name":"number of output rows","Update":"3575","Value":"7137","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":25,"Name":"peak memory total (min, med, max)","Update":"41943039","Value":"83886077","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":24,"Name":"sort time total (min, med, max)","Update":"49","Value":"86","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":27,"Name":"duration total (min, med, max)","Update":"2002","Value":"3814","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":28,"Name":"number of output rows","Update":"196587","Value":"392189","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":29,"Name":"number of output rows","Update":"3575","Value":"7138","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":33,"Name":"duration total (min, med, max)","Update":"1755","Value":"3312","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":84,"Name":"internal.metrics.input.recordsRead","Update":3575,"Value":7138,"Internal":true,"Count Failed Values":true},{"ID":83,"Name":"internal.metrics.input.bytesRead","Update":36849246,"Value":73694357,"Internal":true,"Count Failed Values":true},{"ID":82,"Name":"internal.metrics.shuffle.write.writeTime","Update":32035583,"Value":59354491,"Internal":true,"Count Failed Values":true},{"ID":81,"Name":"internal.metrics.shuffle.write.recordsWritten","Update":3575,"Value":7137,"Internal":true,"Count Failed Values":true},{"ID":80,"Name":"internal.metrics.shuffle.write.bytesWritten","Update":349006,"Value":698293,"Internal":true,"Count Failed Values":true},{"ID":71,"Name":"internal.metrics.peakExecutionMemory","Update":41943040,"Value":83886080,"Internal":true,"Count Failed Values":true},{"ID":67,"Name":"internal.metrics.jvmGCTime","Update":31,"Value":64,"Internal":true,"Count Failed Values":true},{"ID":66,"Name":"internal.metrics.resultSize","Update":2394,"Value":4788,"Internal":true,"Count Failed Values":true},{"ID":65,"Name":"internal.metrics.executorCpuTime","Update":1785119941,"Value":3284094316,"Internal":true,"Count Failed Values":true},{"ID":64,"Name":"internal.metrics.executorRunTime","Update":2182,"Value":4104,"Internal":true,"Count Failed Values":true},{"ID":63,"Name":"internal.metrics.executorDeserializeCpuTime","Update":71500541,"Value":121047946,"Internal":true,"Count Failed Values":true},{"ID":62,"Name":"internal.metrics.executorDeserializeTime","Update":136,"Value":192,"Internal":true,"Count Failed Values":true}]},"Task Metrics":{"Executor Deserialize Time":136,"Executor Deserialize CPU Time":71500541,"Executor Run Time":2182,"Executor CPU Time":1785119941,"Result Size":2394,"JVM GC Time":31,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes 
Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":349006,"Shuffle Write Time":32035583,"Shuffle Records Written":3575},"Input Metrics":{"Bytes Read":36849246,"Records Read":3575},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":7,"Index":3,"Attempt":0,"Launch Time":1524182144237,"Executor ID":"1","Host":"node1404.grid.company.com","Locality":"ANY","Speculative":false,"Getting Result Time":0,"Finish Time":1524182145971,"Failed":false,"Killed":false,"Accumulables":[{"ID":8,"Name":"data size total (min, med, max)","Update":"1337999","Value":"5212268","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":23,"Name":"number of output rows","Update":"2435","Value":"9572","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":25,"Name":"peak memory total (min, med, max)","Update":"37748735","Value":"121634812","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":24,"Name":"sort time total (min, med, max)","Update":"9","Value":"95","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":27,"Name":"duration total (min, med, max)","Update":"1703","Value":"5517","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":28,"Name":"number of output rows","Update":"133759","Value":"525948","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":29,"Name":"number of output rows","Update":"2435","Value":"9573","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":33,"Name":"duration total (min, med, max)","Update":"1609","Value":"4921","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":84,"Name":"internal.metrics.input.recordsRead","Update":2435,"Value":9573,"Internal":true,"Count Failed Values":true},{"ID":83,"Name":"internal.metrics.input.bytesRead","Update":24250210,"Value":97944567,"Internal":true,"Count Failed Values":true},{"ID":82,"Name":"internal.metrics.shuffle.write.writeTime","Update":20055909,"Value":79410400,"Internal":true,"Count Failed Values":true},{"ID":81,"Name":"internal.metrics.shuffle.write.recordsWritten","Update":2435,"Value":9572,"Internal":true,"Count Failed Values":true},{"ID":80,"Name":"internal.metrics.shuffle.write.bytesWritten","Update":242714,"Value":941007,"Internal":true,"Count Failed Values":true},{"ID":71,"Name":"internal.metrics.peakExecutionMemory","Update":37748736,"Value":121634816,"Internal":true,"Count Failed Values":true},{"ID":67,"Name":"internal.metrics.jvmGCTime","Update":31,"Value":95,"Internal":true,"Count Failed Values":true},{"ID":66,"Name":"internal.metrics.resultSize","Update":2394,"Value":7182,"Internal":true,"Count Failed Values":true},{"ID":65,"Name":"internal.metrics.executorCpuTime","Update":896878991,"Value":4180973307,"Internal":true,"Count Failed Values":true},{"ID":64,"Name":"internal.metrics.executorRunTime","Update":1722,"Value":5826,"Internal":true,"Count Failed Values":true},{"ID":63,"Name":"internal.metrics.executorDeserializeCpuTime","Update":2787355,"Value":123835301,"Internal":true,"Count Failed Values":true},{"ID":62,"Name":"internal.metrics.executorDeserializeTime","Update":3,"Value":195,"Internal":true,"Count Failed Values":true}]},"Task Metrics":{"Executor 
Deserialize Time":3,"Executor Deserialize CPU Time":2787355,"Executor Run Time":1722,"Executor CPU Time":896878991,"Result Size":2394,"JVM GC Time":31,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":242714,"Shuffle Write Time":20055909,"Shuffle Records Written":2435},"Input Metrics":{"Bytes Read":24250210,"Records Read":2435},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} +{"Event":"SparkListenerExecutorAdded","Timestamp":1524182147549,"Executor ID":"6","Executor Info":{"Host":"node6644.grid.company.com","Total Cores":1,"Log Urls":{"stdout":"http://node6644.grid.company.com:8042/node/containerlogs/container_e05_1523494505172_1552404_01_000008/edlu/stdout?start=-4096","stderr":"http://node6644.grid.company.com:8042/node/containerlogs/container_e05_1523494505172_1552404_01_000008/edlu/stderr?start=-4096"}}} +{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"6","Host":"node6644.grid.company.com","Port":8445},"Maximum Memory":956615884,"Timestamp":1524182147706,"Maximum Onheap Memory":956615884,"Maximum Offheap Memory":0} +{"Event":"SparkListenerExecutorAdded","Timestamp":1524182149826,"Executor ID":"7","Executor Info":{"Host":"node6340.grid.company.com","Total Cores":1,"Log Urls":{"stdout":"http://node6340.grid.company.com:8042/node/containerlogs/container_e05_1523494505172_1552404_01_000009/edlu/stdout?start=-4096","stderr":"http://node6340.grid.company.com:8042/node/containerlogs/container_e05_1523494505172_1552404_01_000009/edlu/stderr?start=-4096"}}} +{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"7","Host":"node6340.grid.company.com","Port":5933},"Maximum Memory":956615884,"Timestamp":1524182149983,"Maximum Onheap Memory":956615884,"Maximum Offheap Memory":0} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":6,"Index":2,"Attempt":0,"Launch Time":1524182143166,"Executor ID":"5","Host":"node2477.grid.company.com","Locality":"ANY","Speculative":false,"Getting Result Time":0,"Finish Time":1524182152418,"Failed":false,"Killed":false,"Accumulables":[{"ID":8,"Name":"data size total (min, med, max)","Update":"1910103","Value":"7122371","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":23,"Name":"number of output rows","Update":"3541","Value":"13113","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":25,"Name":"peak memory total (min, med, max)","Update":"41943039","Value":"163577851","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":24,"Name":"sort time total (min, med, max)","Update":"48","Value":"143","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":27,"Name":"duration total (min, med, max)","Update":"6093","Value":"11610","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":28,"Name":"number of output rows","Update":"194553","Value":"720501","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":29,"Name":"number of output rows","Update":"3541","Value":"13114","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":33,"Name":"duration total (min, med, max)","Update":"5951","Value":"10872","Internal":true,"Count Failed 
Values":true,"Metadata":"sql"},{"ID":84,"Name":"internal.metrics.input.recordsRead","Update":3541,"Value":13114,"Internal":true,"Count Failed Values":true},{"ID":83,"Name":"internal.metrics.input.bytesRead","Update":36838295,"Value":134782862,"Internal":true,"Count Failed Values":true},{"ID":82,"Name":"internal.metrics.shuffle.write.writeTime","Update":49790497,"Value":129200897,"Internal":true,"Count Failed Values":true},{"ID":81,"Name":"internal.metrics.shuffle.write.recordsWritten","Update":3541,"Value":13113,"Internal":true,"Count Failed Values":true},{"ID":80,"Name":"internal.metrics.shuffle.write.bytesWritten","Update":355051,"Value":1296058,"Internal":true,"Count Failed Values":true},{"ID":71,"Name":"internal.metrics.peakExecutionMemory","Update":41943040,"Value":163577856,"Internal":true,"Count Failed Values":true},{"ID":68,"Name":"internal.metrics.resultSerializationTime","Update":2,"Value":2,"Internal":true,"Count Failed Values":true},{"ID":67,"Name":"internal.metrics.jvmGCTime","Update":920,"Value":1015,"Internal":true,"Count Failed Values":true},{"ID":66,"Name":"internal.metrics.resultSize","Update":2437,"Value":9619,"Internal":true,"Count Failed Values":true},{"ID":65,"Name":"internal.metrics.executorCpuTime","Update":5299274511,"Value":9480247818,"Internal":true,"Count Failed Values":true},{"ID":64,"Name":"internal.metrics.executorRunTime","Update":7847,"Value":13673,"Internal":true,"Count Failed Values":true},{"ID":63,"Name":"internal.metrics.executorDeserializeCpuTime","Update":687811857,"Value":811647158,"Internal":true,"Count Failed Values":true},{"ID":62,"Name":"internal.metrics.executorDeserializeTime","Update":1037,"Value":1232,"Internal":true,"Count Failed Values":true}]},"Task Metrics":{"Executor Deserialize Time":1037,"Executor Deserialize CPU Time":687811857,"Executor Run Time":7847,"Executor CPU Time":5299274511,"Result Size":2437,"JVM GC Time":920,"Result Serialization Time":2,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":355051,"Shuffle Write Time":49790497,"Shuffle Records Written":3541},"Input Metrics":{"Bytes Read":36838295,"Records Read":3541},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} +{"Event":"SparkListenerStageExecutorMetrics","Executor ID":"driver","Stage ID":1,"Stage Attempt ID":0,"Executor Metrics":{"JVMHeapMemory":629553808,"JVMOffHeapMemory":205304696,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":905801,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":905801,"OffHeapUnifiedMemory":0,"DirectPoolMemory":397602,"MappedPoolMemory":0}} +{"Event":"SparkListenerStageExecutorMetrics","Executor ID":"2","Stage ID":1,"Stage Attempt ID":0,"Executor Metrics":{"JVMHeapMemory":595946552,"JVMOffHeapMemory":91208368,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":58468944,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":58468944,"OffHeapUnifiedMemory":0,"DirectPoolMemory":87796,"MappedPoolMemory":0}} +{"Event":"SparkListenerStageExecutorMetrics","Executor ID":"1","Stage ID":1,"Stage Attempt ID":0,"Executor 
Metrics":{"JVMHeapMemory":755008624,"JVMOffHeapMemory":100519936,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":47962185,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":47962185,"OffHeapUnifiedMemory":0,"DirectPoolMemory":98230,"MappedPoolMemory":0}} +{"Event":"SparkListenerStageExecutorMetrics","Executor ID":"4","Stage ID":1,"Stage Attempt ID":0,"Executor Metrics":{"JVMHeapMemory":518613056,"JVMOffHeapMemory":95657456,"OnHeapExecutionMemory":37748736,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":63104457,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":100853193,"OffHeapUnifiedMemory":0,"DirectPoolMemory":126261,"MappedPoolMemory":0}} +{"Event":"SparkListenerStageExecutorMetrics","Executor ID":"3","Stage ID":1,"Stage Attempt ID":0,"Executor Metrics":{"JVMHeapMemory":726805712,"JVMOffHeapMemory":90709624,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":69535048,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":69535048,"OffHeapUnifiedMemory":0,"DirectPoolMemory":87796,"MappedPoolMemory":0}} +{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":1,"Stage Attempt ID":0,"Stage Name":"cache at :41","Number of Tasks":4,"RDD Info":[{"RDD ID":14,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"17\",\"name\":\"Exchange\"}","Callsite":"cache at :41","Parent IDs":[13],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":4,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":10,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"24\",\"name\":\"WholeStageCodegen\"}","Callsite":"cache at :41","Parent IDs":[9],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":4,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":9,"Name":"FileScanRDD","Scope":"{\"id\":\"24\",\"name\":\"WholeStageCodegen\"}","Callsite":"cache at :41","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":4,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":12,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"19\",\"name\":\"WholeStageCodegen\"}","Callsite":"cache at :41","Parent IDs":[11],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":4,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":11,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"23\",\"name\":\"Generate\"}","Callsite":"cache at :41","Parent IDs":[10],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":4,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":13,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"18\",\"name\":\"SortAggregate\"}","Callsite":"cache at :41","Parent IDs":[12],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":4,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent 
IDs":[],"Details":"org.apache.spark.sql.Dataset.cache(Dataset.scala:2912)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:41)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:46)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:48)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:50)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:52)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:54)\n$line49.$read$$iw$$iw$$iw$$iw$$iw$$iw.(:56)\n$line49.$read$$iw$$iw$$iw$$iw$$iw.(:58)\n$line49.$read$$iw$$iw$$iw$$iw.(:60)\n$line49.$read$$iw$$iw$$iw.(:62)\n$line49.$read$$iw$$iw.(:64)\n$line49.$read$$iw.(:66)\n$line49.$read.(:68)\n$line49.$read$.(:72)\n$line49.$read$.()\n$line49.$eval$.$print$lzycompute(:7)\n$line49.$eval$.$print(:6)\n$line49.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)","Submission Time":1524182130328,"Completion Time":1524182152419,"Accumulables":[{"ID":83,"Name":"internal.metrics.input.bytesRead","Value":134782862,"Internal":true,"Count Failed Values":true},{"ID":23,"Name":"number of output rows","Value":"13113","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":68,"Name":"internal.metrics.resultSerializationTime","Value":2,"Internal":true,"Count Failed Values":true},{"ID":8,"Name":"data size total (min, med, max)","Value":"7122371","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":62,"Name":"internal.metrics.executorDeserializeTime","Value":1232,"Internal":true,"Count Failed Values":true},{"ID":80,"Name":"internal.metrics.shuffle.write.bytesWritten","Value":1296058,"Internal":true,"Count Failed Values":true},{"ID":71,"Name":"internal.metrics.peakExecutionMemory","Value":163577856,"Internal":true,"Count Failed Values":true},{"ID":29,"Name":"number of output rows","Value":"13114","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":65,"Name":"internal.metrics.executorCpuTime","Value":9480247818,"Internal":true,"Count Failed Values":true},{"ID":64,"Name":"internal.metrics.executorRunTime","Value":13673,"Internal":true,"Count Failed Values":true},{"ID":82,"Name":"internal.metrics.shuffle.write.writeTime","Value":129200897,"Internal":true,"Count Failed Values":true},{"ID":67,"Name":"internal.metrics.jvmGCTime","Value":1015,"Internal":true,"Count Failed Values":true},{"ID":25,"Name":"peak memory total (min, med, max)","Value":"163577851","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":28,"Name":"number of output rows","Value":"720501","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":63,"Name":"internal.metrics.executorDeserializeCpuTime","Value":811647158,"Internal":true,"Count Failed Values":true},{"ID":27,"Name":"duration total (min, med, max)","Value":"11610","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":81,"Name":"internal.metrics.shuffle.write.recordsWritten","Value":13113,"Internal":true,"Count Failed Values":true},{"ID":84,"Name":"internal.metrics.input.recordsRead","Value":13114,"Internal":true,"Count Failed Values":true},{"ID":66,"Name":"internal.metrics.resultSize","Value":9619,"Internal":true,"Count Failed Values":true},{"ID":24,"Name":"sort time total (min, med, max)","Value":"143","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":33,"Name":"duration total (min, med, max)","Value":"10872","Internal":true,"Count Failed Values":true,"Metadata":"sql"}]}} +{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":2,"Stage Attempt ID":0,"Stage Name":"show at :40","Number of Tasks":1,"RDD 
Info":[{"RDD ID":26,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"33\",\"name\":\"map\"}","Callsite":"show at :40","Parent IDs":[25],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":25,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"32\",\"name\":\"mapPartitionsInternal\"}","Callsite":"show at :40","Parent IDs":[24],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":8,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"8\",\"name\":\"WholeStageCodegen\"}","Callsite":"cache at :41","Parent IDs":[7],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":24,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"27\",\"name\":\"WholeStageCodegen\"}","Callsite":"show at :40","Parent IDs":[23],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":22,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"31\",\"name\":\"InMemoryTableScan\"}","Callsite":"show at :40","Parent IDs":[20],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":20,"Name":"*(5) Project [appId#0, attemptId#1, name#2, mode#3, completed#4, duration#5L, endTime#6, endTimeEpoch#7L, lastUpdated#8, lastUpdatedEpoch#9L, sparkUser#10, startTime#11, startTimeEpoch#12L, appSparkVersion#13, endDate#28, azkaban.link.workflow.url#159, azkaban.link.execution.url#161, azkaban.link.job.url#163, user.name#165]\n+- SortMergeJoin [appId#0], [appId#137], LeftOuter\n :- *(1) Sort [appId#0 ASC NULLS FIRST], false, 0\n : +- Exchange hashpartitioning(appId#0, 200)\n : +- InMemoryTableScan [appId#0, attemptId#1, name#2, mode#3, completed#4, duration#5L, endTime#6, endTimeEpoch#7L, lastUpdated#8, lastUpdatedEpoch#9L, sparkUser#10, startTime#11, startTimeEpoch#12L, appSparkVersion#13, endDate#28]\n : +- InMemoryRelation [appId#0, attemptId#1, name#2, mode#3, completed#4, duration#5L, endTime#6, endTimeEpoch#7L, lastUpdated#8, lastUpdatedEpoch#9L, sparkUser#10, startTime#11, startTimeEpoch#12L, appSparkVersion#13, endDate#28], true, 10000, StorageLevel(disk, memory, deserialized, 1 rep...","Scope":"{\"id\":\"26\",\"name\":\"mapPartitionsInternal\"}","Callsite":"cache at :41","Parent IDs":[19],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":true,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":23,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"31\",\"name\":\"InMemoryTableScan\"}","Callsite":"show at :40","Parent IDs":[22],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":18,"Name":"ZippedPartitionsRDD2","Scope":"{\"id\":\"7\",\"name\":\"SortMergeJoin\"}","Callsite":"cache at :41","Parent IDs":[8,17],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD 
ID":17,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"13\",\"name\":\"SortAggregate\"}","Callsite":"cache at :41","Parent IDs":[16],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":7,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"11\",\"name\":\"Exchange\"}","Callsite":"cache at :41","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":16,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"14\",\"name\":\"WholeStageCodegen\"}","Callsite":"cache at :41","Parent IDs":[15],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":15,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"17\",\"name\":\"Exchange\"}","Callsite":"cache at :41","Parent IDs":[14],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":19,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"4\",\"name\":\"WholeStageCodegen\"}","Callsite":"cache at :41","Parent IDs":[18],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[0,1],"Details":"org.apache.spark.sql.Dataset.show(Dataset.scala:691)\n$line50.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:40)\n$line50.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:45)\n$line50.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:47)\n$line50.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:49)\n$line50.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:51)\n$line50.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:53)\n$line50.$read$$iw$$iw$$iw$$iw$$iw$$iw.(:55)\n$line50.$read$$iw$$iw$$iw$$iw$$iw.(:57)\n$line50.$read$$iw$$iw$$iw$$iw.(:59)\n$line50.$read$$iw$$iw$$iw.(:61)\n$line50.$read$$iw$$iw.(:63)\n$line50.$read$$iw.(:65)\n$line50.$read.(:67)\n$line50.$read$.(:71)\n$line50.$read$.()\n$line50.$eval$.$print$lzycompute(:7)\n$line50.$eval$.$print(:6)\n$line50.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)","Submission Time":1524182152430,"Accumulables":[]},"Properties":{"spark.sql.execution.id":"2"}} +{"Event":"SparkListenerTaskStart","Stage ID":2,"Stage Attempt ID":0,"Task Info":{"Task ID":8,"Index":0,"Attempt":0,"Launch Time":1524182152447,"Executor ID":"4","Host":"node4243.grid.company.com","Locality":"NODE_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":2,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":8,"Index":0,"Attempt":0,"Launch Time":1524182152447,"Executor ID":"4","Host":"node4243.grid.company.com","Locality":"NODE_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1524182153103,"Failed":false,"Killed":false,"Accumulables":[{"ID":34,"Name":"duration total (min, med, max)","Update":"1","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":35,"Name":"number of output rows","Update":"6928","Value":"6928","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":10,"Name":"duration 
total (min, med, max)","Update":"452","Value":"451","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":11,"Name":"number of output rows","Update":"10945","Value":"10945","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":18,"Name":"number of output rows","Update":"62","Value":"62","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":20,"Name":"peak memory total (min, med, max)","Update":"33619967","Value":"33619966","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":22,"Name":"duration total (min, med, max)","Update":"323","Value":"322","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":13,"Name":"peak memory total (min, med, max)","Update":"34078719","Value":"34078718","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":12,"Name":"sort time total (min, med, max)","Update":"10","Value":"9","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":15,"Name":"duration total (min, med, max)","Update":"367","Value":"366","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":104,"Name":"internal.metrics.shuffle.read.recordsRead","Update":11007,"Value":11007,"Internal":true,"Count Failed Values":true},{"ID":103,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":102,"Name":"internal.metrics.shuffle.read.localBytesRead","Update":124513,"Value":124513,"Internal":true,"Count Failed Values":true},{"ID":101,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":100,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Update":314162,"Value":314162,"Internal":true,"Count Failed Values":true},{"ID":99,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Update":2,"Value":2,"Internal":true,"Count Failed Values":true},{"ID":98,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Update":6,"Value":6,"Internal":true,"Count Failed Values":true},{"ID":96,"Name":"internal.metrics.peakExecutionMemory","Update":67698688,"Value":67698688,"Internal":true,"Count Failed Values":true},{"ID":91,"Name":"internal.metrics.resultSize","Update":4642,"Value":4642,"Internal":true,"Count Failed Values":true},{"ID":90,"Name":"internal.metrics.executorCpuTime","Update":517655714,"Value":517655714,"Internal":true,"Count Failed Values":true},{"ID":89,"Name":"internal.metrics.executorRunTime","Update":589,"Value":589,"Internal":true,"Count Failed Values":true},{"ID":88,"Name":"internal.metrics.executorDeserializeCpuTime","Update":45797784,"Value":45797784,"Internal":true,"Count Failed Values":true},{"ID":87,"Name":"internal.metrics.executorDeserializeTime","Update":50,"Value":50,"Internal":true,"Count Failed Values":true}]},"Task Metrics":{"Executor Deserialize Time":50,"Executor Deserialize CPU Time":45797784,"Executor Run Time":589,"Executor CPU Time":517655714,"Result Size":4642,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":6,"Local Blocks Fetched":2,"Fetch Wait Time":0,"Remote Bytes Read":314162,"Remote Bytes Read To Disk":0,"Local Bytes Read":124513,"Total Records Read":11007},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":0,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} +{"Event":"SparkListenerStageCompleted","Stage 
Info":{"Stage ID":2,"Stage Attempt ID":0,"Stage Name":"show at :40","Number of Tasks":1,"RDD Info":[{"RDD ID":26,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"33\",\"name\":\"map\"}","Callsite":"show at :40","Parent IDs":[25],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":25,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"32\",\"name\":\"mapPartitionsInternal\"}","Callsite":"show at :40","Parent IDs":[24],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":8,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"8\",\"name\":\"WholeStageCodegen\"}","Callsite":"cache at :41","Parent IDs":[7],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":24,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"27\",\"name\":\"WholeStageCodegen\"}","Callsite":"show at :40","Parent IDs":[23],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":22,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"31\",\"name\":\"InMemoryTableScan\"}","Callsite":"show at :40","Parent IDs":[20],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":20,"Name":"*(5) Project [appId#0, attemptId#1, name#2, mode#3, completed#4, duration#5L, endTime#6, endTimeEpoch#7L, lastUpdated#8, lastUpdatedEpoch#9L, sparkUser#10, startTime#11, startTimeEpoch#12L, appSparkVersion#13, endDate#28, azkaban.link.workflow.url#159, azkaban.link.execution.url#161, azkaban.link.job.url#163, user.name#165]\n+- SortMergeJoin [appId#0], [appId#137], LeftOuter\n :- *(1) Sort [appId#0 ASC NULLS FIRST], false, 0\n : +- Exchange hashpartitioning(appId#0, 200)\n : +- InMemoryTableScan [appId#0, attemptId#1, name#2, mode#3, completed#4, duration#5L, endTime#6, endTimeEpoch#7L, lastUpdated#8, lastUpdatedEpoch#9L, sparkUser#10, startTime#11, startTimeEpoch#12L, appSparkVersion#13, endDate#28]\n : +- InMemoryRelation [appId#0, attemptId#1, name#2, mode#3, completed#4, duration#5L, endTime#6, endTimeEpoch#7L, lastUpdated#8, lastUpdatedEpoch#9L, sparkUser#10, startTime#11, startTimeEpoch#12L, appSparkVersion#13, endDate#28], true, 10000, StorageLevel(disk, memory, deserialized, 1 rep...","Scope":"{\"id\":\"26\",\"name\":\"mapPartitionsInternal\"}","Callsite":"cache at :41","Parent IDs":[19],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":true,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":23,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"31\",\"name\":\"InMemoryTableScan\"}","Callsite":"show at :40","Parent IDs":[22],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":18,"Name":"ZippedPartitionsRDD2","Scope":"{\"id\":\"7\",\"name\":\"SortMergeJoin\"}","Callsite":"cache at :41","Parent IDs":[8,17],"Storage Level":{"Use Disk":false,"Use 
Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":17,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"13\",\"name\":\"SortAggregate\"}","Callsite":"cache at :41","Parent IDs":[16],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":7,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"11\",\"name\":\"Exchange\"}","Callsite":"cache at :41","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":16,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"14\",\"name\":\"WholeStageCodegen\"}","Callsite":"cache at :41","Parent IDs":[15],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":15,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"17\",\"name\":\"Exchange\"}","Callsite":"cache at :41","Parent IDs":[14],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":19,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"4\",\"name\":\"WholeStageCodegen\"}","Callsite":"cache at :41","Parent IDs":[18],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":200,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[0,1],"Details":"org.apache.spark.sql.Dataset.show(Dataset.scala:691)\n$line50.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:40)\n$line50.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:45)\n$line50.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:47)\n$line50.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:49)\n$line50.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:51)\n$line50.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:53)\n$line50.$read$$iw$$iw$$iw$$iw$$iw$$iw.(:55)\n$line50.$read$$iw$$iw$$iw$$iw$$iw.(:57)\n$line50.$read$$iw$$iw$$iw$$iw.(:59)\n$line50.$read$$iw$$iw$$iw.(:61)\n$line50.$read$$iw$$iw.(:63)\n$line50.$read$$iw.(:65)\n$line50.$read.(:67)\n$line50.$read$.(:71)\n$line50.$read$.()\n$line50.$eval$.$print$lzycompute(:7)\n$line50.$eval$.$print(:6)\n$line50.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)","Submission Time":1524182152430,"Completion Time":1524182153104,"Accumulables":[{"ID":101,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Value":0,"Internal":true,"Count Failed Values":true},{"ID":104,"Name":"internal.metrics.shuffle.read.recordsRead","Value":11007,"Internal":true,"Count Failed Values":true},{"ID":35,"Name":"number of output rows","Value":"6928","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":89,"Name":"internal.metrics.executorRunTime","Value":589,"Internal":true,"Count Failed Values":true},{"ID":98,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Value":6,"Internal":true,"Count Failed Values":true},{"ID":11,"Name":"number of output rows","Value":"10945","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":20,"Name":"peak memory total (min, med, max)","Value":"33619966","Internal":true,"Count Failed 
Values":true,"Metadata":"sql"},{"ID":91,"Name":"internal.metrics.resultSize","Value":4642,"Internal":true,"Count Failed Values":true},{"ID":100,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Value":314162,"Internal":true,"Count Failed Values":true},{"ID":13,"Name":"peak memory total (min, med, max)","Value":"34078718","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":22,"Name":"duration total (min, med, max)","Value":"322","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":103,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Value":0,"Internal":true,"Count Failed Values":true},{"ID":88,"Name":"internal.metrics.executorDeserializeCpuTime","Value":45797784,"Internal":true,"Count Failed Values":true},{"ID":34,"Name":"duration total (min, med, max)","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":10,"Name":"duration total (min, med, max)","Value":"451","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":87,"Name":"internal.metrics.executorDeserializeTime","Value":50,"Internal":true,"Count Failed Values":true},{"ID":96,"Name":"internal.metrics.peakExecutionMemory","Value":67698688,"Internal":true,"Count Failed Values":true},{"ID":90,"Name":"internal.metrics.executorCpuTime","Value":517655714,"Internal":true,"Count Failed Values":true},{"ID":99,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Value":2,"Internal":true,"Count Failed Values":true},{"ID":18,"Name":"number of output rows","Value":"62","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":12,"Name":"sort time total (min, med, max)","Value":"9","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":102,"Name":"internal.metrics.shuffle.read.localBytesRead","Value":124513,"Internal":true,"Count Failed Values":true},{"ID":15,"Name":"duration total (min, med, max)","Value":"366","Internal":true,"Count Failed Values":true,"Metadata":"sql"}]}} +{"Event":"SparkListenerJobEnd","Job ID":0,"Completion Time":1524182153112,"Job Result":{"Result":"JobSucceeded"}} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":2,"time":1524182153139} +{"Event":"SparkListenerUnpersistRDD","RDD ID":2} +{"Event":"SparkListenerUnpersistRDD","RDD ID":20} +{"Event":"SparkListenerApplicationEnd","Timestamp":1524182189134} diff --git a/core/src/test/scala/org/apache/spark/FileSuite.scala b/core/src/test/scala/org/apache/spark/FileSuite.scala index a441b9c8ab97..81b18c71f30e 100644 --- a/core/src/test/scala/org/apache/spark/FileSuite.scala +++ b/core/src/test/scala/org/apache/spark/FileSuite.scala @@ -19,10 +19,12 @@ package org.apache.spark import java.io._ import java.nio.ByteBuffer +import java.nio.charset.StandardCharsets import java.util.zip.GZIPOutputStream import scala.io.Source +import com.google.common.io.Files import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.io._ @@ -299,6 +301,25 @@ class FileSuite extends SparkFunSuite with LocalSparkContext { } } + test("SPARK-22357 test binaryFiles minPartitions") { + sc = new SparkContext(new SparkConf().setAppName("test").setMaster("local") + .set("spark.files.openCostInBytes", "0") + .set("spark.default.parallelism", "1")) + + val tempDir = Utils.createTempDir() + val tempDirPath = tempDir.getAbsolutePath + + for (i <- 0 until 8) { + val tempFile = new File(tempDir, s"part-0000$i") + Files.write("someline1 in file1\nsomeline2 in file1\nsomeline3 in file1", tempFile, + StandardCharsets.UTF_8) + } + + for (p 
<- Seq(1, 2, 8)) { + assert(sc.binaryFiles(tempDirPath, minPartitions = p).getNumPartitions === p) + } + } + test("fixed record length binary file as byte array") { sc = new SparkContext("local", "test") val testOutput = Array[Byte](1, 2, 3, 4, 5, 6) diff --git a/core/src/test/scala/org/apache/spark/HeartbeatReceiverSuite.scala b/core/src/test/scala/org/apache/spark/HeartbeatReceiverSuite.scala index b705556e54b1..de479db5fbc0 100644 --- a/core/src/test/scala/org/apache/spark/HeartbeatReceiverSuite.scala +++ b/core/src/test/scala/org/apache/spark/HeartbeatReceiverSuite.scala @@ -28,7 +28,7 @@ import org.mockito.Matchers._ import org.mockito.Mockito.{mock, spy, verify, when} import org.scalatest.{BeforeAndAfterEach, PrivateMethodTester} -import org.apache.spark.executor.TaskMetrics +import org.apache.spark.executor.{ExecutorMetrics, TaskMetrics} import org.apache.spark.rpc.{RpcCallContext, RpcEndpoint, RpcEndpointRef, RpcEnv} import org.apache.spark.scheduler._ import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages._ @@ -77,7 +77,7 @@ class HeartbeatReceiverSuite heartbeatReceiverClock = new ManualClock heartbeatReceiver = new HeartbeatReceiver(sc, heartbeatReceiverClock) heartbeatReceiverRef = sc.env.rpcEnv.setupEndpoint("heartbeat", heartbeatReceiver) - when(scheduler.executorHeartbeatReceived(any(), any(), any())).thenReturn(true) + when(scheduler.executorHeartbeatReceived(any(), any(), any(), any())).thenReturn(true) } /** @@ -213,8 +213,10 @@ class HeartbeatReceiverSuite executorShouldReregister: Boolean): Unit = { val metrics = TaskMetrics.empty val blockManagerId = BlockManagerId(executorId, "localhost", 12345) + val executorUpdates = new ExecutorMetrics(Array(123456L, 543L, 12345L, 1234L, 123L, + 12L, 432L, 321L, 654L, 765L)) val response = heartbeatReceiverRef.askSync[HeartbeatResponse]( - Heartbeat(executorId, Array(1L -> metrics.accumulators()), blockManagerId)) + Heartbeat(executorId, Array(1L -> metrics.accumulators()), blockManagerId, executorUpdates)) if (executorShouldReregister) { assert(response.reregisterBlockManager) } else { @@ -223,7 +225,8 @@ class HeartbeatReceiverSuite verify(scheduler).executorHeartbeatReceived( Matchers.eq(executorId), Matchers.eq(Array(1L -> metrics.accumulators())), - Matchers.eq(blockManagerId)) + Matchers.eq(blockManagerId), + Matchers.eq(executorUpdates)) } } diff --git a/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala b/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala index e79739692fe1..21f481d47724 100644 --- a/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala +++ b/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala @@ -62,9 +62,9 @@ class MapOutputTrackerSuite extends SparkFunSuite { val size1000 = MapStatus.decompressSize(MapStatus.compressSize(1000L)) val size10000 = MapStatus.decompressSize(MapStatus.compressSize(10000L)) tracker.registerMapOutput(10, 0, MapStatus(BlockManagerId("a", "hostA", 1000), - Array(1000L, 10000L), 10)) + Array(1000L, 10000L))) tracker.registerMapOutput(10, 1, MapStatus(BlockManagerId("b", "hostB", 1000), - Array(10000L, 1000L), 10)) + Array(10000L, 1000L))) val statuses = tracker.getMapSizesByExecutorId(10, 0) assert(statuses.toSet === Seq((BlockManagerId("a", "hostA", 1000), ArrayBuffer((ShuffleBlockId(10, 0, 0), size1000))), @@ -84,9 +84,9 @@ class MapOutputTrackerSuite extends SparkFunSuite { val compressedSize1000 = MapStatus.compressSize(1000L) val compressedSize10000 = MapStatus.compressSize(10000L) 
tracker.registerMapOutput(10, 0, MapStatus(BlockManagerId("a", "hostA", 1000), - Array(compressedSize1000, compressedSize10000), 10)) + Array(compressedSize1000, compressedSize10000))) tracker.registerMapOutput(10, 1, MapStatus(BlockManagerId("b", "hostB", 1000), - Array(compressedSize10000, compressedSize1000), 10)) + Array(compressedSize10000, compressedSize1000))) assert(tracker.containsShuffle(10)) assert(tracker.getMapSizesByExecutorId(10, 0).nonEmpty) assert(0 == tracker.getNumCachedSerializedBroadcast) @@ -107,9 +107,9 @@ class MapOutputTrackerSuite extends SparkFunSuite { val compressedSize1000 = MapStatus.compressSize(1000L) val compressedSize10000 = MapStatus.compressSize(10000L) tracker.registerMapOutput(10, 0, MapStatus(BlockManagerId("a", "hostA", 1000), - Array(compressedSize1000, compressedSize1000, compressedSize1000), 10)) + Array(compressedSize1000, compressedSize1000, compressedSize1000))) tracker.registerMapOutput(10, 1, MapStatus(BlockManagerId("b", "hostB", 1000), - Array(compressedSize10000, compressedSize1000, compressedSize1000), 10)) + Array(compressedSize10000, compressedSize1000, compressedSize1000))) assert(0 == tracker.getNumCachedSerializedBroadcast) // As if we had two simultaneous fetch failures @@ -145,7 +145,7 @@ class MapOutputTrackerSuite extends SparkFunSuite { val size1000 = MapStatus.decompressSize(MapStatus.compressSize(1000L)) masterTracker.registerMapOutput(10, 0, MapStatus( - BlockManagerId("a", "hostA", 1000), Array(1000L), 10)) + BlockManagerId("a", "hostA", 1000), Array(1000L))) slaveTracker.updateEpoch(masterTracker.getEpoch) assert(slaveTracker.getMapSizesByExecutorId(10, 0).toSeq === Seq((BlockManagerId("a", "hostA", 1000), ArrayBuffer((ShuffleBlockId(10, 0, 0), size1000))))) @@ -182,7 +182,7 @@ class MapOutputTrackerSuite extends SparkFunSuite { // Message size should be ~123B, and no exception should be thrown masterTracker.registerShuffle(10, 1) masterTracker.registerMapOutput(10, 0, MapStatus( - BlockManagerId("88", "mph", 1000), Array.fill[Long](10)(0), 0)) + BlockManagerId("88", "mph", 1000), Array.fill[Long](10)(0))) val senderAddress = RpcAddress("localhost", 12345) val rpcCallContext = mock(classOf[RpcCallContext]) when(rpcCallContext.senderAddress).thenReturn(senderAddress) @@ -216,11 +216,11 @@ class MapOutputTrackerSuite extends SparkFunSuite { // on hostB with output size 3 tracker.registerShuffle(10, 3) tracker.registerMapOutput(10, 0, MapStatus(BlockManagerId("a", "hostA", 1000), - Array(2L), 1)) + Array(2L))) tracker.registerMapOutput(10, 1, MapStatus(BlockManagerId("a", "hostA", 1000), - Array(2L), 1)) + Array(2L))) tracker.registerMapOutput(10, 2, MapStatus(BlockManagerId("b", "hostB", 1000), - Array(3L), 1)) + Array(3L))) // When the threshold is 50%, only host A should be returned as a preferred location // as it has 4 out of 7 bytes of output. 
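The comment above encodes a simple fraction check: hostA serves 2 + 2 = 4 of the 7 total bytes (about 57%), which clears the 50% threshold, while hostB's 3 bytes (about 43%) does not. A minimal standalone sketch of that check, purely illustrative and not Spark's actual MapOutputTrackerMaster logic (object and method names here are hypothetical):

object PreferredLocationSketch {
  // Return the hosts whose share of the total map output for a partition
  // meets or exceeds the given fraction threshold.
  def preferredHosts(bytesByHost: Map[String, Long], threshold: Double): Seq[String] = {
    val total = bytesByHost.values.sum.toDouble
    bytesByHost.collect { case (host, bytes) if bytes / total >= threshold => host }.toSeq
  }

  def main(args: Array[String]): Unit = {
    // hostA holds two map outputs of 2 bytes each, hostB one output of 3 bytes.
    println(preferredHosts(Map("hostA" -> 4L, "hostB" -> 3L), 0.5)) // prints List(hostA)
  }
}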
@@ -260,7 +260,7 @@ class MapOutputTrackerSuite extends SparkFunSuite { masterTracker.registerShuffle(20, 100) (0 until 100).foreach { i => masterTracker.registerMapOutput(20, i, new CompressedMapStatus( - BlockManagerId("999", "mps", 1000), Array.fill[Long](4000000)(0), 0)) + BlockManagerId("999", "mps", 1000), Array.fill[Long](4000000)(0))) } val senderAddress = RpcAddress("localhost", 12345) val rpcCallContext = mock(classOf[RpcCallContext]) @@ -309,9 +309,9 @@ class MapOutputTrackerSuite extends SparkFunSuite { val size1000 = MapStatus.decompressSize(MapStatus.compressSize(1000L)) val size10000 = MapStatus.decompressSize(MapStatus.compressSize(10000L)) tracker.registerMapOutput(10, 0, MapStatus(BlockManagerId("a", "hostA", 1000), - Array(size0, size1000, size0, size10000), 1)) + Array(size0, size1000, size0, size10000))) tracker.registerMapOutput(10, 1, MapStatus(BlockManagerId("b", "hostB", 1000), - Array(size10000, size0, size1000, size0), 1)) + Array(size10000, size0, size1000, size0))) assert(tracker.containsShuffle(10)) assert(tracker.getMapSizesByExecutorId(10, 0, 4).toSeq === Seq( diff --git a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala index 456f97b535ef..b917469e4874 100644 --- a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala +++ b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala @@ -391,7 +391,6 @@ abstract class ShuffleSuite extends SparkFunSuite with Matchers with LocalSparkC assert(mapOutput2.isDefined) assert(mapOutput1.get.location === mapOutput2.get.location) assert(mapOutput1.get.getSizeForBlock(0) === mapOutput1.get.getSizeForBlock(0)) - assert(mapOutput1.get.numberOfOutput === mapOutput2.get.numberOfOutput) // register one of the map outputs -- doesn't matter which one mapOutput1.foreach { case mapStatus => diff --git a/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala b/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala index 05b4e67412f2..6f9b583898c3 100644 --- a/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/api/python/PythonRDDSuite.scala @@ -18,9 +18,13 @@ package org.apache.spark.api.python import java.io.{ByteArrayOutputStream, DataOutputStream} +import java.net.{InetAddress, Socket} import java.nio.charset.StandardCharsets -import org.apache.spark.SparkFunSuite +import scala.concurrent.duration.Duration + +import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.security.SocketAuthHelper class PythonRDDSuite extends SparkFunSuite { @@ -44,4 +48,21 @@ class PythonRDDSuite extends SparkFunSuite { ("a".getBytes(StandardCharsets.UTF_8), null), (null, "b".getBytes(StandardCharsets.UTF_8))), buffer) } + + test("python server error handling") { + val authHelper = new SocketAuthHelper(new SparkConf()) + val errorServer = new ExceptionPythonServer(authHelper) + val client = new Socket(InetAddress.getLoopbackAddress(), errorServer.port) + authHelper.authToServer(client) + val ex = intercept[Exception] { errorServer.getResult(Duration(1, "second")) } + assert(ex.getCause().getMessage().contains("exception within handleConnection")) + } + + class ExceptionPythonServer(authHelper: SocketAuthHelper) + extends PythonServer[Unit](authHelper, "error-server") { + + override def handleConnection(sock: Socket): Unit = { + throw new Exception("exception within handleConnection") + } + } } diff --git a/core/src/main/scala/org/apache/spark/util/Benchmark.scala 
b/core/src/test/scala/org/apache/spark/benchmark/Benchmark.scala similarity index 99% rename from core/src/main/scala/org/apache/spark/util/Benchmark.scala rename to core/src/test/scala/org/apache/spark/benchmark/Benchmark.scala index 7def44bd2a2b..7a36b5f02dc4 100644 --- a/core/src/main/scala/org/apache/spark/util/Benchmark.scala +++ b/core/src/test/scala/org/apache/spark/benchmark/Benchmark.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.util +package org.apache.spark.benchmark import java.io.{OutputStream, PrintStream} @@ -27,6 +27,8 @@ import scala.util.Try import org.apache.commons.io.output.TeeOutputStream import org.apache.commons.lang3.SystemUtils +import org.apache.spark.util.Utils + /** * Utility class to benchmark components. An example of how to use this is: * val benchmark = new Benchmark("My Benchmark", valuesPerIteration) diff --git a/core/src/test/scala/org/apache/spark/benchmark/BenchmarkBase.scala b/core/src/test/scala/org/apache/spark/benchmark/BenchmarkBase.scala new file mode 100644 index 000000000000..89e927e5784d --- /dev/null +++ b/core/src/test/scala/org/apache/spark/benchmark/BenchmarkBase.scala @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.benchmark + +import java.io.{File, FileOutputStream, OutputStream} + +/** + * A base class for generating benchmark results to a file. + */ +abstract class BenchmarkBase { + var output: Option[OutputStream] = None + + /** + * Main process of the whole benchmark. + * Implementations of this method are supposed to use the wrapper method `runBenchmark` + * for each benchmark scenario.
+ */ + def runBenchmarkSuite(): Unit + + final def runBenchmark(benchmarkName: String)(func: => Any): Unit = { + val separator = "=" * 96 + val testHeader = (separator + '\n' + benchmarkName + '\n' + separator + '\n' + '\n').getBytes + output.foreach(_.write(testHeader)) + func + output.foreach(_.write('\n')) + } + + def main(args: Array[String]): Unit = { + val regenerateBenchmarkFiles: Boolean = System.getenv("SPARK_GENERATE_BENCHMARK_FILES") == "1" + if (regenerateBenchmarkFiles) { + val resultFileName = s"${this.getClass.getSimpleName.replace("$", "")}-results.txt" + val file = new File(s"benchmarks/$resultFileName") + if (!file.exists()) { + file.createNewFile() + } + output = Some(new FileOutputStream(file)) + } + + runBenchmarkSuite() + + output.foreach { o => + if (o != null) { + o.close() + } + } + } +} diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index f829fecc3084..9eae3605d073 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -1144,6 +1144,53 @@ class SparkSubmitSuite conf1.get(PY_FILES.key) should be (s"s3a://${pyFile.getAbsolutePath}") conf1.get("spark.submit.pyFiles") should (startWith("/")) } + + test("handles natural line delimiters in --properties-file and --conf uniformly") { + val delimKey = "spark.my.delimiter." + val LF = "\n" + val CR = "\r" + + val lineFeedFromCommandLine = s"${delimKey}lineFeedFromCommandLine" -> LF + val leadingDelimKeyFromFile = s"${delimKey}leadingDelimKeyFromFile" -> s"${LF}blah" + val trailingDelimKeyFromFile = s"${delimKey}trailingDelimKeyFromFile" -> s"blah${CR}" + val infixDelimFromFile = s"${delimKey}infixDelimFromFile" -> s"${CR}blah${LF}" + val nonDelimSpaceFromFile = s"${delimKey}nonDelimSpaceFromFile" -> " blah\f" + + val testProps = Seq(leadingDelimKeyFromFile, trailingDelimKeyFromFile, infixDelimFromFile, + nonDelimSpaceFromFile) + + val props = new java.util.Properties() + val propsFile = File.createTempFile("test-spark-conf", ".properties", + Utils.createTempDir()) + val propsOutputStream = new FileOutputStream(propsFile) + try { + testProps.foreach { case (k, v) => props.put(k, v) } + props.store(propsOutputStream, "test whitespace") + } finally { + propsOutputStream.close() + } + + val clArgs = Seq( + "--class", "org.SomeClass", + "--conf", s"${lineFeedFromCommandLine._1}=${lineFeedFromCommandLine._2}", + "--conf", "spark.master=yarn", + "--properties-file", propsFile.getPath, + "thejar.jar") + + val appArgs = new SparkSubmitArguments(clArgs) + val (_, _, conf, _) = submit.prepareSubmitEnvironment(appArgs) + + Seq( + lineFeedFromCommandLine, + leadingDelimKeyFromFile, + trailingDelimKeyFromFile, + infixDelimFromFile + ).foreach { case (k, v) => + conf.get(k) should be (v) + } + + conf.get(nonDelimSpaceFromFile._1) should be ("blah") + } } object SparkSubmitSuite extends SparkFunSuite with TimeLimits { diff --git a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala index b4eba755eccb..444e8d6e11f8 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.deploy.history import java.io._ import java.nio.charset.StandardCharsets -import java.util.Date 
+import java.util.{Date, Locale} import java.util.concurrent.TimeUnit import java.util.zip.{ZipInputStream, ZipOutputStream} @@ -834,7 +834,7 @@ class FsHistoryProviderSuite extends SparkFunSuite with BeforeAndAfter with Matc doThrow(new AccessControlException("Cannot read accessDenied file")).when(mockedFs).open( argThat(new ArgumentMatcher[Path]() { override def matches(path: Any): Boolean = { - path.asInstanceOf[Path].getName.toLowerCase == "accessdenied" + path.asInstanceOf[Path].getName.toLowerCase(Locale.ROOT) == "accessdenied" } })) val mockedProvider = spy(provider) diff --git a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala index 11b29121739a..11a2db81f7c6 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala @@ -82,6 +82,7 @@ class HistoryServerSuite extends SparkFunSuite with BeforeAndAfter with Matchers .set("spark.history.fs.update.interval", "0") .set("spark.testing", "true") .set(LOCAL_STORE_DIR, storeDir.getAbsolutePath()) + .set("spark.eventLog.logStageExecutorMetrics.enabled", "true") conf.setAll(extraConf) provider = new FsHistoryProvider(conf) provider.checkForLogs() @@ -128,6 +129,8 @@ class HistoryServerSuite extends SparkFunSuite with BeforeAndAfter with Matchers "succeeded&failed job list json" -> "applications/local-1422981780767/jobs?status=succeeded&status=failed", "executor list json" -> "applications/local-1422981780767/executors", + "executor list with executor metrics json" -> + "applications/application_1506645932520_24630151/executors", "stage list json" -> "applications/local-1422981780767/stages", "complete stage list json" -> "applications/local-1422981780767/stages?status=complete", "failed stage list json" -> "applications/local-1422981780767/stages?status=failed", diff --git a/core/src/test/scala/org/apache/spark/deploy/master/ui/MasterWebUISuite.scala b/core/src/test/scala/org/apache/spark/deploy/master/ui/MasterWebUISuite.scala index 69a460fbc7db..f4558aa3eb89 100644 --- a/core/src/test/scala/org/apache/spark/deploy/master/ui/MasterWebUISuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/master/ui/MasterWebUISuite.scala @@ -53,8 +53,11 @@ class MasterWebUISuite extends SparkFunSuite with BeforeAndAfterAll { } override def afterAll() { - masterWebUI.stop() - super.afterAll() + try { + masterWebUI.stop() + } finally { + super.afterAll() + } } test("kill application") { diff --git a/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala b/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala index 77a7668d3a1d..1f8a65707b2f 100644 --- a/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala +++ b/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala @@ -21,9 +21,10 @@ import java.io.{Externalizable, ObjectInput, ObjectOutput} import java.lang.Thread.UncaughtExceptionHandler import java.nio.ByteBuffer import java.util.Properties -import java.util.concurrent.{CountDownLatch, TimeUnit} +import java.util.concurrent.{ConcurrentHashMap, CountDownLatch, TimeUnit} import java.util.concurrent.atomic.AtomicBoolean +import scala.collection.mutable.ArrayBuffer import scala.collection.mutable.Map import scala.concurrent.duration._ import scala.language.postfixOps @@ -33,22 +34,25 @@ import org.mockito.Matchers.{any, eq => meq} import org.mockito.Mockito.{inOrder, verify, when} import 
org.mockito.invocation.InvocationOnMock import org.mockito.stubbing.Answer +import org.scalatest.PrivateMethodTester import org.scalatest.concurrent.Eventually import org.scalatest.mockito.MockitoSugar import org.apache.spark._ import org.apache.spark.TaskState.TaskState -import org.apache.spark.memory.MemoryManager +import org.apache.spark.internal.config._ +import org.apache.spark.memory.TestMemoryManager import org.apache.spark.metrics.MetricsSystem import org.apache.spark.rdd.RDD -import org.apache.spark.rpc.RpcEnv -import org.apache.spark.scheduler.{FakeTask, ResultTask, TaskDescription} +import org.apache.spark.rpc.{RpcEndpointRef, RpcEnv, RpcTimeout} +import org.apache.spark.scheduler.{FakeTask, ResultTask, Task, TaskDescription} import org.apache.spark.serializer.{JavaSerializer, SerializerManager} import org.apache.spark.shuffle.FetchFailedException -import org.apache.spark.storage.BlockManagerId -import org.apache.spark.util.UninterruptibleThread +import org.apache.spark.storage.{BlockManager, BlockManagerId} +import org.apache.spark.util.{LongAccumulator, UninterruptibleThread} -class ExecutorSuite extends SparkFunSuite with LocalSparkContext with MockitoSugar with Eventually { +class ExecutorSuite extends SparkFunSuite + with LocalSparkContext with MockitoSugar with Eventually with PrivateMethodTester { test("SPARK-15963: Catch `TaskKilledException` correctly in Executor.TaskRunner") { // mock some objects to make Executor.launchTask() happy @@ -252,18 +256,107 @@ class ExecutorSuite extends SparkFunSuite with LocalSparkContext with MockitoSug } } + test("Heartbeat should drop zero accumulator updates") { + heartbeatZeroAccumulatorUpdateTest(true) + } + + test("Heartbeat should not drop zero accumulator updates when the conf is disabled") { + heartbeatZeroAccumulatorUpdateTest(false) + } + + private def withHeartbeatExecutor(confs: (String, String)*) + (f: (Executor, ArrayBuffer[Heartbeat]) => Unit): Unit = { + val conf = new SparkConf + confs.foreach { case (k, v) => conf.set(k, v) } + val serializer = new JavaSerializer(conf) + val env = createMockEnv(conf, serializer) + val executor = + new Executor("id", "localhost", SparkEnv.get, userClassPath = Nil, isLocal = true) + val executorClass = classOf[Executor] + + // Save all heartbeats sent into an ArrayBuffer for verification + val heartbeats = ArrayBuffer[Heartbeat]() + val mockReceiver = mock[RpcEndpointRef] + when(mockReceiver.askSync(any[Heartbeat], any[RpcTimeout])(any)) + .thenAnswer(new Answer[HeartbeatResponse] { + override def answer(invocation: InvocationOnMock): HeartbeatResponse = { + val args = invocation.getArguments() + val mock = invocation.getMock + heartbeats += args(0).asInstanceOf[Heartbeat] + HeartbeatResponse(false) + } + }) + val receiverRef = executorClass.getDeclaredField("heartbeatReceiverRef") + receiverRef.setAccessible(true) + receiverRef.set(executor, mockReceiver) + + f(executor, heartbeats) + } + + private def heartbeatZeroAccumulatorUpdateTest(dropZeroMetrics: Boolean): Unit = { + val c = EXECUTOR_HEARTBEAT_DROP_ZERO_ACCUMULATOR_UPDATES.key -> dropZeroMetrics.toString + withHeartbeatExecutor(c) { (executor, heartbeats) => + val reportHeartbeat = PrivateMethod[Unit]('reportHeartBeat) + + // When no tasks are running, there should be no accumulators sent in heartbeat + executor.invokePrivate(reportHeartbeat()) + // invokeReportHeartbeat(executor) + assert(heartbeats.length == 1) + assert(heartbeats(0).accumUpdates.length == 0, + "No updates should be sent when no tasks are running") + + // When 
we start a task with a nonzero accumulator, that should end up in the heartbeat + val metrics = new TaskMetrics() + val nonZeroAccumulator = new LongAccumulator() + nonZeroAccumulator.add(1) + metrics.registerAccumulator(nonZeroAccumulator) + + val executorClass = classOf[Executor] + val tasksMap = { + val field = + executorClass.getDeclaredField("org$apache$spark$executor$Executor$$runningTasks") + field.setAccessible(true) + field.get(executor).asInstanceOf[ConcurrentHashMap[Long, executor.TaskRunner]] + } + val mockTaskRunner = mock[executor.TaskRunner] + val mockTask = mock[Task[Any]] + when(mockTask.metrics).thenReturn(metrics) + when(mockTaskRunner.taskId).thenReturn(6) + when(mockTaskRunner.task).thenReturn(mockTask) + when(mockTaskRunner.startGCTime).thenReturn(1) + tasksMap.put(6, mockTaskRunner) + + executor.invokePrivate(reportHeartbeat()) + assert(heartbeats.length == 2) + val updates = heartbeats(1).accumUpdates + assert(updates.length == 1 && updates(0)._1 == 6, + "Heartbeat should only send update for the one task running") + val accumsSent = updates(0)._2.length + assert(accumsSent > 0, "The nonzero accumulator we added should be sent") + if (dropZeroMetrics) { + assert(accumsSent == metrics.accumulators().count(!_.isZero), + "The number of accumulators sent should match the number of nonzero accumulators") + } else { + assert(accumsSent == metrics.accumulators().length, + "The number of accumulators sent should match the number of total accumulators") + } + } + } + private def createMockEnv(conf: SparkConf, serializer: JavaSerializer): SparkEnv = { val mockEnv = mock[SparkEnv] val mockRpcEnv = mock[RpcEnv] val mockMetricsSystem = mock[MetricsSystem] - val mockMemoryManager = mock[MemoryManager] + val mockBlockManager = mock[BlockManager] when(mockEnv.conf).thenReturn(conf) when(mockEnv.serializer).thenReturn(serializer) when(mockEnv.serializerManager).thenReturn(mock[SerializerManager]) when(mockEnv.rpcEnv).thenReturn(mockRpcEnv) when(mockEnv.metricsSystem).thenReturn(mockMetricsSystem) - when(mockEnv.memoryManager).thenReturn(mockMemoryManager) + when(mockEnv.memoryManager).thenReturn(new TestMemoryManager(conf)) when(mockEnv.closureSerializer).thenReturn(serializer) + when(mockBlockManager.blockManagerId).thenReturn(BlockManagerId("1", "hostA", 1234)) + when(mockEnv.blockManager).thenReturn(mockBlockManager) SparkEnv.set(mockEnv) mockEnv } diff --git a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala index b143a468a1ba..2227698cf1ad 100644 --- a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala @@ -95,6 +95,18 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext { assert(!deserial.toString().isEmpty()) } + test("distinct with known partitioner preserves partitioning") { + val rdd = sc.parallelize(1.to(100), 10).map(x => (x % 10, x % 10)).sortByKey() + val initialPartitioner = rdd.partitioner + val distinctRdd = rdd.distinct() + val resultingPartitioner = distinctRdd.partitioner + assert(initialPartitioner === resultingPartitioner) + val distinctRddDifferent = rdd.distinct(5) + val distinctRddDifferentPartitioner = distinctRddDifferent.partitioner + assert(initialPartitioner != distinctRddDifferentPartitioner) + assert(distinctRdd.collect().sorted === distinctRddDifferent.collect().sorted) + } + test("countApproxDistinct") { def error(est: Long, size: Long): Double = math.abs(est - size) / size.toDouble diff --git 
a/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala index d3bbfd11d406..fe22d70850c7 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala @@ -24,7 +24,6 @@ import org.apache.spark.internal.config class BlacklistIntegrationSuite extends SchedulerIntegrationSuite[MultiExecutorMockBackend]{ val badHost = "host-0" - val duration = Duration(10, SECONDS) /** * This backend just always fails if the task is executed on a bad host, but otherwise succeeds diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala index 4e87deb136df..b41d2acab715 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala @@ -30,6 +30,7 @@ import org.scalatest.time.SpanSugar._ import org.apache.spark._ import org.apache.spark.broadcast.BroadcastManager +import org.apache.spark.executor.ExecutorMetrics import org.apache.spark.internal.config import org.apache.spark.rdd.{DeterministicLevel, RDD} import org.apache.spark.scheduler.SchedulingMode.SchedulingMode @@ -140,7 +141,8 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi override def executorHeartbeatReceived( execId: String, accumUpdates: Array[(Long, Seq[AccumulatorV2[_, _]])], - blockManagerId: BlockManagerId): Boolean = true + blockManagerId: BlockManagerId, + executorUpdates: ExecutorMetrics): Boolean = true override def submitTasks(taskSet: TaskSet) = { // normally done by TaskSetManager taskSet.tasks.foreach(_.epoch = mapOutputTracker.getEpoch) @@ -443,17 +445,17 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi // map stage1 completes successfully, with one task on each executor complete(taskSets(0), Seq( (Success, - MapStatus(BlockManagerId("exec-hostA1", "hostA", 12345), Array.fill[Long](1)(2), 1)), + MapStatus(BlockManagerId("exec-hostA1", "hostA", 12345), Array.fill[Long](1)(2))), (Success, - MapStatus(BlockManagerId("exec-hostA2", "hostA", 12345), Array.fill[Long](1)(2), 1)), + MapStatus(BlockManagerId("exec-hostA2", "hostA", 12345), Array.fill[Long](1)(2))), (Success, makeMapStatus("hostB", 1)) )) // map stage2 completes successfully, with one task on each executor complete(taskSets(1), Seq( (Success, - MapStatus(BlockManagerId("exec-hostA1", "hostA", 12345), Array.fill[Long](1)(2), 1)), + MapStatus(BlockManagerId("exec-hostA1", "hostA", 12345), Array.fill[Long](1)(2))), (Success, - MapStatus(BlockManagerId("exec-hostA2", "hostA", 12345), Array.fill[Long](1)(2), 1)), + MapStatus(BlockManagerId("exec-hostA2", "hostA", 12345), Array.fill[Long](1)(2))), (Success, makeMapStatus("hostB", 1)) )) // make sure our test setup is correct @@ -660,7 +662,8 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi override def executorHeartbeatReceived( execId: String, accumUpdates: Array[(Long, Seq[AccumulatorV2[_, _]])], - blockManagerId: BlockManagerId): Boolean = true + blockManagerId: BlockManagerId, + executorMetrics: ExecutorMetrics): Boolean = true override def executorLost(executorId: String, reason: ExecutorLossReason): Unit = {} override def workerRemoved(workerId: String, host: String, message: String): Unit = {} override def 
applicationAttemptId(): Option[String] = None @@ -1877,6 +1880,26 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi assert(sc.parallelize(1 to 10, 2).count() === 10) } + test("misbehaved accumulator should not impact other accumulators") { + val bad = new LongAccumulator { + override def merge(other: AccumulatorV2[java.lang.Long, java.lang.Long]): Unit = { + throw new DAGSchedulerSuiteDummyException + } + } + sc.register(bad, "bad") + val good = sc.longAccumulator("good") + + sc.parallelize(1 to 10, 2).foreach { item => + bad.add(1) + good.add(1) + } + + // This is to ensure the `bad` accumulator did fail to update its value + assert(bad.value == 0L) + // Should be able to update the "good" accumulator + assert(good.value == 10L) + } + /** * The job will be failed on first task throwing a DAGSchedulerSuiteDummyException. * Any subsequent task WILL throw a legitimate java.lang.UnsupportedOperationException. @@ -2854,7 +2877,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi object DAGSchedulerSuite { def makeMapStatus(host: String, reduces: Int, sizes: Byte = 2): MapStatus = - MapStatus(makeBlockManagerId(host), Array.fill[Long](reduces)(sizes), 1) + MapStatus(makeBlockManagerId(host), Array.fill[Long](reduces)(sizes)) def makeBlockManagerId(host: String): BlockManagerId = BlockManagerId("exec-" + host, host, 12345) diff --git a/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala index a9e92fa07b9d..cecd6996df7b 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala @@ -19,7 +19,9 @@ package org.apache.spark.scheduler import java.io.{File, FileOutputStream, InputStream, IOException} +import scala.collection.immutable.Map import scala.collection.mutable +import scala.collection.mutable.Set import scala.io.Source import org.apache.hadoop.fs.Path @@ -29,11 +31,14 @@ import org.scalatest.BeforeAndAfter import org.apache.spark._ import org.apache.spark.deploy.SparkHadoopUtil +import org.apache.spark.executor.{ExecutorMetrics, TaskMetrics} import org.apache.spark.internal.Logging import org.apache.spark.io._ -import org.apache.spark.metrics.MetricsSystem +import org.apache.spark.metrics.{ExecutorMetricType, MetricsSystem} +import org.apache.spark.scheduler.cluster.ExecutorInfo import org.apache.spark.util.{JsonProtocol, Utils} + /** * Test whether EventLoggingListener logs events properly. * @@ -43,6 +48,7 @@ import org.apache.spark.util.{JsonProtocol, Utils} */ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext with BeforeAndAfter with Logging { + import EventLoggingListenerSuite._ private val fileSystem = Utils.getHadoopFileSystem("/", @@ -137,6 +143,10 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit "a fine:mind$dollar{bills}.1", None, Some("lz4"))) } + test("Executor metrics update") { + testStageExecutorMetricsEventLogging() + } + /* ----------------- * * Actual test logic * * ----------------- */ @@ -251,6 +261,214 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit } } + /** + * Test stage executor metrics logging functionality. 
This checks that peak + * values from SparkListenerExecutorMetricsUpdate events during a stage are + * logged in a StageExecutorMetrics event for each executor at stage completion. + */ + private def testStageExecutorMetricsEventLogging() { + val conf = getLoggingConf(testDirPath, None) + val logName = "stageExecutorMetrics-test" + val eventLogger = new EventLoggingListener(logName, None, testDirPath.toUri(), conf) + val listenerBus = new LiveListenerBus(conf) + + // Events to post. + val events = Array( + SparkListenerApplicationStart("executionMetrics", None, + 1L, "update", None), + createExecutorAddedEvent(1), + createExecutorAddedEvent(2), + createStageSubmittedEvent(0), + // receive 3 metric updates from each executor with just stage 0 running, + // with different peak updates for each executor + createExecutorMetricsUpdateEvent(1, + new ExecutorMetrics(Array(4000L, 50L, 20L, 0L, 40L, 0L, 60L, 0L, 70L, 20L))), + createExecutorMetricsUpdateEvent(2, + new ExecutorMetrics(Array(1500L, 50L, 20L, 0L, 0L, 0L, 20L, 0L, 70L, 0L))), + // exec 1: new stage 0 peaks for metrics at indexes: 2, 4, 6 + createExecutorMetricsUpdateEvent(1, + new ExecutorMetrics(Array(4000L, 50L, 50L, 0L, 50L, 0L, 100L, 0L, 70L, 20L))), + // exec 2: new stage 0 peaks for metrics at indexes: 0, 4, 6 + createExecutorMetricsUpdateEvent(2, + new ExecutorMetrics(Array(2000L, 50L, 10L, 0L, 10L, 0L, 30L, 0L, 70L, 0L))), + // exec 1: new stage 0 peaks for metrics at indexes: 5, 7 + createExecutorMetricsUpdateEvent(1, + new ExecutorMetrics(Array(2000L, 40L, 50L, 0L, 40L, 10L, 90L, 10L, 50L, 0L))), + // exec 2: new stage 0 peaks for metrics at indexes: 0, 5, 6, 7, 8 + createExecutorMetricsUpdateEvent(2, + new ExecutorMetrics(Array(3500L, 50L, 15L, 0L, 10L, 10L, 35L, 10L, 80L, 0L))), + // now start stage 1, one more metric update for each executor, and new + // peaks for some stage 1 metrics (as listed), initialize stage 1 peaks + createStageSubmittedEvent(1), + // exec 1: new stage 0 peaks for metrics at indexes: 0, 3, 7; initialize stage 1 peaks + createExecutorMetricsUpdateEvent(1, + new ExecutorMetrics(Array(5000L, 30L, 50L, 20L, 30L, 10L, 80L, 30L, 50L, 0L))), + // exec 2: new stage 0 peaks for metrics at indexes: 0, 1, 2, 3, 6, 7, 9; + // initialize stage 1 peaks + createExecutorMetricsUpdateEvent(2, + new ExecutorMetrics(Array(7000L, 70L, 50L, 20L, 0L, 10L, 50L, 30L, 10L, 40L))), + // complete stage 0, and 3 more updates for each executor with just + // stage 1 running + createStageCompletedEvent(0), + // exec 1: new stage 1 peaks for metrics at indexes: 0, 1, 3 + createExecutorMetricsUpdateEvent(1, + new ExecutorMetrics(Array(6000L, 70L, 20L, 30L, 10L, 0L, 30L, 30L, 30L, 0L))), + // enew ExecutorMetrics(xec 2: new stage 1 peaks for metrics at indexes: 3, 4, 7, 8 + createExecutorMetricsUpdateEvent(2, + new ExecutorMetrics(Array(5500L, 30L, 20L, 40L, 10L, 0L, 30L, 40L, 40L, 20L))), + // exec 1: new stage 1 peaks for metrics at indexes: 0, 4, 5, 7 + createExecutorMetricsUpdateEvent(1, + new ExecutorMetrics(Array(7000L, 70L, 5L, 25L, 60L, 30L, 65L, 55L, 30L, 0L))), + // exec 2: new stage 1 peak for metrics at index: 7 + createExecutorMetricsUpdateEvent(2, + new ExecutorMetrics(Array(5500L, 40L, 25L, 30L, 10L, 30L, 35L, 60L, 0L, 20L))), + // exec 1: no new stage 1 peaks + createExecutorMetricsUpdateEvent(1, + new ExecutorMetrics(Array(5500L, 70L, 15L, 20L, 55L, 20L, 70L, 40L, 20L, 0L))), + createExecutorRemovedEvent(1), + // exec 2: new stage 1 peak for metrics at index: 6 + createExecutorMetricsUpdateEvent(2, + new 
ExecutorMetrics(Array(4000L, 20L, 25L, 30L, 10L, 30L, 35L, 60L, 0L, 0L))), + createStageCompletedEvent(1), + SparkListenerApplicationEnd(1000L)) + + // play the events for the event logger + eventLogger.start() + listenerBus.start(Mockito.mock(classOf[SparkContext]), Mockito.mock(classOf[MetricsSystem])) + listenerBus.addToEventLogQueue(eventLogger) + events.foreach(event => listenerBus.post(event)) + listenerBus.stop() + eventLogger.stop() + + // expected StageExecutorMetrics, for the given stage id and executor id + val expectedMetricsEvents: Map[(Int, String), SparkListenerStageExecutorMetrics] = + Map( + ((0, "1"), + new SparkListenerStageExecutorMetrics("1", 0, 0, + new ExecutorMetrics(Array(5000L, 50L, 50L, 20L, 50L, 10L, 100L, 30L, 70L, 20L)))), + ((0, "2"), + new SparkListenerStageExecutorMetrics("2", 0, 0, + new ExecutorMetrics(Array(7000L, 70L, 50L, 20L, 10L, 10L, 50L, 30L, 80L, 40L)))), + ((1, "1"), + new SparkListenerStageExecutorMetrics("1", 1, 0, + new ExecutorMetrics(Array(7000L, 70L, 50L, 30L, 60L, 30L, 80L, 55L, 50L, 0L)))), + ((1, "2"), + new SparkListenerStageExecutorMetrics("2", 1, 0, + new ExecutorMetrics(Array(7000L, 70L, 50L, 40L, 10L, 30L, 50L, 60L, 40L, 40L))))) + + // Verify the log file contains the expected events. + // Posted events should be logged, except for ExecutorMetricsUpdate events -- these + // are consolidated, and the peak values for each stage are logged at stage end. + val logData = EventLoggingListener.openEventLog(new Path(eventLogger.logPath), fileSystem) + try { + val lines = readLines(logData) + val logStart = SparkListenerLogStart(SPARK_VERSION) + assert(lines.size === 14) + assert(lines(0).contains("SparkListenerLogStart")) + assert(lines(1).contains("SparkListenerApplicationStart")) + assert(JsonProtocol.sparkEventFromJson(parse(lines(0))) === logStart) + var logIdx = 1 + events.foreach {event => + event match { + case metricsUpdate: SparkListenerExecutorMetricsUpdate => + case stageCompleted: SparkListenerStageCompleted => + val execIds = Set[String]() + (1 to 2).foreach { _ => + val execId = checkStageExecutorMetrics(lines(logIdx), + stageCompleted.stageInfo.stageId, expectedMetricsEvents) + execIds += execId + logIdx += 1 + } + assert(execIds.size == 2) // check that each executor was logged + checkEvent(lines(logIdx), event) + logIdx += 1 + case _ => + checkEvent(lines(logIdx), event) + logIdx += 1 + } + } + } finally { + logData.close() + } + } + + private def createStageSubmittedEvent(stageId: Int) = { + SparkListenerStageSubmitted(new StageInfo(stageId, 0, stageId.toString, 0, + Seq.empty, Seq.empty, "details")) + } + + private def createStageCompletedEvent(stageId: Int) = { + SparkListenerStageCompleted(new StageInfo(stageId, 0, stageId.toString, 0, + Seq.empty, Seq.empty, "details")) + } + + private def createExecutorAddedEvent(executorId: Int) = { + SparkListenerExecutorAdded(0L, executorId.toString, new ExecutorInfo("host1", 1, Map.empty)) + } + + private def createExecutorRemovedEvent(executorId: Int) = { + SparkListenerExecutorRemoved(0L, executorId.toString, "test") + } + + private def createExecutorMetricsUpdateEvent( + executorId: Int, + executorMetrics: ExecutorMetrics): SparkListenerExecutorMetricsUpdate = { + val taskMetrics = TaskMetrics.empty + taskMetrics.incDiskBytesSpilled(111) + taskMetrics.incMemoryBytesSpilled(222) + val accum = Array((333L, 1, 1, taskMetrics.accumulators().map(AccumulatorSuite.makeInfo))) + SparkListenerExecutorMetricsUpdate(executorId.toString, accum, Some(executorMetrics)) + } + + /** Check 
that the Spark history log line matches the expected event. */ + private def checkEvent(line: String, event: SparkListenerEvent): Unit = { + assert(line.contains(event.getClass.toString.split("\\.").last)) + val parsed = JsonProtocol.sparkEventFromJson(parse(line)) + assert(parsed.getClass === event.getClass) + (event, parsed) match { + case (expected: SparkListenerStageSubmitted, actual: SparkListenerStageSubmitted) => + // accumulables can be different, so only check the stage Id + assert(expected.stageInfo.stageId == actual.stageInfo.stageId) + case (expected: SparkListenerStageCompleted, actual: SparkListenerStageCompleted) => + // accumulables can be different, so only check the stage Id + assert(expected.stageInfo.stageId == actual.stageInfo.stageId) + case (expected: SparkListenerEvent, actual: SparkListenerEvent) => + assert(expected === actual) + } + } + + /** + * Check that the Spark history log line is an StageExecutorMetrics event, and matches the + * expected value for the stage and executor. + * + * @param line the Spark history log line + * @param stageId the stage ID the ExecutorMetricsUpdate is associated with + * @param expectedEvents map of expected ExecutorMetricsUpdate events, for (stageId, executorId) + */ + private def checkStageExecutorMetrics( + line: String, + stageId: Int, + expectedEvents: Map[(Int, String), SparkListenerStageExecutorMetrics]): String = { + JsonProtocol.sparkEventFromJson(parse(line)) match { + case executorMetrics: SparkListenerStageExecutorMetrics => + expectedEvents.get((stageId, executorMetrics.execId)) match { + case Some(expectedMetrics) => + assert(executorMetrics.execId === expectedMetrics.execId) + assert(executorMetrics.stageId === expectedMetrics.stageId) + assert(executorMetrics.stageAttemptId === expectedMetrics.stageAttemptId) + ExecutorMetricType.values.foreach { metricType => + assert(executorMetrics.executorMetrics.getMetricValue(metricType) === + expectedMetrics.executorMetrics.getMetricValue(metricType)) + } + case None => + assert(false) + } + executorMetrics.execId + case _ => + fail("expecting SparkListenerStageExecutorMetrics") + } + } + private def readLines(in: InputStream): Seq[String] = { Source.fromInputStream(in).getLines().toSeq } @@ -299,6 +517,7 @@ object EventLoggingListenerSuite { conf.set("spark.eventLog.compress", "true") conf.set("spark.io.compression.codec", codec) } + conf.set("spark.eventLog.logStageExecutorMetrics.enabled", "true") conf } diff --git a/core/src/test/scala/org/apache/spark/scheduler/ExternalClusterManagerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/ExternalClusterManagerSuite.scala index b4705914b999..0621c98d4118 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/ExternalClusterManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/ExternalClusterManagerSuite.scala @@ -18,6 +18,7 @@ package org.apache.spark.scheduler import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSuite} +import org.apache.spark.executor.ExecutorMetrics import org.apache.spark.scheduler.SchedulingMode.SchedulingMode import org.apache.spark.storage.BlockManagerId import org.apache.spark.util.AccumulatorV2 @@ -92,5 +93,6 @@ private class DummyTaskScheduler extends TaskScheduler { def executorHeartbeatReceived( execId: String, accumUpdates: Array[(Long, Seq[AccumulatorV2[_, _]])], - blockManagerId: BlockManagerId): Boolean = true + blockManagerId: BlockManagerId, + executorMetrics: ExecutorMetrics): Boolean = true } diff --git 
a/core/src/test/scala/org/apache/spark/scheduler/MapStatusSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/MapStatusSuite.scala index 555e48bd28aa..2155a0f2b6c2 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/MapStatusSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/MapStatusSuite.scala @@ -60,7 +60,7 @@ class MapStatusSuite extends SparkFunSuite { stddev <- Seq(0.0, 0.01, 0.5, 1.0) ) { val sizes = Array.fill[Long](numSizes)(abs(round(Random.nextGaussian() * stddev)) + mean) - val status = MapStatus(BlockManagerId("a", "b", 10), sizes, 1) + val status = MapStatus(BlockManagerId("a", "b", 10), sizes) val status1 = compressAndDecompressMapStatus(status) for (i <- 0 until numSizes) { if (sizes(i) != 0) { @@ -74,7 +74,7 @@ class MapStatusSuite extends SparkFunSuite { test("large tasks should use " + classOf[HighlyCompressedMapStatus].getName) { val sizes = Array.fill[Long](2001)(150L) - val status = MapStatus(null, sizes, 1) + val status = MapStatus(null, sizes) assert(status.isInstanceOf[HighlyCompressedMapStatus]) assert(status.getSizeForBlock(10) === 150L) assert(status.getSizeForBlock(50) === 150L) @@ -86,7 +86,7 @@ class MapStatusSuite extends SparkFunSuite { val sizes = Array.tabulate[Long](3000) { i => i.toLong } val avg = sizes.sum / sizes.count(_ != 0) val loc = BlockManagerId("a", "b", 10) - val status = MapStatus(loc, sizes, 1) + val status = MapStatus(loc, sizes) val status1 = compressAndDecompressMapStatus(status) assert(status1.isInstanceOf[HighlyCompressedMapStatus]) assert(status1.location == loc) @@ -108,7 +108,7 @@ class MapStatusSuite extends SparkFunSuite { val smallBlockSizes = sizes.filter(n => n > 0 && n < threshold) val avg = smallBlockSizes.sum / smallBlockSizes.length val loc = BlockManagerId("a", "b", 10) - val status = MapStatus(loc, sizes, 1) + val status = MapStatus(loc, sizes) val status1 = compressAndDecompressMapStatus(status) assert(status1.isInstanceOf[HighlyCompressedMapStatus]) assert(status1.location == loc) @@ -164,7 +164,7 @@ class MapStatusSuite extends SparkFunSuite { SparkEnv.set(env) // Value of element in sizes is equal to the corresponding index. 
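For context on the MapStatusSuite changes in this file: the extra argument to MapStatus.apply is gone, so the choice of representation again depends only on the number of shuffle blocks, with HighlyCompressedMapStatus kicking in above the threshold MapStatus uses internally (2000 by default). A short illustration of the behaviour the remaining tests rely on, written as if inside this suite and not part of the patch itself:

// Representation is picked from the block count alone after this change.
val loc = BlockManagerId("exec-1", "host", 1234)
val small = MapStatus(loc, Array.fill[Long](100)(150L))
val large = MapStatus(loc, Array.fill[Long](2001)(150L))
assert(small.isInstanceOf[CompressedMapStatus])        // few blocks: exact sizes kept
assert(large.isInstanceOf[HighlyCompressedMapStatus])  // many blocks: sizes summarized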
val sizes = (0L to 2000L).toArray - val status1 = MapStatus(BlockManagerId("exec-0", "host-0", 100), sizes, 1) + val status1 = MapStatus(BlockManagerId("exec-0", "host-0", 100), sizes) val arrayStream = new ByteArrayOutputStream(102400) val objectOutputStream = new ObjectOutputStream(arrayStream) assert(status1.isInstanceOf[HighlyCompressedMapStatus]) @@ -188,32 +188,4 @@ class MapStatusSuite extends SparkFunSuite { assert(count === 3000) } } - - test("SPARK-24519: HighlyCompressedMapStatus has configurable threshold") { - val conf = new SparkConf() - val env = mock(classOf[SparkEnv]) - doReturn(conf).when(env).conf - SparkEnv.set(env) - val sizes = Array.fill[Long](500)(150L) - // Test default value - val status = MapStatus(null, sizes, 1) - assert(status.isInstanceOf[CompressedMapStatus]) - // Test Non-positive values - for (s <- -1 to 0) { - assertThrows[IllegalArgumentException] { - conf.set(config.SHUFFLE_MIN_NUM_PARTS_TO_HIGHLY_COMPRESS, s) - val status = MapStatus(null, sizes, 1) - } - } - // Test positive values - Seq(1, 100, 499, 500, 501).foreach { s => - conf.set(config.SHUFFLE_MIN_NUM_PARTS_TO_HIGHLY_COMPRESS, s) - val status = MapStatus(null, sizes, 1) - if(sizes.length > s) { - assert(status.isInstanceOf[HighlyCompressedMapStatus]) - } else { - assert(status.isInstanceOf[CompressedMapStatus]) - } - } - } } diff --git a/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala index e24d550a6266..d1113c7e0b10 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala @@ -22,6 +22,7 @@ import java.net.URI import java.util.concurrent.atomic.AtomicInteger import org.apache.hadoop.fs.Path +import org.json4s.JsonAST.JValue import org.json4s.jackson.JsonMethods._ import org.scalatest.BeforeAndAfter @@ -217,7 +218,9 @@ class ReplayListenerSuite extends SparkFunSuite with BeforeAndAfter with LocalSp // Verify the same events are replayed in the same order assert(sc.eventLogger.isDefined) - val originalEvents = sc.eventLogger.get.loggedEvents + val originalEvents = sc.eventLogger.get.loggedEvents.filter { e => + !JsonProtocol.sparkEventFromJson(e).isInstanceOf[SparkListenerStageExecutorMetrics] + } val replayedEvents = eventMonster.loggedEvents originalEvents.zip(replayedEvents).foreach { case (e1, e2) => // Don't compare the JSON here because accumulators in StageInfo may be out of order diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala index 2d409d94ca1b..ff0f99b5c94d 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala @@ -51,6 +51,9 @@ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends Spa var taskScheduler: TestTaskScheduler = null var scheduler: DAGScheduler = null var backend: T = _ + // Even though the tests aren't doing much, occassionally we see flakiness from pauses over + // a second (probably from GC?) 
so we leave a long timeout in here + val duration = Duration(10, SECONDS) override def beforeEach(): Unit = { if (taskScheduler != null) { @@ -539,7 +542,6 @@ class BasicSchedulerIntegrationSuite extends SchedulerIntegrationSuite[SingleCor } withBackend(runBackend _) { val jobFuture = submit(new MockRDD(sc, 10, Nil), (0 until 10).toArray) - val duration = Duration(1, SECONDS) awaitJobTermination(jobFuture, duration) } assert(results === (0 until 10).map { _ -> 42 }.toMap) @@ -592,7 +594,6 @@ class BasicSchedulerIntegrationSuite extends SchedulerIntegrationSuite[SingleCor } withBackend(runBackend _) { val jobFuture = submit(d, (0 until 30).toArray) - val duration = Duration(1, SECONDS) awaitJobTermination(jobFuture, duration) } assert(results === (0 until 30).map { idx => idx -> (4321 + idx) }.toMap) @@ -634,7 +635,6 @@ class BasicSchedulerIntegrationSuite extends SchedulerIntegrationSuite[SingleCor } withBackend(runBackend _) { val jobFuture = submit(shuffledRdd, (0 until 10).toArray) - val duration = Duration(1, SECONDS) awaitJobTermination(jobFuture, duration) } assertDataStructuresEmpty() @@ -649,7 +649,6 @@ class BasicSchedulerIntegrationSuite extends SchedulerIntegrationSuite[SingleCor } withBackend(runBackend _) { val jobFuture = submit(new MockRDD(sc, 10, Nil), (0 until 10).toArray) - val duration = Duration(1, SECONDS) awaitJobTermination(jobFuture, duration) assert(failure.getMessage.contains("test task failure")) } diff --git a/core/src/test/scala/org/apache/spark/security/CryptoStreamUtilsSuite.scala b/core/src/test/scala/org/apache/spark/security/CryptoStreamUtilsSuite.scala index 78f618f8a216..0d3611c80b8d 100644 --- a/core/src/test/scala/org/apache/spark/security/CryptoStreamUtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/security/CryptoStreamUtilsSuite.scala @@ -16,13 +16,16 @@ */ package org.apache.spark.security -import java.io.{ByteArrayInputStream, ByteArrayOutputStream, FileInputStream, FileOutputStream} -import java.nio.channels.Channels +import java.io._ +import java.nio.ByteBuffer +import java.nio.channels.{Channels, ReadableByteChannel} import java.nio.charset.StandardCharsets.UTF_8 import java.nio.file.Files import java.util.{Arrays, Random, UUID} import com.google.common.io.ByteStreams +import org.mockito.Matchers.any +import org.mockito.Mockito._ import org.apache.spark._ import org.apache.spark.internal.config._ @@ -164,6 +167,36 @@ class CryptoStreamUtilsSuite extends SparkFunSuite { } } + test("error handling wrapper") { + val wrapped = mock(classOf[ReadableByteChannel]) + val decrypted = mock(classOf[ReadableByteChannel]) + val errorHandler = new CryptoStreamUtils.ErrorHandlingReadableChannel(decrypted, wrapped) + + when(decrypted.read(any(classOf[ByteBuffer]))) + .thenThrow(new IOException()) + .thenThrow(new InternalError()) + .thenReturn(1) + + val out = ByteBuffer.allocate(1) + intercept[IOException] { + errorHandler.read(out) + } + intercept[InternalError] { + errorHandler.read(out) + } + + val e = intercept[IOException] { + errorHandler.read(out) + } + assert(e.getMessage().contains("is closed")) + errorHandler.close() + + verify(decrypted, times(2)).read(any(classOf[ByteBuffer])) + verify(wrapped, never()).read(any(classOf[ByteBuffer])) + verify(decrypted, never()).close() + verify(wrapped, times(1)).close() + } + private def createConf(extra: (String, String)*): SparkConf = { val conf = new SparkConf() extra.foreach { case (k, v) => conf.set(k, v) } diff --git a/core/src/test/scala/org/apache/spark/serializer/KryoBenchmark.scala 
b/core/src/test/scala/org/apache/spark/serializer/KryoBenchmark.scala index a1cf3570a7a6..f4fc0080f310 100644 --- a/core/src/test/scala/org/apache/spark/serializer/KryoBenchmark.scala +++ b/core/src/test/scala/org/apache/spark/serializer/KryoBenchmark.scala @@ -21,8 +21,8 @@ import scala.reflect.ClassTag import scala.util.Random import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.benchmark.Benchmark import org.apache.spark.serializer.KryoTest._ -import org.apache.spark.util.Benchmark class KryoBenchmark extends SparkFunSuite { val benchmark = new Benchmark("Benchmark Kryo Unsafe vs safe Serialization", 1024 * 1024 * 15, 10) diff --git a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala index 36912441c03b..ac25bcef5434 100644 --- a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala +++ b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala @@ -345,8 +345,7 @@ class KryoSerializerSuite extends SparkFunSuite with SharedSparkContext { val denseBlockSizes = new Array[Long](5000) val sparseBlockSizes = Array[Long](0L, 1L, 0L, 2L) Seq(denseBlockSizes, sparseBlockSizes).foreach { blockSizes => - ser.serialize( - HighlyCompressedMapStatus(BlockManagerId("exec-1", "host", 1234), blockSizes, 1)) + ser.serialize(HighlyCompressedMapStatus(BlockManagerId("exec-1", "host", 1234), blockSizes)) } } diff --git a/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala b/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala index ea80fea90534..0b2bbd2fa8a7 100644 --- a/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala @@ -22,18 +22,19 @@ import java.lang.{Integer => JInteger, Long => JLong} import java.util.{Arrays, Date, Properties} import scala.collection.JavaConverters._ +import scala.collection.immutable.Map import scala.reflect.{classTag, ClassTag} import org.scalatest.BeforeAndAfter import org.apache.spark._ -import org.apache.spark.executor.TaskMetrics +import org.apache.spark.executor.{ExecutorMetrics, TaskMetrics} +import org.apache.spark.metrics.ExecutorMetricType import org.apache.spark.scheduler._ import org.apache.spark.scheduler.cluster._ import org.apache.spark.status.api.v1 import org.apache.spark.storage._ import org.apache.spark.util.Utils -import org.apache.spark.util.kvstore._ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { @@ -881,12 +882,41 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { assert(dist.memoryRemaining === maxMemory - rdd2b1.memSize - rdd1b2.memSize ) } + // Add block1 of rdd1 back to bm 1. + listener.onBlockUpdated(SparkListenerBlockUpdated( + BlockUpdatedInfo(bm1, rdd1b1.blockId, level, rdd1b1.memSize, rdd1b1.diskSize))) + + check[ExecutorSummaryWrapper](bm1.executorId) { exec => + assert(exec.info.rddBlocks === 3L) + assert(exec.info.memoryUsed === rdd1b1.memSize + rdd1b2.memSize + rdd2b1.memSize) + assert(exec.info.diskUsed === rdd1b1.diskSize + rdd1b2.diskSize + rdd2b1.diskSize) + } + // Unpersist RDD1. listener.onUnpersistRDD(SparkListenerUnpersistRDD(rdd1b1.rddId)) intercept[NoSuchElementException] { check[RDDStorageInfoWrapper](rdd1b1.rddId) { _ => () } } + // executor1 now only contains block1 from rdd2. 
+ check[ExecutorSummaryWrapper](bm1.executorId) { exec => + assert(exec.info.rddBlocks === 1L) + assert(exec.info.memoryUsed === rdd2b1.memSize) + assert(exec.info.diskUsed === rdd2b1.diskSize) + } + + // Unpersist RDD2. + listener.onUnpersistRDD(SparkListenerUnpersistRDD(rdd2b1.rddId)) + intercept[NoSuchElementException] { + check[RDDStorageInfoWrapper](rdd2b1.rddId) { _ => () } + } + + check[ExecutorSummaryWrapper](bm1.executorId) { exec => + assert(exec.info.rddBlocks === 0L) + assert(exec.info.memoryUsed === 0) + assert(exec.info.diskUsed === 0) + } + // Update a StreamBlock. val stream1 = StreamBlockId(1, 1L) listener.onBlockUpdated(SparkListenerBlockUpdated( @@ -1263,6 +1293,130 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { } } + test("executor metrics updates") { + val listener = new AppStatusListener(store, conf, true) + + val driver = BlockManagerId(SparkContext.DRIVER_IDENTIFIER, "localhost", 42) + + listener.onExecutorAdded(createExecutorAddedEvent(1)) + listener.onExecutorAdded(createExecutorAddedEvent(2)) + listener.onStageSubmitted(createStageSubmittedEvent(0)) + // receive 3 metric updates from each executor with just stage 0 running, + // with different peak updates for each executor + listener.onExecutorMetricsUpdate(createExecutorMetricsUpdateEvent(1, + Array(4000L, 50L, 20L, 0L, 40L, 0L, 60L, 0L, 70L, 20L))) + listener.onExecutorMetricsUpdate(createExecutorMetricsUpdateEvent(2, + Array(1500L, 50L, 20L, 0L, 0L, 0L, 20L, 0L, 70L, 0L))) + // exec 1: new stage 0 peaks for metrics at indexes: 2, 4, 6 + listener.onExecutorMetricsUpdate(createExecutorMetricsUpdateEvent(1, + Array(4000L, 50L, 50L, 0L, 50L, 0L, 100L, 0L, 70L, 20L))) + // exec 2: new stage 0 peaks for metrics at indexes: 0, 4, 6 + listener.onExecutorMetricsUpdate(createExecutorMetricsUpdateEvent(2, + Array(2000L, 50L, 10L, 0L, 10L, 0L, 30L, 0L, 70L, 0L))) + // exec 1: new stage 0 peaks for metrics at indexes: 5, 7 + listener.onExecutorMetricsUpdate(createExecutorMetricsUpdateEvent(1, + Array(2000L, 40L, 50L, 0L, 40L, 10L, 90L, 10L, 50L, 0L))) + // exec 2: new stage 0 peaks for metrics at indexes: 0, 5, 6, 7, 8 + listener.onExecutorMetricsUpdate(createExecutorMetricsUpdateEvent(2, + Array(3500L, 50L, 15L, 0L, 10L, 10L, 35L, 10L, 80L, 0L))) + // now start stage 1, one more metric update for each executor, and new + // peaks for some stage 1 metrics (as listed), initialize stage 1 peaks + listener.onStageSubmitted(createStageSubmittedEvent(1)) + // exec 1: new stage 0 peaks for metrics at indexes: 0, 3, 7 + listener.onExecutorMetricsUpdate(createExecutorMetricsUpdateEvent(1, + Array(5000L, 30L, 50L, 20L, 30L, 10L, 80L, 30L, 50L, 0L))) + // exec 2: new stage 0 peaks for metrics at indexes: 0, 1, 2, 3, 6, 7, 9 + listener.onExecutorMetricsUpdate(createExecutorMetricsUpdateEvent(2, + Array(7000L, 80L, 50L, 20L, 0L, 10L, 50L, 30L, 10L, 40L))) + // complete stage 0, and 3 more updates for each executor with just + // stage 1 running + listener.onStageCompleted(createStageCompletedEvent(0)) + // exec 1: new stage 1 peaks for metrics at indexes: 0, 1, 3 + listener.onExecutorMetricsUpdate(createExecutorMetricsUpdateEvent(1, + Array(6000L, 70L, 20L, 30L, 10L, 0L, 30L, 30L, 30L, 0L))) + // exec 2: new stage 1 peaks for metrics at indexes: 3, 4, 7, 8 + listener.onExecutorMetricsUpdate(createExecutorMetricsUpdateEvent(2, + Array(5500L, 30L, 20L, 40L, 10L, 0L, 30L, 40L, 40L, 20L))) + // exec 1: new stage 1 peaks for metrics at indexes: 0, 4, 5, 7 + 
listener.onExecutorMetricsUpdate(createExecutorMetricsUpdateEvent(1, + Array(7000L, 70L, 5L, 25L, 60L, 30L, 65L, 55L, 30L, 0L))) + // exec 2: new stage 1 peak for metrics at index: 7 + listener.onExecutorMetricsUpdate(createExecutorMetricsUpdateEvent(2, + Array(5500L, 40L, 25L, 30L, 10L, 30L, 35L, 60L, 0L, 20L))) + // exec 1: no new stage 1 peaks + listener.onExecutorMetricsUpdate(createExecutorMetricsUpdateEvent(1, + Array(5500L, 70L, 15L, 20L, 55L, 20L, 70L, 40L, 20L, 0L))) + listener.onExecutorRemoved(createExecutorRemovedEvent(1)) + // exec 2: new stage 1 peak for metrics at index: 6 + listener.onExecutorMetricsUpdate(createExecutorMetricsUpdateEvent(2, + Array(4000L, 20L, 25L, 30L, 10L, 30L, 35L, 60L, 0L, 0L))) + listener.onStageCompleted(createStageCompletedEvent(1)) + + // expected peak values for each executor + val expectedValues = Map( + "1" -> new ExecutorMetrics(Array(7000L, 70L, 50L, 30L, 60L, 30L, 100L, 55L, 70L, 20L)), + "2" -> new ExecutorMetrics(Array(7000L, 80L, 50L, 40L, 10L, 30L, 50L, 60L, 80L, 40L))) + + // check that the stored peak values match the expected values + expectedValues.foreach { case (id, metrics) => + check[ExecutorSummaryWrapper](id) { exec => + assert(exec.info.id === id) + exec.info.peakMemoryMetrics match { + case Some(actual) => + ExecutorMetricType.values.foreach { metricType => + assert(actual.getMetricValue(metricType) === metrics.getMetricValue(metricType)) + } + case _ => + assert(false) + } + } + } + } + + test("stage executor metrics") { + // simulate reading in StageExecutorMetrics events from the history log + val listener = new AppStatusListener(store, conf, false) + val driver = BlockManagerId(SparkContext.DRIVER_IDENTIFIER, "localhost", 42) + + listener.onExecutorAdded(createExecutorAddedEvent(1)) + listener.onExecutorAdded(createExecutorAddedEvent(2)) + listener.onStageSubmitted(createStageSubmittedEvent(0)) + listener.onStageSubmitted(createStageSubmittedEvent(1)) + listener.onStageExecutorMetrics(SparkListenerStageExecutorMetrics("1", 0, 0, + new ExecutorMetrics(Array(5000L, 50L, 50L, 20L, 50L, 10L, 100L, 30L, 70L, 20L)))) + listener.onStageExecutorMetrics(SparkListenerStageExecutorMetrics("2", 0, 0, + new ExecutorMetrics(Array(7000L, 70L, 50L, 20L, 10L, 10L, 50L, 30L, 80L, 40L)))) + listener.onStageCompleted(createStageCompletedEvent(0)) + // executor 1 is removed before stage 1 has finished, the stage executor metrics + // are logged afterwards and should still be used to update the executor metrics. 
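To make the hard-coded expectations above easier to audit: the listener's peak tracking amounts to an element-wise maximum over every update an executor sent while it was alive. A standalone sketch (not part of the patch) that reproduces the expected array for executor "1":

// All updates sent for executor "1" in the test above, in order.
val exec1Updates = Seq(
  Array(4000L, 50L, 20L, 0L, 40L, 0L, 60L, 0L, 70L, 20L),
  Array(4000L, 50L, 50L, 0L, 50L, 0L, 100L, 0L, 70L, 20L),
  Array(2000L, 40L, 50L, 0L, 40L, 10L, 90L, 10L, 50L, 0L),
  Array(5000L, 30L, 50L, 20L, 30L, 10L, 80L, 30L, 50L, 0L),
  Array(6000L, 70L, 20L, 30L, 10L, 0L, 30L, 30L, 30L, 0L),
  Array(7000L, 70L, 5L, 25L, 60L, 30L, 65L, 55L, 30L, 0L),
  Array(5500L, 70L, 15L, 20L, 55L, 20L, 70L, 40L, 20L, 0L))
// Per-index maximum over all updates.
val peaks = exec1Updates.reduce((a, b) => a.zip(b).map { case (x, y) => math.max(x, y) })
// peaks == Array(7000L, 70L, 50L, 30L, 60L, 30L, 100L, 55L, 70L, 20L), the value checked above.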
+ listener.onExecutorRemoved(createExecutorRemovedEvent(1)) + listener.onStageExecutorMetrics(SparkListenerStageExecutorMetrics("1", 1, 0, + new ExecutorMetrics(Array(7000L, 70L, 50L, 30L, 60L, 30L, 80L, 55L, 50L, 0L)))) + listener.onStageExecutorMetrics(SparkListenerStageExecutorMetrics("2", 1, 0, + new ExecutorMetrics(Array(7000L, 80L, 50L, 40L, 10L, 30L, 50L, 60L, 40L, 40L)))) + listener.onStageCompleted(createStageCompletedEvent(1)) + + // expected peak values for each executor + val expectedValues = Map( + "1" -> new ExecutorMetrics(Array(7000L, 70L, 50L, 30L, 60L, 30L, 100L, 55L, 70L, 20L)), + "2" -> new ExecutorMetrics(Array(7000L, 80L, 50L, 40L, 10L, 30L, 50L, 60L, 80L, 40L))) + + // check that the stored peak values match the expected values + for ((id, metrics) <- expectedValues) { + check[ExecutorSummaryWrapper](id) { exec => + assert(exec.info.id === id) + exec.info.peakMemoryMetrics match { + case Some(actual) => + ExecutorMetricType.values.foreach { metricType => + assert(actual.getMetricValue(metricType) === metrics.getMetricValue(metricType)) + } + case _ => + assert(false) + } + } + } + } + private def key(stage: StageInfo): Array[Int] = Array(stage.stageId, stage.attemptNumber) private def check[T: ClassTag](key: Any)(fn: T => Unit): Unit = { @@ -1300,4 +1454,37 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { } + /** Create a stage submitted event for the specified stage Id. */ + private def createStageSubmittedEvent(stageId: Int) = { + SparkListenerStageSubmitted(new StageInfo(stageId, 0, stageId.toString, 0, + Seq.empty, Seq.empty, "details")) + } + + /** Create a stage completed event for the specified stage Id. */ + private def createStageCompletedEvent(stageId: Int) = { + SparkListenerStageCompleted(new StageInfo(stageId, 0, stageId.toString, 0, + Seq.empty, Seq.empty, "details")) + } + + /** Create an executor added event for the specified executor Id. */ + private def createExecutorAddedEvent(executorId: Int) = { + SparkListenerExecutorAdded(0L, executorId.toString, new ExecutorInfo("host1", 1, Map.empty)) + } + + /** Create an executor added event for the specified executor Id. */ + private def createExecutorRemovedEvent(executorId: Int) = { + SparkListenerExecutorRemoved(10L, executorId.toString, "test") + } + + /** Create an executor metrics update event, with the specified executor metrics values. 
*/ + private def createExecutorMetricsUpdateEvent( + executorId: Int, + executorMetrics: Array[Long]): SparkListenerExecutorMetricsUpdate = { + val taskMetrics = TaskMetrics.empty + taskMetrics.incDiskBytesSpilled(111) + taskMetrics.incMemoryBytesSpilled(222) + val accum = Array((333L, 1, 1, taskMetrics.accumulators().map(AccumulatorSuite.makeInfo))) + SparkListenerExecutorMetricsUpdate(executorId.toString, accum, + Some(new ExecutorMetrics(executorMetrics))) + } } diff --git a/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala b/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala index dbee1f60d7af..32d6e8b94e1a 100644 --- a/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala @@ -43,7 +43,7 @@ import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer} import org.apache.spark.network.client.{RpcResponseCallback, TransportClient} import org.apache.spark.network.netty.{NettyBlockTransferService, SparkTransportConf} import org.apache.spark.network.server.{NoOpRpcHandler, TransportServer, TransportServerBootstrap} -import org.apache.spark.network.shuffle.{BlockFetchingListener, TempFileManager} +import org.apache.spark.network.shuffle.{BlockFetchingListener, DownloadFileManager} import org.apache.spark.network.shuffle.protocol.{BlockTransferMessage, RegisterExecutor} import org.apache.spark.network.util.TransportConf import org.apache.spark.rpc.RpcEnv @@ -1437,7 +1437,7 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE class MockBlockTransferService(val maxFailures: Int) extends BlockTransferService { var numCalls = 0 - var tempFileManager: TempFileManager = null + var tempFileManager: DownloadFileManager = null override def init(blockDataManager: BlockDataManager): Unit = {} @@ -1447,7 +1447,7 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE execId: String, blockIds: Array[String], listener: BlockFetchingListener, - tempFileManager: TempFileManager): Unit = { + tempFileManager: DownloadFileManager): Unit = { listener.onBlockFetchSuccess("mockBlockId", new NioManagedBuffer(ByteBuffer.allocate(1))) } @@ -1474,7 +1474,7 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE port: Int, execId: String, blockId: String, - tempFileManager: TempFileManager): ManagedBuffer = { + tempFileManager: DownloadFileManager): ManagedBuffer = { numCalls += 1 this.tempFileManager = tempFileManager if (numCalls <= maxFailures) { diff --git a/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala b/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala index a2997dbd1b1a..b268195e09a5 100644 --- a/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala @@ -33,7 +33,7 @@ import org.scalatest.PrivateMethodTester import org.apache.spark.{SparkFunSuite, TaskContext} import org.apache.spark.network._ import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer} -import org.apache.spark.network.shuffle.{BlockFetchingListener, TempFileManager} +import org.apache.spark.network.shuffle.{BlockFetchingListener, DownloadFileManager} import org.apache.spark.network.util.LimitedInputStream import org.apache.spark.shuffle.FetchFailedException import org.apache.spark.util.Utils @@ -478,12 +478,12 @@ 
class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT val remoteBlocks = Map[BlockId, ManagedBuffer]( ShuffleBlockId(0, 0, 0) -> createMockManagedBuffer()) val transfer = mock(classOf[BlockTransferService]) - var tempFileManager: TempFileManager = null + var tempFileManager: DownloadFileManager = null when(transfer.fetchBlocks(any(), any(), any(), any(), any(), any())) .thenAnswer(new Answer[Unit] { override def answer(invocation: InvocationOnMock): Unit = { val listener = invocation.getArguments()(4).asInstanceOf[BlockFetchingListener] - tempFileManager = invocation.getArguments()(5).asInstanceOf[TempFileManager] + tempFileManager = invocation.getArguments()(5).asInstanceOf[DownloadFileManager] Future { listener.onBlockFetchSuccess( ShuffleBlockId(0, 0, 0).toString, remoteBlocks(ShuffleBlockId(0, 0, 0))) diff --git a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala index 74b72d940eee..1e0d2af9a471 100644 --- a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala @@ -30,6 +30,7 @@ import org.scalatest.exceptions.TestFailedException import org.apache.spark._ import org.apache.spark.executor._ +import org.apache.spark.metrics.ExecutorMetricType import org.apache.spark.rdd.RDDOperationScope import org.apache.spark.scheduler._ import org.apache.spark.scheduler.cluster.ExecutorInfo @@ -94,11 +95,17 @@ class JsonProtocolSuite extends SparkFunSuite { makeTaskMetrics(300L, 400L, 500L, 600L, 700, 800, hasHadoopInput = true, hasOutput = true) .accumulators().map(AccumulatorSuite.makeInfo) .zipWithIndex.map { case (a, i) => a.copy(id = i) } - SparkListenerExecutorMetricsUpdate("exec3", Seq((1L, 2, 3, accumUpdates))) + val executorUpdates = new ExecutorMetrics( + Array(543L, 123456L, 12345L, 1234L, 123L, 12L, 432L, 321L, 654L, 765L)) + SparkListenerExecutorMetricsUpdate("exec3", Seq((1L, 2, 3, accumUpdates)), + Some(executorUpdates)) } val blockUpdated = SparkListenerBlockUpdated(BlockUpdatedInfo(BlockManagerId("Stars", "In your multitude...", 300), RDDBlockId(0, 0), StorageLevel.MEMORY_ONLY, 100L, 0L)) + val stageExecutorMetrics = + SparkListenerStageExecutorMetrics("1", 2, 3, + new ExecutorMetrics(Array(543L, 123456L, 12345L, 1234L, 123L, 12L, 432L, 321L, 654L, 765L))) testEvent(stageSubmitted, stageSubmittedJsonString) testEvent(stageCompleted, stageCompletedJsonString) @@ -124,6 +131,7 @@ class JsonProtocolSuite extends SparkFunSuite { testEvent(nodeUnblacklisted, nodeUnblacklistedJsonString) testEvent(executorMetricsUpdate, executorMetricsUpdateJsonString) testEvent(blockUpdated, blockUpdatedJsonString) + testEvent(stageExecutorMetrics, stageExecutorMetricsJsonString) } test("Dependent Classes") { @@ -419,6 +427,30 @@ class JsonProtocolSuite extends SparkFunSuite { exceptionFailure.accumUpdates, oldExceptionFailure.accumUpdates, (x, y) => x == y) } + test("ExecutorMetricsUpdate backward compatibility: executor metrics update") { + // executorMetricsUpdate was added in 2.4.0. 
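The two compatibility tests below follow the usual JsonProtocol pattern: strip the newly added field from the JSON and check that the reader falls back to a sane default (None for the whole executor-metrics block, 0 for an individual metric). In isolation, the json4s side of that pattern looks roughly like this; the JSON literal is abbreviated and only illustrative:

import org.json4s._
import org.json4s.jackson.JsonMethods._

// An event written by an older Spark has no "Executor Metrics Updated" field at all.
val oldJson = parse("""{"Event": "SparkListenerExecutorMetricsUpdate", "Executor ID": "exec3"}""")
// json4s reports the missing field as JNothing, which the reader maps to executorUpdates = None.
assert((oldJson \ "Executor Metrics Updated") == JNothing)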
+ val executorMetricsUpdate = makeExecutorMetricsUpdate("1", true, true) + val oldExecutorMetricsUpdateJson = + JsonProtocol.executorMetricsUpdateToJson(executorMetricsUpdate) + .removeField( _._1 == "Executor Metrics Updated") + val exepectedExecutorMetricsUpdate = makeExecutorMetricsUpdate("1", true, false) + assertEquals(exepectedExecutorMetricsUpdate, + JsonProtocol.executorMetricsUpdateFromJson(oldExecutorMetricsUpdateJson)) + } + + test("executorMetricsFromJson backward compatibility: handle missing metrics") { + // any missing metrics should be set to 0 + val executorMetrics = new ExecutorMetrics( + Array(12L, 23L, 45L, 67L, 78L, 89L, 90L, 123L, 456L, 789L)) + val oldExecutorMetricsJson = + JsonProtocol.executorMetricsToJson(executorMetrics) + .removeField( _._1 == "MappedPoolMemory") + val expectedExecutorMetrics = new ExecutorMetrics( + Array(12L, 23L, 45L, 67L, 78L, 89L, 90L, 123L, 456L, 0L)) + assertEquals(expectedExecutorMetrics, + JsonProtocol.executorMetricsFromJson(oldExecutorMetricsJson)) + } + test("AccumulableInfo value de/serialization") { import InternalAccumulator._ val blocks = Seq[(BlockId, BlockStatus)]( @@ -435,7 +467,6 @@ class JsonProtocolSuite extends SparkFunSuite { testAccumValue(Some("anything"), blocks, JString(blocks.toString)) testAccumValue(Some("anything"), 123, JString("123")) } - } @@ -565,6 +596,13 @@ private[spark] object JsonProtocolSuite extends Assertions { assert(stageAttemptId1 === stageAttemptId2) assertSeqEquals[AccumulableInfo](updates1, updates2, (a, b) => a.equals(b)) }) + assertOptionEquals(e1.executorUpdates, e2.executorUpdates, + (e1: ExecutorMetrics, e2: ExecutorMetrics) => assertEquals(e1, e2)) + case (e1: SparkListenerStageExecutorMetrics, e2: SparkListenerStageExecutorMetrics) => + assert(e1.execId === e2.execId) + assert(e1.stageId === e2.stageId) + assert(e1.stageAttemptId === e2.stageAttemptId) + assertEquals(e1.executorMetrics, e2.executorMetrics) case (e1, e2) => assert(e1 === e2) case _ => fail("Events don't match in types!") @@ -715,6 +753,12 @@ private[spark] object JsonProtocolSuite extends Assertions { assertStackTraceElementEquals) } + private def assertEquals(metrics1: ExecutorMetrics, metrics2: ExecutorMetrics) { + ExecutorMetricType.values.foreach { metricType => + assert(metrics1.getMetricValue(metricType) === metrics2.getMetricValue(metricType)) + } + } + private def assertJsonStringEquals(expected: String, actual: String, metadata: String) { val expectedJson = pretty(parse(expected)) val actualJson = pretty(parse(actual)) @@ -765,7 +809,6 @@ private[spark] object JsonProtocolSuite extends Assertions { assert(ste1 === ste2) } - /** ----------------------------------- * | Util methods for constructing events | * ------------------------------------ */ @@ -820,6 +863,27 @@ private[spark] object JsonProtocolSuite extends Assertions { new AccumulableInfo(id, Some(s"Accumulable$id"), Some(s"delta$id"), Some(s"val$id"), internal, countFailedValues, metadata) + /** Creates an SparkListenerExecutorMetricsUpdate event */ + private def makeExecutorMetricsUpdate( + execId: String, + includeTaskMetrics: Boolean, + includeExecutorMetrics: Boolean): SparkListenerExecutorMetricsUpdate = { + val taskMetrics = + if (includeTaskMetrics) { + Seq((1L, 1, 1, Seq(makeAccumulableInfo(1, false, false, None), + makeAccumulableInfo(2, false, false, None)))) + } else { + Seq() + } + val executorMetricsUpdate = + if (includeExecutorMetrics) { + Some(new ExecutorMetrics(Array(123456L, 543L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L))) + } else { + None + } 
+ SparkListenerExecutorMetricsUpdate(execId, taskMetrics, executorMetricsUpdate) + } + /** * Creates a TaskMetrics object describing a task that read data from Hadoop (if hasHadoopInput is * set to true) or read data from a shuffle otherwise. @@ -2007,7 +2071,42 @@ private[spark] object JsonProtocolSuite extends Assertions { | } | ] | } - | ] + | ], + | "Executor Metrics Updated" : { + | "JVMHeapMemory" : 543, + | "JVMOffHeapMemory" : 123456, + | "OnHeapExecutionMemory" : 12345, + | "OffHeapExecutionMemory" : 1234, + | "OnHeapStorageMemory" : 123, + | "OffHeapStorageMemory" : 12, + | "OnHeapUnifiedMemory" : 432, + | "OffHeapUnifiedMemory" : 321, + | "DirectPoolMemory" : 654, + | "MappedPoolMemory" : 765 + | } + | + |} + """.stripMargin + + private val stageExecutorMetricsJsonString = + """ + |{ + | "Event": "SparkListenerStageExecutorMetrics", + | "Executor ID": "1", + | "Stage ID": 2, + | "Stage Attempt ID": 3, + | "Executor Metrics" : { + | "JVMHeapMemory" : 543, + | "JVMOffHeapMemory" : 123456, + | "OnHeapExecutionMemory" : 12345, + | "OffHeapExecutionMemory" : 1234, + | "OnHeapStorageMemory" : 123, + | "OffHeapStorageMemory" : 12, + | "OnHeapUnifiedMemory" : 432, + | "OffHeapUnifiedMemory" : 321, + | "DirectPoolMemory" : 654, + | "MappedPoolMemory" : 765 + | } |} """.stripMargin diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index 418d2f9b8850..39f4fba78583 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -1184,6 +1184,55 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { assert(Utils.getSimpleName(classOf[MalformedClassObject.MalformedClass]) === "UtilsSuite$MalformedClassObject$MalformedClass") } + + test("stringHalfWidth") { + // scalastyle:off nonascii + assert(Utils.stringHalfWidth(null) == 0) + assert(Utils.stringHalfWidth("") == 0) + assert(Utils.stringHalfWidth("ab c") == 4) + assert(Utils.stringHalfWidth("1098") == 4) + assert(Utils.stringHalfWidth("mø") == 2) + assert(Utils.stringHalfWidth("γύρ") == 3) + assert(Utils.stringHalfWidth("pê") == 2) + assert(Utils.stringHalfWidth("ー") == 2) + assert(Utils.stringHalfWidth("测") == 2) + assert(Utils.stringHalfWidth("か") == 2) + assert(Utils.stringHalfWidth("걸") == 2) + assert(Utils.stringHalfWidth("à") == 1) + assert(Utils.stringHalfWidth("焼") == 2) + assert(Utils.stringHalfWidth("羍む") == 4) + assert(Utils.stringHalfWidth("뺭ᾘ") == 3) + assert(Utils.stringHalfWidth("\u0967\u0968\u0969") == 3) + // scalastyle:on nonascii + } + + test("trimExceptCRLF standalone") { + val crlfSet = Set("\r", "\n") + val nonPrintableButCRLF = (0 to 32).map(_.toChar.toString).toSet -- crlfSet + + // identity for CRLF + crlfSet.foreach { s => Utils.trimExceptCRLF(s) === s } + + // empty for other non-printables + nonPrintableButCRLF.foreach { s => assert(Utils.trimExceptCRLF(s) === "") } + + // identity for a printable string + assert(Utils.trimExceptCRLF("a") === "a") + + // identity for strings with CRLF + crlfSet.foreach { s => + assert(Utils.trimExceptCRLF(s"${s}a") === s"${s}a") + assert(Utils.trimExceptCRLF(s"a${s}") === s"a${s}") + assert(Utils.trimExceptCRLF(s"b${s}b") === s"b${s}b") + } + + // trim nonPrintableButCRLF except when inside a string + nonPrintableButCRLF.foreach { s => + assert(Utils.trimExceptCRLF(s"${s}a") === "a") + assert(Utils.trimExceptCRLF(s"a${s}") === "a") + assert(Utils.trimExceptCRLF(s"b${s}b") === s"b${s}b") + } + } } 
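The trimming contract pinned down by the `trimExceptCRLF` assertions above can be summarised in a few lines. The sketch below is only an illustration of that contract (strip leading/trailing characters in the 0–32 range, but keep `\r` and `\n`); it is not Spark's actual `Utils.trimExceptCRLF` implementation.

```scala
// Illustrative only: a behaviour sketch consistent with the assertions above,
// not the real org.apache.spark.util.Utils.trimExceptCRLF.
def trimExceptCRLF(s: String): String = {
  // a character survives trimming if it is printable (> 0x20) or is CR/LF
  def keep(c: Char): Boolean = c > '\u0020' || c == '\r' || c == '\n'
  s.dropWhile(c => !keep(c)).reverse.dropWhile(c => !keep(c)).reverse
}
```

For example, `trimExceptCRLF("\u0001abc\u0002")` yields `"abc"`, while `"\r\nabc"` is left untouched, matching the cases exercised in the test.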
private class SimpleExtension diff --git a/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala index d542ba0b6640..cd2526578413 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala @@ -17,9 +17,8 @@ package org.apache.spark.util.collection -import java.util.Objects - import scala.collection.mutable.ArrayBuffer +import scala.concurrent.duration._ import scala.ref.WeakReference import org.scalatest.Matchers @@ -459,7 +458,7 @@ class ExternalAppendOnlyMapSuite extends SparkFunSuite // https://github.com/scala/scala/blob/2.13.x/test/junit/scala/tools/testing/AssertUtil.scala // (lines 69-89) // assert(map.currentMap == null) - eventually { + eventually(timeout(5 seconds), interval(200 milliseconds)) { System.gc() // direct asserts introduced some macro generated code that held a reference to the map val tmpIsNull = null == underlyingMapRef.get.orNull @@ -509,7 +508,7 @@ class ExternalAppendOnlyMapSuite extends SparkFunSuite .sorted assert(it.isEmpty) - assert(keys == (0 until 100)) + assert(keys == (0 until 100).toList) assert(map.numSpills == 0) // these asserts try to show that we're no longer holding references to the underlying map. diff --git a/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala index 3e56db5ea116..47173b89e91e 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala @@ -26,7 +26,7 @@ import org.apache.spark._ import org.apache.spark.memory.MemoryTestingUtils import org.apache.spark.serializer.{JavaSerializer, KryoSerializer} import org.apache.spark.unsafe.array.LongArray -import org.apache.spark.unsafe.memory.OnHeapMemoryBlock +import org.apache.spark.unsafe.memory.MemoryBlock import org.apache.spark.util.collection.unsafe.sort.{PrefixComparators, RecordPointerAndKeyPrefix, UnsafeSortDataFormat} class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { @@ -105,8 +105,9 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { // the form [150000000, 150000001, 150000002, ...., 300000000, 0, 1, 2, ..., 149999999] // that can trigger copyRange() in TimSort.mergeLo() or TimSort.mergeHi() val ref = Array.tabulate[Long](size) { i => if (i < size / 2) size / 2 + i else i } - val buf = new LongArray(OnHeapMemoryBlock.fromArray(ref)) - val tmpBuf = new LongArray(new OnHeapMemoryBlock((size/2) * 8L)) + val buf = new LongArray(MemoryBlock.fromLongArray(ref)) + val tmp = new Array[Long](size/2) + val tmpBuf = new LongArray(MemoryBlock.fromLongArray(tmp)) new Sorter(new UnsafeSortDataFormat(tmpBuf)).sort( buf, 0, size, new Comparator[RecordPointerAndKeyPrefix] { diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala index 151235dd0fb9..68bcc5e5a509 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashMapSuite.scala @@ -185,16 +185,6 @@ class OpenHashMapSuite extends SparkFunSuite with Matchers { assert(map.contains(null)) } - test("support for more than 12M 
items") { - val cnt = 12000000 // 12M - val map = new OpenHashMap[Int, Int](cnt) - for (i <- 0 until cnt) { - map(i) = 1 - } - val numInvalidValues = map.iterator.count(_._2 == 0) - assertResult(0)(numInvalidValues) - } - test("distinguish between the 0/0.0/0L and null") { val specializedMap1 = new OpenHashMap[String, Long] specializedMap1("a") = null.asInstanceOf[Long] diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala index b887f937a9da..44d2118d7794 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala @@ -255,4 +255,17 @@ class OpenHashSetSuite extends SparkFunSuite with Matchers { val set = new OpenHashSet[Long](0) assert(set.size === 0) } + + test("support for more than 12M items") { + val cnt = 12000000 // 12M + val set = new OpenHashSet[Int](cnt) + for (i <- 0 until cnt) { + set.add(i) + assert(set.contains(i)) + + val pos1 = set.getPos(i) + val pos2 = set.addWithoutResize(i) & OpenHashSet.POSITION_MASK + assert(pos1 == pos2) + } + } } diff --git a/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/RadixSortSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/RadixSortSuite.scala index ddf3740e76a7..d5956ea32096 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/RadixSortSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/RadixSortSuite.scala @@ -27,7 +27,7 @@ import com.google.common.primitives.Ints import org.apache.spark.SparkFunSuite import org.apache.spark.internal.Logging import org.apache.spark.unsafe.array.LongArray -import org.apache.spark.unsafe.memory.OnHeapMemoryBlock +import org.apache.spark.unsafe.memory.MemoryBlock import org.apache.spark.util.collection.Sorter import org.apache.spark.util.random.XORShiftRandom @@ -78,14 +78,14 @@ class RadixSortSuite extends SparkFunSuite with Logging { private def generateTestData(size: Long, rand: => Long): (Array[JLong], LongArray) = { val ref = Array.tabulate[Long](Ints.checkedCast(size)) { i => rand } val extended = ref ++ Array.fill[Long](Ints.checkedCast(size))(0) - (ref.map(i => new JLong(i)), new LongArray(OnHeapMemoryBlock.fromArray(extended))) + (ref.map(i => new JLong(i)), new LongArray(MemoryBlock.fromLongArray(extended))) } private def generateKeyPrefixTestData(size: Long, rand: => Long): (LongArray, LongArray) = { val ref = Array.tabulate[Long](Ints.checkedCast(size * 2)) { i => rand } val extended = ref ++ Array.fill[Long](Ints.checkedCast(size * 2))(0) - (new LongArray(OnHeapMemoryBlock.fromArray(ref)), - new LongArray(OnHeapMemoryBlock.fromArray(extended))) + (new LongArray(MemoryBlock.fromLongArray(ref)), + new LongArray(MemoryBlock.fromLongArray(extended))) } private def collectToArray(array: LongArray, offset: Int, length: Long): Array[Long] = { @@ -110,7 +110,7 @@ class RadixSortSuite extends SparkFunSuite with Logging { } private def referenceKeyPrefixSort(buf: LongArray, lo: Long, hi: Long, refCmp: PrefixComparator) { - val sortBuffer = new LongArray(new OnHeapMemoryBlock(buf.size() * 8L)) + val sortBuffer = new LongArray(MemoryBlock.fromLongArray(new Array[Long](buf.size().toInt))) new Sorter(new UnsafeSortDataFormat(sortBuffer)).sort( buf, Ints.checkedCast(lo), Ints.checkedCast(hi), new Comparator[RecordPointerAndKeyPrefix] { override def compare( diff --git a/dev/.rat-excludes 
b/dev/.rat-excludes index 466135e72233..777950016801 100644 --- a/dev/.rat-excludes +++ b/dev/.rat-excludes @@ -81,6 +81,7 @@ app-20180109111548-0000 app-20161115172038-0000 app-20161116163331-0000 application_1516285256255_0012 +application_1506645932520_24630151 local-1422981759269 local-1422981780767 local-1425081759269 diff --git a/dev/create-release/do-release-docker.sh b/dev/create-release/do-release-docker.sh index fa7b73cdb40e..c1a122ebfb12 100755 --- a/dev/create-release/do-release-docker.sh +++ b/dev/create-release/do-release-docker.sh @@ -135,6 +135,9 @@ if [ -n "$JAVA" ]; then JAVA_VOL="--volume $JAVA:/opt/spark-java" fi +# SPARK-24530: Sphinx must work with python 3 to generate doc correctly. +echo "SPHINXPYTHON=/opt/p35/bin/python" >> $ENVFILE + echo "Building $RELEASE_TAG; output will be at $WORKDIR/output" docker run -ti \ --env-file "$ENVFILE" \ diff --git a/dev/create-release/generate-contributors.py b/dev/create-release/generate-contributors.py index 131d81c8a75c..d9135173419a 100755 --- a/dev/create-release/generate-contributors.py +++ b/dev/create-release/generate-contributors.py @@ -67,7 +67,7 @@ print("Release tag: %s" % RELEASE_TAG) print("Previous release tag: %s" % PREVIOUS_RELEASE_TAG) print("Number of commits in this range: %s" % len(new_commits)) -print +print("") def print_indented(_list): @@ -88,10 +88,10 @@ def print_indented(_list): def is_release(commit_title): - return re.findall("\[release\]", commit_title.lower()) or \ - "preparing spark release" in commit_title.lower() or \ - "preparing development version" in commit_title.lower() or \ - "CHANGES.txt" in commit_title + return ("[release]" in commit_title.lower() or + "preparing spark release" in commit_title.lower() or + "preparing development version" in commit_title.lower() or + "CHANGES.txt" in commit_title) def is_maintenance(commit_title): diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index 73610a333591..cce5f8b6975c 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -76,9 +76,8 @@ for env in ASF_USERNAME GPG_PASSPHRASE GPG_KEY; do fi done -# Explicitly set locale in order to make `sort` output consistent across machines. -# See https://stackoverflow.com/questions/28881 for more details. 
-export LC_ALL=C +export LC_ALL=C.UTF-8 +export LANG=C.UTF-8 # Commit ref to checkout when building GIT_REF=${GIT_REF:-master} @@ -114,8 +113,6 @@ BASE_PROFILES="-Pmesos -Pyarn" PUBLISH_SCALA_2_10=0 SCALA_2_10_PROFILES="-Pscala-2.10" SCALA_2_11_PROFILES= -SCALA_2_12_PROFILES="-Pscala-2.12" - if [[ $SPARK_VERSION > "2.3" ]]; then BASE_PROFILES="$BASE_PROFILES -Pkubernetes -Pflume" SCALA_2_11_PROFILES="-Pkafka-0-8" @@ -123,6 +120,12 @@ else PUBLISH_SCALA_2_10=1 fi +PUBLISH_SCALA_2_12=0 +SCALA_2_12_PROFILES="-Pscala-2.12" +if [[ $SPARK_VERSION > "2.4" ]]; then + PUBLISH_SCALA_2_12=1 +fi + # Hive-specific profiles for some builds HIVE_PROFILES="-Phive -Phive-thriftserver" # Profiles for publishing snapshots and release to Maven Central @@ -171,6 +174,10 @@ if [[ "$1" == "package" ]]; then # Source and binary tarballs echo "Packaging release source tarballs" cp -r spark spark-$SPARK_VERSION + # For source release, exclude copy of binary license/notice + rm spark-$SPARK_VERSION/LICENSE-binary + rm spark-$SPARK_VERSION/NOTICE-binary + rm -r spark-$SPARK_VERSION/licenses-binary tar cvzf spark-$SPARK_VERSION.tgz spark-$SPARK_VERSION echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --armour --output spark-$SPARK_VERSION.tgz.asc \ --detach-sig spark-$SPARK_VERSION.tgz @@ -183,8 +190,9 @@ if [[ "$1" == "package" ]]; then # Updated for each binary build make_binary_release() { NAME=$1 - FLAGS="$MVN_EXTRA_OPTS -B $SCALA_2_11_PROFILES $BASE_RELEASE_PROFILES $2" + FLAGS="$MVN_EXTRA_OPTS -B $BASE_RELEASE_PROFILES $2" BUILD_PACKAGE=$3 + SCALA_VERSION=$4 # We increment the Zinc port each time to avoid OOM's and other craziness if multiple builds # share the same Zinc server. @@ -194,10 +202,9 @@ if [[ "$1" == "package" ]]; then cp -r spark spark-$SPARK_VERSION-bin-$NAME cd spark-$SPARK_VERSION-bin-$NAME - # TODO There should probably be a flag to make-distribution to allow 2.12 support - #if [[ $FLAGS == *scala-2.12* ]]; then - # ./dev/change-scala-version.sh 2.12 - #fi + if [[ "$SCALA_VERSION" != "2.11" ]]; then + ./dev/change-scala-version.sh $SCALA_VERSION + fi export ZINC_PORT=$ZINC_PORT echo "Creating distribution: $NAME ($FLAGS)" @@ -271,7 +278,7 @@ if [[ "$1" == "package" ]]; then BINARY_PKGS_ARGS["hadoop2.7"]="-Phadoop-2.7 $HIVE_PROFILES" if ! is_dry_run; then BINARY_PKGS_ARGS["hadoop2.6"]="-Phadoop-2.6 $HIVE_PROFILES" - BINARY_PKGS_ARGS["without-hadoop"]="-Pwithout-hadoop" + BINARY_PKGS_ARGS["without-hadoop"]="-Phadoop-provided" if [[ $SPARK_VERSION < "2.2." ]]; then BINARY_PKGS_ARGS["hadoop2.4"]="-Phadoop-2.4 $HIVE_PROFILES" BINARY_PKGS_ARGS["hadoop2.3"]="-Phadoop-2.3 $HIVE_PROFILES" @@ -288,11 +295,20 @@ if [[ "$1" == "package" ]]; then for key in ${!BINARY_PKGS_ARGS[@]}; do args=${BINARY_PKGS_ARGS[$key]} extra=${BINARY_PKGS_EXTRA[$key]} - if ! make_binary_release "$key" "$args" "$extra"; then + if ! make_binary_release "$key" "$SCALA_2_11_PROFILES $args" "$extra" "2.11"; then error "Failed to build $key package. Check logs for details." fi done + if [[ $PUBLISH_SCALA_2_12 = 1 ]]; then + key="without-hadoop-scala-2.12" + args="-Phadoop-provided" + extra="" + if ! make_binary_release "$key" "$SCALA_2_12_PROFILES $args" "$extra" "2.12"; then + error "Failed to build $key package. Check logs for details." + fi + fi + rm -rf spark-$SPARK_VERSION-bin-*/ if ! 
is_dry_run; then @@ -411,14 +427,16 @@ if [[ "$1" == "publish-release" ]]; then -DskipTests $PUBLISH_PROFILES $SCALA_2_10_PROFILES clean install fi - #./dev/change-scala-version.sh 2.12 - #$MVN -DzincPort=$ZINC_PORT -Dmaven.repo.local=$tmp_repo \ - # -DskipTests $SCALA_2_12_PROFILES §$PUBLISH_PROFILES clean install + if ! is_dry_run && [[ $PUBLISH_SCALA_2_12 = 1 ]]; then + ./dev/change-scala-version.sh 2.12 + $MVN -DzincPort=$((ZINC_PORT + 2)) -Dmaven.repo.local=$tmp_repo -Dscala-2.12 \ + -DskipTests $PUBLISH_PROFILES $SCALA_2_12_PROFILES clean install + fi # Clean-up Zinc nailgun process $LSOF -P |grep $ZINC_PORT | grep LISTEN | awk '{ print $2; }' | xargs kill - #./dev/change-scala-version.sh 2.11 + ./dev/change-scala-version.sh 2.11 pushd $tmp_repo/org/apache/spark diff --git a/dev/create-release/releaseutils.py b/dev/create-release/releaseutils.py index 8cc990d87184..f273b337fdb4 100755 --- a/dev/create-release/releaseutils.py +++ b/dev/create-release/releaseutils.py @@ -235,7 +235,7 @@ def translate_component(component, commit_hash, warnings): # Parse components in the commit message # The returned components are already filtered and translated def find_components(commit, commit_hash): - components = re.findall("\[\w*\]", commit.lower()) + components = re.findall(r"\[\w*\]", commit.lower()) components = [translate_component(c, commit_hash) for c in components if c in known_components] return components diff --git a/dev/create-release/spark-rm/Dockerfile b/dev/create-release/spark-rm/Dockerfile index 07ce320177f5..42315446016c 100644 --- a/dev/create-release/spark-rm/Dockerfile +++ b/dev/create-release/spark-rm/Dockerfile @@ -62,8 +62,8 @@ RUN echo 'deb http://cran.cnr.Berkeley.edu/bin/linux/ubuntu xenial/' >> /etc/apt pip install $BASE_PIP_PKGS && \ pip install $PIP_PKGS && \ cd && \ - virtualenv -p python3 p35 && \ - . p35/bin/activate && \ + virtualenv -p python3 /opt/p35 && \ + . /opt/p35/bin/activate && \ pip install $BASE_PIP_PKGS && \ pip install $PIP_PKGS && \ # Install R packages and dependencies used when building. @@ -73,7 +73,7 @@ RUN echo 'deb http://cran.cnr.Berkeley.edu/bin/linux/ubuntu xenial/' >> /etc/apt Rscript -e "install.packages(c('curl', 'xml2', 'httr', 'devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2', 'e1071', 'survival'), repos='http://cran.us.r-project.org/')" && \ Rscript -e "devtools::install_github('jimhester/lintr')" && \ # Install tools needed to build the documentation. 
- $APT_INSTALL ruby2.3 ruby2.3-dev && \ + $APT_INSTALL ruby2.3 ruby2.3-dev mkdocs && \ gem install jekyll --no-rdoc --no-ri && \ gem install jekyll-redirect-from && \ gem install pygments.rb diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index 62ae04dbc255..e0e3e0a82e73 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -34,7 +34,7 @@ commons-beanutils-core-1.8.0.jar commons-cli-1.2.jar commons-codec-1.10.jar commons-collections-3.2.2.jar -commons-compiler-3.0.9.jar +commons-compiler-3.0.10.jar commons-compress-1.8.1.jar commons-configuration-1.6.jar commons-crypto-1.0.0.jar @@ -87,18 +87,18 @@ htrace-core-3.0.4.jar httpclient-4.5.6.jar httpcore-4.4.10.jar ivy-2.4.0.jar -jackson-annotations-2.6.7.jar -jackson-core-2.6.7.jar +jackson-annotations-2.9.6.jar +jackson-core-2.9.6.jar jackson-core-asl-1.9.13.jar -jackson-databind-2.6.7.1.jar -jackson-dataformat-yaml-2.6.7.jar +jackson-databind-2.9.6.jar +jackson-dataformat-yaml-2.9.6.jar jackson-jaxrs-1.9.13.jar jackson-mapper-asl-1.9.13.jar -jackson-module-jaxb-annotations-2.6.7.jar -jackson-module-paranamer-2.7.9.jar -jackson-module-scala_2.11-2.6.7.1.jar +jackson-module-jaxb-annotations-2.9.6.jar +jackson-module-paranamer-2.9.6.jar +jackson-module-scala_2.11-2.9.6.jar jackson-xc-1.9.13.jar -janino-3.0.9.jar +janino-3.0.10.jar javassist-3.18.1-GA.jar javax.annotation-api-1.2.jar javax.inject-1.jar @@ -138,7 +138,7 @@ libfb303-0.9.3.jar libthrift-0.9.3.jar log4j-1.2.17.jar logging-interceptor-3.8.1.jar -lz4-java-1.4.0.jar +lz4-java-1.5.0.jar machinist_2.11-0.6.1.jar macro-compat_2.11-1.1.1.jar mesos-1.4.0-shaded-protobuf.jar @@ -153,9 +153,9 @@ objenesis-2.5.1.jar okhttp-3.8.1.jar okio-1.13.0.jar opencsv-2.3.jar -orc-core-1.5.2-nohive.jar -orc-mapreduce-1.5.2-nohive.jar -orc-shims-1.5.2.jar +orc-core-1.5.3-nohive.jar +orc-mapreduce-1.5.3-nohive.jar +orc-shims-1.5.3.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar paranamer-2.8.jar @@ -177,7 +177,7 @@ scala-xml_2.11-1.0.5.jar shapeless_2.11-2.3.2.jar slf4j-api-1.7.16.jar slf4j-log4j12-1.7.16.jar -snakeyaml-1.15.jar +snakeyaml-1.18.jar snappy-0.2.jar snappy-java-1.1.7.1.jar spire-macros_2.11-0.13.0.jar diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index 5e12ca053af5..3b17f88a82c1 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -34,7 +34,7 @@ commons-beanutils-core-1.8.0.jar commons-cli-1.2.jar commons-codec-1.10.jar commons-collections-3.2.2.jar -commons-compiler-3.0.9.jar +commons-compiler-3.0.10.jar commons-compress-1.8.1.jar commons-configuration-1.6.jar commons-crypto-1.0.0.jar @@ -64,21 +64,21 @@ gson-2.2.4.jar guava-14.0.1.jar guice-3.0.jar guice-servlet-3.0.jar -hadoop-annotations-2.7.7.jar -hadoop-auth-2.7.7.jar -hadoop-client-2.7.7.jar -hadoop-common-2.7.7.jar -hadoop-hdfs-2.7.7.jar -hadoop-mapreduce-client-app-2.7.7.jar -hadoop-mapreduce-client-common-2.7.7.jar -hadoop-mapreduce-client-core-2.7.7.jar -hadoop-mapreduce-client-jobclient-2.7.7.jar -hadoop-mapreduce-client-shuffle-2.7.7.jar -hadoop-yarn-api-2.7.7.jar -hadoop-yarn-client-2.7.7.jar -hadoop-yarn-common-2.7.7.jar -hadoop-yarn-server-common-2.7.7.jar -hadoop-yarn-server-web-proxy-2.7.7.jar +hadoop-annotations-2.7.3.jar +hadoop-auth-2.7.3.jar +hadoop-client-2.7.3.jar +hadoop-common-2.7.3.jar +hadoop-hdfs-2.7.3.jar +hadoop-mapreduce-client-app-2.7.3.jar +hadoop-mapreduce-client-common-2.7.3.jar +hadoop-mapreduce-client-core-2.7.3.jar +hadoop-mapreduce-client-jobclient-2.7.3.jar 
+hadoop-mapreduce-client-shuffle-2.7.3.jar +hadoop-yarn-api-2.7.3.jar +hadoop-yarn-client-2.7.3.jar +hadoop-yarn-common-2.7.3.jar +hadoop-yarn-server-common-2.7.3.jar +hadoop-yarn-server-web-proxy-2.7.3.jar hk2-api-2.4.0-b34.jar hk2-locator-2.4.0-b34.jar hk2-utils-2.4.0-b34.jar @@ -87,18 +87,18 @@ htrace-core-3.1.0-incubating.jar httpclient-4.5.6.jar httpcore-4.4.10.jar ivy-2.4.0.jar -jackson-annotations-2.6.7.jar -jackson-core-2.6.7.jar +jackson-annotations-2.9.6.jar +jackson-core-2.9.6.jar jackson-core-asl-1.9.13.jar -jackson-databind-2.6.7.1.jar -jackson-dataformat-yaml-2.6.7.jar +jackson-databind-2.9.6.jar +jackson-dataformat-yaml-2.9.6.jar jackson-jaxrs-1.9.13.jar jackson-mapper-asl-1.9.13.jar -jackson-module-jaxb-annotations-2.6.7.jar -jackson-module-paranamer-2.7.9.jar -jackson-module-scala_2.11-2.6.7.1.jar +jackson-module-jaxb-annotations-2.9.6.jar +jackson-module-paranamer-2.9.6.jar +jackson-module-scala_2.11-2.9.6.jar jackson-xc-1.9.13.jar -janino-3.0.9.jar +janino-3.0.10.jar javassist-3.18.1-GA.jar javax.annotation-api-1.2.jar javax.inject-1.jar @@ -117,7 +117,6 @@ jersey-guava-2.22.2.jar jersey-media-jaxb-2.22.2.jar jersey-server-2.22.2.jar jetty-6.1.26.jar -jetty-sslengine-6.1.26.jar jetty-util-6.1.26.jar jline-2.14.6.jar joda-time-2.9.3.jar @@ -140,7 +139,7 @@ libfb303-0.9.3.jar libthrift-0.9.3.jar log4j-1.2.17.jar logging-interceptor-3.8.1.jar -lz4-java-1.4.0.jar +lz4-java-1.5.0.jar machinist_2.11-0.6.1.jar macro-compat_2.11-1.1.1.jar mesos-1.4.0-shaded-protobuf.jar @@ -155,9 +154,9 @@ objenesis-2.5.1.jar okhttp-3.8.1.jar okio-1.13.0.jar opencsv-2.3.jar -orc-core-1.5.2-nohive.jar -orc-mapreduce-1.5.2-nohive.jar -orc-shims-1.5.2.jar +orc-core-1.5.3-nohive.jar +orc-mapreduce-1.5.3-nohive.jar +orc-shims-1.5.3.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar paranamer-2.8.jar @@ -179,7 +178,7 @@ scala-xml_2.11-1.0.5.jar shapeless_2.11-2.3.2.jar slf4j-api-1.7.16.jar slf4j-log4j12-1.7.16.jar -snakeyaml-1.15.jar +snakeyaml-1.18.jar snappy-0.2.jar snappy-java-1.1.7.1.jar spire-macros_2.11-0.13.0.jar diff --git a/dev/deps/spark-deps-hadoop-3.1 b/dev/deps/spark-deps-hadoop-3.1 index 641b4a15ad7c..c818b2c39f74 100644 --- a/dev/deps/spark-deps-hadoop-3.1 +++ b/dev/deps/spark-deps-hadoop-3.1 @@ -31,7 +31,7 @@ commons-beanutils-1.9.3.jar commons-cli-1.2.jar commons-codec-1.10.jar commons-collections-3.2.2.jar -commons-compiler-3.0.9.jar +commons-compiler-3.0.10.jar commons-compress-1.8.1.jar commons-configuration2-2.1.1.jar commons-crypto-1.0.0.jar @@ -86,18 +86,18 @@ htrace-core4-4.1.0-incubating.jar httpclient-4.5.6.jar httpcore-4.4.10.jar ivy-2.4.0.jar -jackson-annotations-2.6.7.jar -jackson-core-2.6.7.jar +jackson-annotations-2.9.6.jar +jackson-core-2.9.6.jar jackson-core-asl-1.9.13.jar -jackson-databind-2.6.7.1.jar -jackson-dataformat-yaml-2.6.7.jar +jackson-databind-2.9.6.jar +jackson-dataformat-yaml-2.9.6.jar jackson-jaxrs-base-2.7.8.jar jackson-jaxrs-json-provider-2.7.8.jar jackson-mapper-asl-1.9.13.jar -jackson-module-jaxb-annotations-2.6.7.jar -jackson-module-paranamer-2.7.9.jar -jackson-module-scala_2.11-2.6.7.1.jar -janino-3.0.9.jar +jackson-module-jaxb-annotations-2.9.6.jar +jackson-module-paranamer-2.9.6.jar +jackson-module-scala_2.11-2.9.6.jar +janino-3.0.10.jar javassist-3.18.1-GA.jar javax.annotation-api-1.2.jar javax.inject-1.jar @@ -154,7 +154,7 @@ libfb303-0.9.3.jar libthrift-0.9.3.jar log4j-1.2.17.jar logging-interceptor-3.8.1.jar -lz4-java-1.4.0.jar +lz4-java-1.5.0.jar machinist_2.11-0.6.1.jar macro-compat_2.11-1.1.1.jar mesos-1.4.0-shaded-protobuf.jar @@ -172,9 
+172,9 @@ okhttp-2.7.5.jar okhttp-3.8.1.jar okio-1.13.0.jar opencsv-2.3.jar -orc-core-1.5.2-nohive.jar -orc-mapreduce-1.5.2-nohive.jar -orc-shims-1.5.2.jar +orc-core-1.5.3-nohive.jar +orc-mapreduce-1.5.3-nohive.jar +orc-shims-1.5.3.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar paranamer-2.8.jar @@ -197,7 +197,7 @@ scala-xml_2.11-1.0.5.jar shapeless_2.11-2.3.2.jar slf4j-api-1.7.16.jar slf4j-log4j12-1.7.16.jar -snakeyaml-1.15.jar +snakeyaml-1.18.jar snappy-0.2.jar snappy-java-1.1.7.1.jar spire-macros_2.11-0.13.0.jar diff --git a/dev/lint-python b/dev/lint-python index f738af9c4976..e26bd4bd4517 100755 --- a/dev/lint-python +++ b/dev/lint-python @@ -36,7 +36,7 @@ compile_status="${PIPESTATUS[0]}" # Get pycodestyle at runtime so that we don't rely on it being installed on the build server. # See: https://github.com/apache/spark/pull/1744#issuecomment-50982162 # Updated to the latest official version of pep8. pep8 is formally renamed to pycodestyle. -PYCODESTYLE_VERSION="2.3.1" +PYCODESTYLE_VERSION="2.4.0" PYCODESTYLE_SCRIPT_PATH="$SPARK_ROOT_DIR/dev/pycodestyle-$PYCODESTYLE_VERSION.py" PYCODESTYLE_SCRIPT_REMOTE_PATH="https://raw.githubusercontent.com/PyCQA/pycodestyle/$PYCODESTYLE_VERSION/pycodestyle.py" @@ -82,6 +82,23 @@ else rm "$PYCODESTYLE_REPORT_PATH" fi +# stop the build if there are Python syntax errors or undefined names +flake8 . --count --select=E901,E999,F821,F822,F823 --max-line-length=100 --show-source --statistics +flake8_status="${PIPESTATUS[0]}" + +if [ "$flake8_status" -eq 0 ]; then + lint_status=0 +else + lint_status=1 +fi + +if [ "$lint_status" -ne 0 ]; then + echo "flake8 checks failed." + exit "$lint_status" +else + echo "flake8 checks passed." +fi + # Check that the documentation builds acceptably, skip check if sphinx is not installed. 
if hash "$SPHINXBUILD" 2> /dev/null; then cd python/docs diff --git a/dev/make-distribution.sh b/dev/make-distribution.sh index ad99ce55806a..668682fbb913 100755 --- a/dev/make-distribution.sh +++ b/dev/make-distribution.sh @@ -192,6 +192,7 @@ fi if [ -d "$SPARK_HOME"/resource-managers/kubernetes/core/target/ ]; then mkdir -p "$DISTDIR/kubernetes/" cp -a "$SPARK_HOME"/resource-managers/kubernetes/docker/src/main/dockerfiles "$DISTDIR/kubernetes/" + cp -a "$SPARK_HOME"/resource-managers/kubernetes/integration-tests/tests "$DISTDIR/kubernetes/" fi # Copy examples and dependencies @@ -212,7 +213,6 @@ cp -r "$SPARK_HOME/examples/src/main" "$DISTDIR/examples/src/" # Copy license and ASF files cp "$SPARK_HOME/LICENSE-binary" "$DISTDIR/LICENSE" -mkdir -p "$DISTDIR/licenses" cp -r "$SPARK_HOME/licenses-binary" "$DISTDIR/licenses" cp "$SPARK_HOME/NOTICE-binary" "$DISTDIR/NOTICE" diff --git a/dev/merge_spark_pr.py b/dev/merge_spark_pr.py index 81daa909e019..cca6f405e89a 100755 --- a/dev/merge_spark_pr.py +++ b/dev/merge_spark_pr.py @@ -274,7 +274,7 @@ def resolve_jira_issue(merge_branches, comment, default_jira_id=""): versions = sorted(versions, key=lambda x: x.name, reverse=True) versions = filter(lambda x: x.raw['released'] is False, versions) # Consider only x.y.z versions - versions = filter(lambda x: re.match('\d+\.\d+\.\d+', x.name), versions) + versions = filter(lambda x: re.match(r'\d+\.\d+\.\d+', x.name), versions) default_fix_versions = map(lambda x: fix_version_from_branch(x, versions).name, merge_branches) for v in default_fix_versions: @@ -403,7 +403,7 @@ def standardize_jira_ref(text): # Extract spark component(s): # Look for alphanumeric chars, spaces, dashes, periods, and/or commas - pattern = re.compile(r'(\[[\w\s,-\.]+\])', re.IGNORECASE) + pattern = re.compile(r'(\[[\w\s,.-]+\])', re.IGNORECASE) for component in pattern.findall(text): components.append(component.upper()) text = text.replace(component, '') diff --git a/dev/requirements.txt b/dev/requirements.txt index fa833ab96b8e..3fdd3425ffcc 100644 --- a/dev/requirements.txt +++ b/dev/requirements.txt @@ -1,3 +1,4 @@ +flake8==3.5.0 jira==1.0.3 PyGithub==1.26.0 Unidecode==0.04.19 diff --git a/dev/run-tests-jenkins.py b/dev/run-tests-jenkins.py index e6fe3b82ed20..eca88f2391bf 100755 --- a/dev/run-tests-jenkins.py +++ b/dev/run-tests-jenkins.py @@ -115,7 +115,8 @@ def run_tests(tests_timeout): os.path.join(SPARK_HOME, 'dev', 'run-tests')]).wait() failure_note_by_errcode = { - 1: 'executing the `dev/run-tests` script', # error to denote run-tests script failures + # error to denote run-tests script failures: + 1: 'executing the `dev/run-tests` script', ERROR_CODES["BLOCK_GENERAL"]: 'some tests', ERROR_CODES["BLOCK_RAT"]: 'RAT tests', ERROR_CODES["BLOCK_SCALA_STYLE"]: 'Scala style tests', @@ -130,7 +131,7 @@ def run_tests(tests_timeout): ERROR_CODES["BLOCK_PYSPARK_UNIT_TESTS"]: 'PySpark unit tests', ERROR_CODES["BLOCK_PYSPARK_PIP_TESTS"]: 'PySpark pip packaging tests', ERROR_CODES["BLOCK_SPARKR_UNIT_TESTS"]: 'SparkR unit tests', - ERROR_CODES["BLOCK_TIMEOUT"]: 'from timeout after a configured wait of \`%s\`' % ( + ERROR_CODES["BLOCK_TIMEOUT"]: 'from timeout after a configured wait of `%s`' % ( tests_timeout) } diff --git a/dev/run-tests.py b/dev/run-tests.py index d9d3789ac125..f534637b80d6 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -169,7 +169,7 @@ def determine_java_version(java_exe): # find raw version string, eg 'java version "1.8.0_25"' raw_version_str = next(x for x in raw_output_lines if " version " in x) - 
match = re.search('(\d+)\.(\d+)\.(\d+)', raw_version_str) + match = re.search(r'(\d+)\.(\d+)\.(\d+)', raw_version_str) major = int(match.group(1)) minor = int(match.group(2)) diff --git a/dev/scalastyle b/dev/scalastyle index b8053df05fa2..b0ad02523826 100755 --- a/dev/scalastyle +++ b/dev/scalastyle @@ -29,6 +29,7 @@ ERRORS=$(echo -e "q\n" \ -Pflume \ -Phive \ -Phive-thriftserver \ + -Pspark-ganglia-lgpl \ scalastyle test:scalastyle \ | awk '{if($1~/error/)print}' \ ) diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 2aa355504bf2..e7ac063e234e 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -387,6 +387,8 @@ def __hash__(self): "pyspark.profiler", "pyspark.shuffle", "pyspark.tests", + "pyspark.test_broadcast", + "pyspark.test_serializers", "pyspark.util", ] ) @@ -555,6 +557,16 @@ def __hash__(self): sbt_test_goals=["kubernetes/test"] ) + +spark_ganglia_lgpl = Module( + name="spark-ganglia-lgpl", + dependencies=[], + build_profile_flags=["-Pspark-ganglia-lgpl"], + source_file_regexes=[ + "external/spark-ganglia-lgpl", + ] +) + # The root module is a dummy module which is used to run all of the tests. # No other modules should directly depend on this module. root = Module( diff --git a/dev/tox.ini b/dev/tox.ini index 28dad8f3b5c7..6ec223b743b4 100644 --- a/dev/tox.ini +++ b/dev/tox.ini @@ -14,6 +14,6 @@ # limitations under the License. [pycodestyle] -ignore=E402,E731,E241,W503,E226,E722,E741,E305 +ignore=E226,E241,E305,E402,E722,E731,E741,W503,W504 max-line-length=100 exclude=cloudpickle.py,heapq3.py,shared.py,python/docs/conf.py,work/*/*.py,python/.eggs/*,dist/* diff --git a/docs/_config.yml b/docs/_config.yml index 095fadb93fe5..c3ef98575fa6 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -14,10 +14,10 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. -SPARK_VERSION: 2.4.0-SNAPSHOT -SPARK_VERSION_SHORT: 2.4.0 +SPARK_VERSION: 3.0.0-SNAPSHOT +SPARK_VERSION_SHORT: 3.0.0 SCALA_BINARY_VERSION: "2.11" -SCALA_VERSION: "2.11.8" +SCALA_VERSION: "2.11.12" MESOS_VERSION: 1.0.0 SPARK_ISSUE_TRACKER_URL: https://issues.apache.org/jira/browse/SPARK SPARK_GITHUB_URL: https://github.com/apache/spark diff --git a/docs/building-spark.md b/docs/building-spark.md index 1d3e0b1b7d39..1501f0bb8454 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -67,7 +67,7 @@ Examples: ./build/mvn -Pyarn -DskipTests clean package # Apache Hadoop 2.7.X and later - ./build/mvn -Pyarn -Phadoop-2.7 -Dhadoop.version=2.7.7 -DskipTests clean package + ./build/mvn -Pyarn -Phadoop-2.7 -Dhadoop.version=2.7.3 -DskipTests clean package ## Building With Hive and JDBC Support diff --git a/docs/configuration.md b/docs/configuration.md index f344bcd20087..613e214783d5 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -188,7 +188,7 @@ of the most common options to set are: unless otherwise specified. If set, PySpark memory for an executor will be limited to this amount. If not set, Spark will not limit Python's memory use and it is up to the application to avoid exceeding the overhead memory space - shared with other non-JVM processes. When PySpark is run in YARN, this memory + shared with other non-JVM processes. When PySpark is run in YARN or Kubernetes, this memory is added to executor resource requests. 
@@ -746,6 +746,13 @@ Apart from these, the following properties are also available, and may be useful *Warning*: This will increase the size of the event log considerably. + + spark.eventLog.longForm.enabled + false + + If true, use the long form of call sites in the event log. Otherwise use the short form. + + spark.eventLog.compress false @@ -786,6 +793,13 @@ Apart from these, the following properties are also available, and may be useful Buffer size to use when writing to output streams, in KiB unless otherwise specified. + + spark.ui.dagGraph.retainedRootRDDs + Int.MaxValue + + How many DAG graph nodes the Spark UI and status APIs remember before garbage collecting. + + spark.ui.enabled true @@ -800,6 +814,15 @@ Apart from these, the following properties are also available, and may be useful Allows jobs and stages to be killed from the web UI. + + spark.ui.liveUpdate.period + 100ms + + How often to update live entities. -1 means "never update" when replaying applications, + meaning only the last write will happen. For live applications, this avoids a few + operations that we can live without when rapidly processing incoming task events. + + spark.ui.port 4040 @@ -931,7 +954,7 @@ Apart from these, the following properties are also available, and may be useful org.apache.spark.io.LZ4CompressionCodec, org.apache.spark.io.LZFCompressionCodec, org.apache.spark.io.SnappyCompressionCodec, - and org.apache.spark.io.ZstdCompressionCodec. + and org.apache.spark.io.ZStdCompressionCodec. diff --git a/docs/monitoring.md b/docs/monitoring.md index 2717dd091c75..69bf3082f0f2 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -185,6 +185,23 @@ Security options for the Spark History Server are covered more detail in the Job history files older than this will be deleted when the filesystem history cleaner runs. + + spark.history.fs.endEventReparseChunkSize + 1m + + How many bytes to parse at the end of log files looking for the end event. + This is used to speed up generation of application listings by skipping unnecessary + parts of event log files. It can be disabled by setting this config to 0. + + + + spark.history.fs.inProgressOptimization.enabled + true + + Enable optimized handling of in-progress logs. This option may leave finished + applications that fail to rename their event logs listed as in-progress. + + spark.history.fs.numReplayThreads 25% of available cores @@ -192,6 +209,14 @@ Security options for the Spark History Server are covered more detail in the Number of threads that will be used by history server to process event logs. + + spark.history.store.maxDiskUsage + 10g + + Maximum disk usage for the local directory where the cache application history information + are stored. + + spark.history.store.path (none) @@ -388,6 +413,158 @@ value triggering garbage collection on jobs, and `spark.ui.retainedStages` that Note that the garbage collection takes place on playback: it is possible to retrieve more entries by increasing these values and restarting the history server. +### Executor Task Metrics + +The REST API exposes the values of the Task Metrics collected by Spark executors with the granularity +of task execution. The metrics can be used for performance troubleshooting and workload characterization. 
+A list of the available metrics, with a short description:
+
+<table class="table">
+  <tr><th>Spark Executor Task Metric name</th><th>Short description</th></tr>
+  <tr><td>executorRunTime</td><td>Elapsed time the executor spent running this task. This includes time fetching shuffle data. The value is expressed in milliseconds.</td></tr>
+  <tr><td>executorCpuTime</td><td>CPU time the executor spent running this task. This includes time fetching shuffle data. The value is expressed in nanoseconds.</td></tr>
+  <tr><td>executorDeserializeTime</td><td>Elapsed time spent to deserialize this task. The value is expressed in milliseconds.</td></tr>
+  <tr><td>executorDeserializeCpuTime</td><td>CPU time taken on the executor to deserialize this task. The value is expressed in nanoseconds.</td></tr>
+  <tr><td>resultSize</td><td>The number of bytes this task transmitted back to the driver as the TaskResult.</td></tr>
+  <tr><td>jvmGCTime</td><td>Elapsed time the JVM spent in garbage collection while executing this task. The value is expressed in milliseconds.</td></tr>
+  <tr><td>resultSerializationTime</td><td>Elapsed time spent serializing the task result. The value is expressed in milliseconds.</td></tr>
+  <tr><td>memoryBytesSpilled</td><td>The number of in-memory bytes spilled by this task.</td></tr>
+  <tr><td>diskBytesSpilled</td><td>The number of on-disk bytes spilled by this task.</td></tr>
+  <tr><td>peakExecutionMemory</td><td>Peak memory used by internal data structures created during shuffles, aggregations and joins. The value of this accumulator should be approximately the sum of the peak sizes across all such data structures created in this task. For SQL jobs, this only tracks all unsafe operators and ExternalSort.</td></tr>
+  <tr><td>inputMetrics.*</td><td>Metrics related to reading data from [[org.apache.spark.rdd.HadoopRDD]] or from persisted data.</td></tr>
+  <tr><td>&nbsp;&nbsp;.bytesRead</td><td>Total number of bytes read.</td></tr>
+  <tr><td>&nbsp;&nbsp;.recordsRead</td><td>Total number of records read.</td></tr>
+  <tr><td>outputMetrics.*</td><td>Metrics related to writing data externally (e.g. to a distributed filesystem), defined only in tasks with output.</td></tr>
+  <tr><td>&nbsp;&nbsp;.bytesWritten</td><td>Total number of bytes written</td></tr>
+  <tr><td>&nbsp;&nbsp;.recordsWritten</td><td>Total number of records written</td></tr>
+  <tr><td>shuffleReadMetrics.*</td><td>Metrics related to shuffle read operations.</td></tr>
+  <tr><td>&nbsp;&nbsp;.recordsRead</td><td>Number of records read in shuffle operations</td></tr>
+  <tr><td>&nbsp;&nbsp;.remoteBlocksFetched</td><td>Number of remote blocks fetched in shuffle operations</td></tr>
+  <tr><td>&nbsp;&nbsp;.localBlocksFetched</td><td>Number of local (as opposed to read from a remote executor) blocks fetched in shuffle operations</td></tr>
+  <tr><td>&nbsp;&nbsp;.totalBlocksFetched</td><td>Number of blocks fetched in shuffle operations (both local and remote)</td></tr>
+  <tr><td>&nbsp;&nbsp;.remoteBytesRead</td><td>Number of remote bytes read in shuffle operations</td></tr>
+  <tr><td>&nbsp;&nbsp;.localBytesRead</td><td>Number of bytes read in shuffle operations from local disk (as opposed to read from a remote executor)</td></tr>
+  <tr><td>&nbsp;&nbsp;.totalBytesRead</td><td>Number of bytes read in shuffle operations (both local and remote)</td></tr>
+  <tr><td>&nbsp;&nbsp;.remoteBytesReadToDisk</td><td>Number of remote bytes read to disk in shuffle operations. Large blocks are fetched to disk in shuffle read operations, as opposed to being read into memory, which is the default behavior.</td></tr>
+  <tr><td>&nbsp;&nbsp;.fetchWaitTime</td><td>Time the task spent waiting for remote shuffle blocks. This only includes the time blocking on shuffle input data. For instance if block B is being fetched while the task is still not finished processing block A, it is not considered to be blocking on block B. The value is expressed in milliseconds.</td></tr>
+  <tr><td>shuffleWriteMetrics.*</td><td>Metrics related to operations writing shuffle data.</td></tr>
+  <tr><td>&nbsp;&nbsp;.bytesWritten</td><td>Number of bytes written in shuffle operations</td></tr>
+  <tr><td>&nbsp;&nbsp;.recordsWritten</td><td>Number of records written in shuffle operations</td></tr>
+  <tr><td>&nbsp;&nbsp;.writeTime</td><td>Time spent blocking on writes to disk or buffer cache. The value is expressed in nanoseconds.</td></tr>
+</table>
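The same metrics are also visible programmatically. As a rough illustration (not part of this patch), a `SparkListener` can read a subset of the values documented in the table from `TaskMetrics` when a task completes; the log format and listener name below are made up for the example.

```scala
import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd}

// Illustrative listener: logs a few of the task metrics documented above.
class TaskMetricLogger extends SparkListener {
  override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = {
    val m = taskEnd.taskMetrics
    if (m != null) {
      println(s"stage=${taskEnd.stageId} runTime=${m.executorRunTime} ms " +
        s"cpuTime=${m.executorCpuTime} ns gc=${m.jvmGCTime} ms " +
        s"shuffleRead=${m.shuffleReadMetrics.totalBytesRead} bytes")
    }
  }
}

// Register it on an existing SparkContext: sc.addSparkListener(new TaskMetricLogger)
```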
    + + + ### API Versioning Policy These endpoints have been strongly versioned to make it easier to develop applications on top. diff --git a/docs/rdd-programming-guide.md b/docs/rdd-programming-guide.md index d95b757f3685..9a07d6ca24b6 100644 --- a/docs/rdd-programming-guide.md +++ b/docs/rdd-programming-guide.md @@ -859,7 +859,7 @@ We could also use `counts.sortByKey()`, for example, to sort the pairs alphabeti **Note:** when using custom objects as the key in key-value pair operations, you must be sure that a custom `equals()` method is accompanied with a matching `hashCode()` method. For full details, see the contract outlined in the [Object.hashCode() -documentation](http://docs.oracle.com/javase/7/docs/api/java/lang/Object.html#hashCode()). +documentation](https://docs.oracle.com/javase/8/docs/api/java/lang/Object.html#hashCode--). @@ -896,7 +896,7 @@ We could also use `counts.sortByKey()`, for example, to sort the pairs alphabeti **Note:** when using custom objects as the key in key-value pair operations, you must be sure that a custom `equals()` method is accompanied with a matching `hashCode()` method. For full details, see the contract outlined in the [Object.hashCode() -documentation](http://docs.oracle.com/javase/7/docs/api/java/lang/Object.html#hashCode()). +documentation](https://docs.oracle.com/javase/8/docs/api/java/lang/Object.html#hashCode--). @@ -1465,6 +1465,10 @@ jsc.sc().register(myVectorAcc, "MyVectorAcc1"); Note that, when programmers define their own type of AccumulatorV2, the resulting type can be different than that of the elements added. +*Warning*: When a Spark task finishes, Spark will try to merge the accumulated updates in this task to an accumulator. +If it fails, Spark will ignore the failure and still mark the task successful and continue to run other tasks. Hence, +a buggy accumulator will not impact a Spark job, but it may not get updated correctly although a Spark job is successful. +
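For context on the accumulator warning added above, a minimal `AccumulatorV2` looks like the sketch below. It is an illustration, not part of this diff; a bug or exception in `add`/`merge` here is exactly the situation the new warning describes, since Spark would still mark the task successful.

```scala
import org.apache.spark.util.AccumulatorV2

// Minimal long-sum accumulator; a failure while merging its updates at task
// completion would be ignored by Spark, as the warning above explains.
class SumAccumulator extends AccumulatorV2[Long, Long] {
  private var sum = 0L
  override def isZero: Boolean = sum == 0L
  override def copy(): SumAccumulator = { val a = new SumAccumulator; a.sum = sum; a }
  override def reset(): Unit = sum = 0L
  override def add(v: Long): Unit = sum += v
  override def merge(other: AccumulatorV2[Long, Long]): Unit = sum += other.value
  override def value: Long = sum
}

// Usage: val acc = new SumAccumulator; sc.register(acc, "mySum"); rdd.foreach(x => acc.add(x))
```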
    diff --git a/docs/running-on-kubernetes.md b/docs/running-on-kubernetes.md index c83dad6df1e7..b4088d79addf 100644 --- a/docs/running-on-kubernetes.md +++ b/docs/running-on-kubernetes.md @@ -215,6 +215,19 @@ spark.kubernetes.driver.volumes.persistentVolumeClaim.checkpointpvc.options.clai The configuration properties for mounting volumes into the executor pods use prefix `spark.kubernetes.executor.` instead of `spark.kubernetes.driver.`. For a complete list of available options for each supported type of volumes, please refer to the [Spark Properties](#spark-properties) section below. +## Local Storage + +Spark uses temporary scratch space to spill data to disk during shuffles and other operations. When using Kubernetes as the resource manager the pods will be created with an [emptyDir](https://kubernetes.io/docs/concepts/storage/volumes/#emptydir) volume mounted for each directory listed in `SPARK_LOCAL_DIRS`. If no directories are explicitly specified then a default directory is created and configured appropriately. + +`emptyDir` volumes use the ephemeral storage feature of Kubernetes and do not persist beyond the life of the pod. + +### Using RAM for local storage + +`emptyDir` volumes use the nodes backing storage for ephemeral storage by default, this behaviour may not be appropriate for some compute environments. For example if you have diskless nodes with remote storage mounted over a network, having lots of executors doing IO to this remote storage may actually degrade performance. + +In this case it may be desirable to set `spark.kubernetes.local.dirs.tmpfs=true` in your configuration which will cause the `emptyDir` volumes to be configured as `tmpfs` i.e. RAM backed volumes. When configured like this Sparks local storage usage will count towards your pods memory usage therefore you may wish to increase your memory requests by increasing the value of `spark.kubernetes.memoryOverheadFactor` as appropriate. + + ## Introspection and Debugging These are the different ways in which you can investigate a running/completed Spark application, monitor progress, and @@ -667,23 +680,24 @@ specific to Spark on Kubernetes. spark.kubernetes.driver.limit.cores (none) - Specify a hard cpu [limit](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#resource-requests-and-limits-of-pod-and-container) for the driver pod. + Specify a hard cpu limit for the driver pod. spark.kubernetes.executor.request.cores (none) - Specify the cpu request for each executor pod. Values conform to the Kubernetes [convention](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-cpu). - Example values include 0.1, 500m, 1.5, 5, etc., with the definition of cpu units documented in [CPU units](https://kubernetes.io/docs/tasks/configure-pod-container/assign-cpu-resource/#cpu-units). + Specify the cpu request for each executor pod. Values conform to the Kubernetes convention. + Example values include 0.1, 500m, 1.5, 5, etc., with the definition of cpu units documented in CPU units. This is distinct from spark.executor.cores: it is only used and takes precedence over spark.executor.cores for specifying the executor pod cpu request if set. Task parallelism, e.g., number of tasks an executor can run concurrently is not affected by this. 
+ spark.kubernetes.executor.limit.cores (none) - Specify a hard cpu [limit](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#resource-requests-and-limits-of-pod-and-container) for each executor pod launched for the Spark Application. + Specify a hard cpu limit for each executor pod launched for the Spark Application. @@ -784,6 +798,14 @@ specific to Spark on Kubernetes. spark.kubernetes.executor.volumes.persistentVolumeClaim.checkpointpvc.options.claimName=spark-pvc-claim. + + spark.kubernetes.local.dirs.tmpfs + false + + Configure the emptyDir volumes used to back SPARK_LOCAL_DIRS within the Spark driver and executor pods to use tmpfs backing i.e. RAM. See Local Storage earlier on this page + for more discussion of this. + + spark.kubernetes.memoryOverheadFactor 0.1 diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md index e3d67c34d53e..687f9e46c328 100644 --- a/docs/running-on-yarn.md +++ b/docs/running-on-yarn.md @@ -465,7 +465,7 @@ providers can be disabled individually by setting `spark.security.credentials.{s - + - + - + - + diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 374909456927..a1d7b1108bf7 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -965,6 +965,8 @@ Configuration of Parquet can be done using the `setConf` method on `SparkSession `parquet.compression` is specified in the table-specific options/properties, the precedence would be `compression`, `parquet.compression`, `spark.sql.parquet.compression.codec`. Acceptable values include: none, uncompressed, snappy, gzip, lzo, brotli, lz4, zstd. + Note that `zstd` requires `ZStandardCodec` to be installed before Hadoop 2.9.0, `brotli` requires + `BrotliCodec` to be installed. @@ -1002,6 +1004,17 @@ Configuration of Parquet can be done using the `setConf` method on `SparkSession

    + + + + +
     Property Name | Default | Meaning
-    spark.yarn.keytab
+    spark.kerberos.keytab
     (none) | The full path to the file that contains the keytab for the principal specified above. This keytab
@@ -477,7 +477,7 @@ providers can be disabled individually by setting `spark.security.credentials.{s
-    spark.yarn.principal
+    spark.kerberos.principal
     (none) | Principal to be used to login to KDC, while running on secure clusters. Equivalent to the
diff --git a/docs/sparkr.md b/docs/sparkr.md
index b4248e8bb21d..55e8f15da17c 100644
--- a/docs/sparkr.md
+++ b/docs/sparkr.md
@@ -70,12 +70,12 @@ The following Spark driver properties can be set in `sparkConfig` with `sparkR.s
     --master
-    spark.yarn.keytab
+    spark.kerberos.keytab
     Application Properties | --keytab
-    spark.yarn.principal
+    spark.kerberos.principal
     Application Properties | --principal
+    spark.sql.parquet.writeLegacyFormat | false | If true, data will be written in a way of Spark 1.4 and earlier. For example, decimal values will be written in Apache Parquet's fixed-length byte array format, which other systems such as Apache Hive and Apache Impala use. If false, the newer format in Parquet will be used. For example, decimals will be written in int-based format. If Parquet output is intended for use with systems that do not support this newer format, set to true.
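As an illustration of the `spark.sql.parquet.writeLegacyFormat` entry above (the data and output path are made up for the example; the option is set via the session conf as described earlier in this section):

```scala
// Write decimals in the legacy (Spark 1.4-era) Parquet layout so that older
// readers such as Hive or Impala can consume the files.
spark.conf.set("spark.sql.parquet.writeLegacyFormat", "true")
spark.range(10)
  .selectExpr("CAST(id AS DECIMAL(10, 2)) AS amount")
  .write.mode("overwrite")
  .parquet("/tmp/legacy_parquet")   // illustrative path
```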
    ## ORC Files @@ -1489,7 +1502,7 @@ See the [Apache Avro Data Source Guide](avro-data-source-guide.html). * The JDBC driver class must be visible to the primordial class loader on the client session and on all executors. This is because Java's DriverManager class does a security check that results in it ignoring all drivers not visible to the primordial class loader when one goes to open a connection. One convenient way to do this is to modify compute_classpath.sh on all worker nodes to include your driver JARs. * Some databases, such as H2, convert all names to upper case. You'll need to use upper case to refer to those names in Spark SQL. - + * Users can specify vendor-specific JDBC connection properties in the data source options to do special treatment. For example, `spark.read.format("jdbc").option("url", oracleJdbcUrl).option("oracle.jdbc.mapDateToTimestamp", "false")`. `oracle.jdbc.mapDateToTimestamp` defaults to true, users often need to disable this flag to avoid Oracle date being resolved as timestamp. # Performance Tuning @@ -1879,6 +1892,68 @@ working with timestamps in `pandas_udf`s to get the best performance, see ## Upgrading From Spark SQL 2.3 to 2.4 + - In Spark version 2.3 and earlier, the second parameter to array_contains function is implicitly promoted to the element type of first array type parameter. This type promotion can be lossy and may cause `array_contains` function to return wrong result. This problem has been addressed in 2.4 by employing a safer type promotion mechanism. This can cause some change in behavior and are illustrated in the table below. + + + + + + + + + + + + + + + + + + + + + + + + + +
+<table class="table">
+  <tr>
+    <th>Query</th>
+    <th>Result Spark 2.3 or Prior</th>
+    <th>Result Spark 2.4</th>
+    <th>Remarks</th>
+  </tr>
+  <tr>
+    <td>SELECT array_contains(array(1), 1.34D);</td>
+    <td>true</td>
+    <td>false</td>
+    <td>In Spark 2.4, left and right parameters are promoted to array(double) and double type respectively.</td>
+  </tr>
+  <tr>
+    <td>SELECT array_contains(array(1), '1');</td>
+    <td>true</td>
+    <td>AnalysisException is thrown since integer type can not be promoted to string type in a loss-less manner.</td>
+    <td>Users can use explicit cast</td>
+  </tr>
+  <tr>
+    <td>SELECT array_contains(array(1), 'anystring');</td>
+    <td>null</td>
+    <td>AnalysisException is thrown since integer type can not be promoted to string type in a loss-less manner.</td>
+    <td>Users can use explicit cast</td>
+  </tr>
+</table>
    + + - Since Spark 2.4, when there is a struct field in front of the IN operator before a subquery, the inner query must contain a struct field as well. In previous versions, instead, the fields of the struct were compared to the output of the inner query. Eg. if `a` is a `struct(a string, b int)`, in Spark 2.4 `a in (select (1 as a, 'a' as b) from range(1))` is a valid query, while `a in (select 1, 'a' from range(1))` is not. In previous version it was the opposite. + - In versions 2.2.1+ and 2.3, if `spark.sql.caseSensitive` is set to true, then the `CURRENT_DATE` and `CURRENT_TIMESTAMP` functions incorrectly became case-sensitive and would resolve to columns (unless typed in lower case). In Spark 2.4 this has been fixed and the functions are no longer case-sensitive. - Since Spark 2.4, Spark will evaluate the set operations referenced in a query by following a precedence rule as per the SQL standard. If the order is not specified by parentheses, set operations are performed from left to right with the exception that all INTERSECT operations are performed before any UNION, EXCEPT or MINUS operations. The old behaviour of giving equal precedence to all the set operations are preserved under a newly added configuration `spark.sql.legacy.setopsPrecedence.enabled` with a default value of `false`. When this property is set to `true`, spark will evaluate the set operators from left to right as they appear in the query given no explicit ordering is enforced by usage of parenthesis. - Since Spark 2.4, Spark will display table description column Last Access value as UNKNOWN when the value was Jan 01 1970. - Since Spark 2.4, Spark maximizes the usage of a vectorized ORC reader for ORC files by default. To do that, `spark.sql.orc.impl` and `spark.sql.orc.filterPushdown` change their default values to `native` and `true` respectively. @@ -1886,17 +1961,18 @@ working with timestamps in `pandas_udf`s to get the best performance, see - Since Spark 2.4, writing an empty dataframe to a directory launches at least one write task, even if physically the dataframe has no partition. This introduces a small behavior change that for self-describing file formats like Parquet and Orc, Spark creates a metadata-only file in the target directory when writing a 0-partition dataframe, so that schema inference can still work if users read that directory later. The new behavior is more reasonable and more consistent regarding writing empty dataframe. - Since Spark 2.4, expression IDs in UDF arguments do not appear in column names. For example, an column name in Spark 2.4 is not `UDF:f(col0 AS colA#28)` but ``UDF:f(col0 AS `colA`)``. - Since Spark 2.4, writing a dataframe with an empty or nested empty schema using any file formats (parquet, orc, json, text, csv etc.) is not allowed. An exception is thrown when attempting to write dataframes with empty schema. - - Since Spark 2.4, Spark compares a DATE type with a TIMESTAMP type after promotes both sides to TIMESTAMP. To set `false` to `spark.sql.hive.compareDateTimestampInTimestamp` restores the previous behavior. This option will be removed in Spark 3.0. - - Since Spark 2.4, creating a managed table with nonempty location is not allowed. An exception is thrown when attempting to create a managed table with nonempty location. To set `true` to `spark.sql.allowCreatingManagedTableUsingNonemptyLocation` restores the previous behavior. This option will be removed in Spark 3.0. 
+  - Since Spark 2.4, Spark compares a DATE type with a TIMESTAMP type after promoting both sides to TIMESTAMP. Setting `spark.sql.legacy.compareDateTimestampInTimestamp` to `false` restores the previous behavior. This option will be removed in Spark 3.0.
+  - Since Spark 2.4, creating a managed table with a nonempty location is not allowed. An exception is thrown when attempting to create a managed table with a nonempty location. Setting `spark.sql.legacy.allowCreatingManagedTableUsingNonemptyLocation` to `true` restores the previous behavior. This option will be removed in Spark 3.0.
   - Since Spark 2.4, renaming a managed table to an existing location is not allowed. An exception is thrown when attempting to rename a managed table to an existing location.
   - Since Spark 2.4, the type coercion rules can automatically promote the argument types of the variadic SQL functions (e.g., IN/COALESCE) to the widest common type, regardless of the order of the input arguments. In prior Spark versions, the promotion could fail in some specific orders (e.g., TimestampType, IntegerType and StringType) and throw an exception.
   - Since Spark 2.4, Spark has enabled non-cascading SQL cache invalidation in addition to the traditional cache invalidation mechanism. The non-cascading cache invalidation mechanism allows users to remove a cache without impacting its dependent caches. This new cache invalidation mechanism is used in scenarios where the data of the cache to be removed is still valid, e.g., calling unpersist() on a Dataset, or dropping a temporary view. This allows users to free up memory and keep the desired caches valid at the same time.
   - In version 2.3 and earlier, `to_utc_timestamp` and `from_utc_timestamp` respect the timezone in the input timestamp string, which breaks the assumption that the input timestamp is in a specific timezone. Therefore, these two functions can return unexpected results. In version 2.4 and later, this problem has been fixed. `to_utc_timestamp` and `from_utc_timestamp` will return null if the input timestamp string contains a timezone. As an example, `from_utc_timestamp('2000-10-10 00:00:00', 'GMT+1')` will return `2000-10-10 01:00:00` in both Spark 2.3 and 2.4. However, `from_utc_timestamp('2000-10-10 00:00:00+00:00', 'GMT+1')`, assuming a local timezone of GMT+8, will return `2000-10-10 09:00:00` in Spark 2.3 but `null` in 2.4. For people who don't care about this problem and want to retain the previous behavior to keep their query unchanged, you can set `spark.sql.function.rejectTimezoneInString` to false. This option will be removed in Spark 3.0 and should only be used as a temporary workaround.
   - In version 2.3 and earlier, Spark converts Parquet Hive tables by default but ignores table properties like `TBLPROPERTIES (parquet.compression 'NONE')`. This happens for ORC Hive table properties like `TBLPROPERTIES (orc.compress 'NONE')` in case of `spark.sql.hive.convertMetastoreOrc=true`, too. Since Spark 2.4, Spark respects Parquet/ORC specific table properties while converting Parquet/ORC Hive tables. As an example, `CREATE TABLE t(id int) STORED AS PARQUET TBLPROPERTIES (parquet.compression 'NONE')` would generate Snappy parquet files during insertion in Spark 2.3, and in Spark 2.4, the result would be uncompressed parquet files.
   - Since Spark 2.0, Spark converts Parquet Hive tables by default for better performance. Since Spark 2.4, Spark converts ORC Hive tables by default, too. This means Spark uses its own ORC support by default instead of Hive SerDe.
As an example, `CREATE TABLE t(id int) STORED AS ORC` would be handled with Hive SerDe in Spark 2.3, and in Spark 2.4, it would be converted into Spark's ORC data source table and ORC vectorization would be applied. To set `false` to `spark.sql.hive.convertMetastoreOrc` restores the previous behavior. - In version 2.3 and earlier, CSV rows are considered as malformed if at least one column value in the row is malformed. CSV parser dropped such rows in the DROPMALFORMED mode or outputs an error in the FAILFAST mode. Since Spark 2.4, CSV row is considered as malformed only when it contains malformed column values requested from CSV datasource, other values can be ignored. As an example, CSV file contains the "id,name" header and one row "1234". In Spark 2.4, selection of the id column consists of a row with one column value 1234 but in Spark 2.3 and earlier it is empty in the DROPMALFORMED mode. To restore the previous behavior, set `spark.sql.csv.parser.columnPruning.enabled` to `false`. - - Since Spark 2.4, File listing for compute statistics is done in parallel by default. This can be disabled by setting `spark.sql.parallelFileListingInStatsComputation.enabled` to `False`. + - Since Spark 2.4, file listing for computing statistics is done in parallel by default. This can be disabled by setting `spark.sql.statistics.parallelFileListingInStatsComputation.enabled` to `false`. - Since Spark 2.4, Metadata files (e.g. Parquet summary files) and temporary files are not counted as data files when calculating table size during Statistics computation. + - Since Spark 2.4, empty strings are saved as quoted empty strings `""`. In version 2.3 and earlier, empty strings are equal to `null` values and are not written as any characters in saved CSV files. For example, the row of `"a", null, "", 1` was written as `a,,,1`. Since Spark 2.4, the same row is saved as `a,,"",1`. To restore the previous behavior, set the CSV option `emptyValue` to an empty (not quoted) string. + - Since Spark 2.4, the `LOAD DATA` command supports the wildcards `?` and `*`, which match any single character and zero or more characters, respectively. Example: `LOAD DATA INPATH '/tmp/folder*/'` or `LOAD DATA INPATH '/tmp/part-?'`. Special characters such as spaces now also work in paths. Example: `LOAD DATA INPATH '/tmp/folder name/'`. ## Upgrading From Spark SQL 2.3.0 to 2.3.1 and above @@ -1908,7 +1984,6 @@ working with timestamps in `pandas_udf`s to get the best performance, see - The `percentile_approx` function previously accepted numeric type input and output double type results. Now it supports date type, timestamp type and numeric types as input types. The result type is also changed to be the same as the input type, which is more reasonable for percentiles. - Since Spark 2.3, the Join/Filter's deterministic predicates that are after the first non-deterministic predicates are also pushed down/through the child operators, if possible. In prior Spark versions, these filters are not eligible for predicate pushdown. - Partition column inference previously found incorrect common type for different inferred types, for example, previously it ended up with double type as the common type for double type and date type. Now it finds the correct common type for such conflicts. The conflict resolution follows the table below: - - + + + + + + + + + @@ -1989,22 +2026,214 @@ head(sql("select * from aggregates")) -##### Using Foreach -The `foreach` operation allows arbitrary operations to be computed on the output data.
As of Spark 2.1, this is available only for Scala and Java. To use this, you will have to implement the interface `ForeachWriter` -([Scala](api/scala/index.html#org.apache.spark.sql.ForeachWriter)/[Java](api/java/org/apache/spark/sql/ForeachWriter.html) docs), -which has methods that get called whenever there is a sequence of rows generated as output after a trigger. Note the following important points. +##### Using Foreach and ForeachBatch +The `foreach` and `foreachBatch` operations allow you to apply arbitrary operations and writing +logic on the output of a streaming query. They have slightly different use cases - while `foreach` +allows custom write logic on every row, `foreachBatch` allows arbitrary operations +and custom logic on the output of each micro-batch. Let's understand their usages in more detail. + +###### ForeachBatch +`foreachBatch(...)` allows you to specify a function that is executed on +the output data of every micro-batch of a streaming query. Since Spark 2.4, this is supported in Scala, Java and Python. +It takes two parameters: a DataFrame or Dataset that has the output data of a micro-batch and the unique ID of the micro-batch. + +
    +
    + +{% highlight scala %} +streamingDF.writeStream.foreachBatch { (batchDF: DataFrame, batchId: Long) => + // Transform and write batchDF +}.start() +{% endhighlight %} + +
    +
+ +{% highlight java %} +streamingDatasetOfString.writeStream().foreachBatch( + new VoidFunction2<Dataset<String>, Long>() { + public void call(Dataset<String> dataset, Long batchId) { + // Transform and write batchDF + } + } +).start(); +{% endhighlight %} + +
    +
    + +{% highlight python %} +def foreach_batch_function(df, epoch_id): + # Transform and write batchDF + pass + +streamingDF.writeStream.foreachBatch(foreach_batch_function).start() +{% endhighlight %} + +
    +
    +R is not yet supported. +
    +
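As a more end-to-end sketch of the pattern shown in the tabs above (the `rate` source, the JDBC URL, table name, and checkpoint path below are illustrative placeholders, not options defined by this guide), each micro-batch can be handed to Spark's existing batch JDBC writer; the list that follows summarizes when this pattern is useful.

{% highlight scala %}
import org.apache.spark.sql.{DataFrame, SparkSession}

val spark = SparkSession.builder().appName("ForeachBatchSketch").getOrCreate()

// Placeholder streaming source; substitute Kafka, files, etc. in a real job.
val streamingDF: DataFrame = spark.readStream.format("rate").load()

val query = streamingDF.writeStream
  .option("checkpointLocation", "/tmp/checkpoints/foreach-batch") // placeholder path
  .foreachBatch { (batchDF: DataFrame, batchId: Long) =>
    // Reuse the existing *batch* JDBC writer on the output of this micro-batch.
    // batchId can additionally be recorded by the sink to deduplicate retried batches.
    batchDF.write
      .format("jdbc")
      .option("url", "jdbc:postgresql://dbhost:5432/mydb") // placeholder
      .option("dbtable", "rate_output")                    // placeholder
      .mode("append")
      .save()
  }
  .start()

query.awaitTermination()
{% endhighlight %}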
+ +With `foreachBatch`, you can do the following. + +- **Reuse existing batch data sources** - For many storage systems, there may not be a streaming sink available yet, + but there may already exist a data writer for batch queries. Using `foreachBatch`, you can use the batch + data writers on the output of each micro-batch. +- **Write to multiple locations** - If you want to write the output of a streaming query to multiple locations, + then you can simply write the output DataFrame/Dataset multiple times. However, each attempt to write can + cause the output data to be recomputed (including possible re-reading of the input data). To avoid recomputations, + you should cache the output DataFrame/Dataset, write it to multiple locations, and then uncache it. Here is an outline. + + streamingDF.writeStream.foreachBatch { (batchDF: DataFrame, batchId: Long) => + batchDF.persist() + batchDF.write.format(...).save(...) // location 1 + batchDF.write.format(...).save(...) // location 2 + batchDF.unpersist() + } + +- **Apply additional DataFrame operations** - Many DataFrame and Dataset operations are not supported + in streaming DataFrames because Spark does not support generating incremental plans in those cases. + Using `foreachBatch`, you can apply some of these operations on each micro-batch output. However, you will have to reason about the end-to-end semantics of doing that operation yourself. + +**Note:** +- By default, `foreachBatch` provides only at-least-once write guarantees. However, you can use the + batchId provided to the function as a way to deduplicate the output and get an exactly-once guarantee. +- `foreachBatch` does not work with the continuous processing mode as it fundamentally relies on the + micro-batch execution of a streaming query. If you write data in the continuous mode, use `foreach` instead. + + +###### Foreach +If `foreachBatch` is not an option (for example, a corresponding batch data writer does not exist, or you are using the +continuous processing mode), then you can express your custom writer logic using `foreach`. +Specifically, you can express the data writing logic by dividing it into three methods: `open`, `process`, and `close`. +Since Spark 2.4, `foreach` is available in Scala, Java and Python. + +
    +
    + +In Scala, you have to extend the class `ForeachWriter` ([docs](api/scala/index.html#org.apache.spark.sql.ForeachWriter)). + +{% highlight scala %} +streamingDatasetOfString.writeStream.foreach( + new ForeachWriter[String] { + + def open(partitionId: Long, version: Long): Boolean = { + // Open connection + } + + def process(record: String): Unit = { + // Write string to connection + } + + def close(errorOrNull: Throwable): Unit = { + // Close the connection + } + } +).start() +{% endhighlight %} + +
    +
+ +In Java, you have to extend the class `ForeachWriter` ([docs](api/java/org/apache/spark/sql/ForeachWriter.html)). +{% highlight java %} +streamingDatasetOfString.writeStream().foreach( + new ForeachWriter<String>() { + + @Override public boolean open(long partitionId, long version) { + // Open connection + } + + @Override public void process(String record) { + // Write string to connection + } + + @Override public void close(Throwable errorOrNull) { + // Close the connection + } + } +).start(); + +{% endhighlight %} + +
    +
+ +In Python, you can invoke foreach in two ways: in a function or in an object. +The function offers a simple way to express your processing logic but does not allow you to +deduplicate generated data when failures cause reprocessing of some input data. +For that situation you must specify the processing logic in an object. + +1. The function takes a row as input. + + {% highlight python %} + def process_row(row): + # Write row to storage + pass + + query = streamingDF.writeStream.foreach(process_row).start() + {% endhighlight %} + +2. The object has a process method and optional open and close methods: + + {% highlight python %} + class ForeachWriter: + def open(self, partition_id, epoch_id): + # Open connection. This method is optional in Python. + pass + + def process(self, row): + # Write row to connection. This method is NOT optional in Python. + pass + + def close(self, error): + # Close the connection. This method is optional in Python. + pass + + query = streamingDF.writeStream.foreach(ForeachWriter()).start() + {% endhighlight %} + +
    +
    +R is not yet supported. +
    +
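To make the three-method contract more concrete, here is a minimal Scala sketch in which `alreadyCommitted` and `commitAtomically` are hypothetical stand-ins for whatever transactional bookkeeping your sink provides, keyed by `(partitionId, epochId)`; the execution semantics that justify this pattern are described next.

{% highlight scala %}
import scala.collection.mutable.ArrayBuffer

import org.apache.spark.sql.ForeachWriter

class IdempotentStringWriter extends ForeachWriter[String] {
  private var partitionId: Long = _
  private var epochId: Long = _
  private var buffer: ArrayBuffer[String] = _

  override def open(partitionId: Long, epochId: Long): Boolean = {
    this.partitionId = partitionId
    this.epochId = epochId
    if (alreadyCommitted(partitionId, epochId)) {
      false // this (partition, epoch) was already written before a restart; skip it
    } else {
      buffer = ArrayBuffer.empty[String]
      true  // proceed: process(...) will be called for every row
    }
  }

  override def process(value: String): Unit = {
    buffer += value // stage the row; a real writer might instead stream it to an open connection
  }

  override def close(errorOrNull: Throwable): Unit = {
    if (errorOrNull == null && buffer != null) {
      // Commit the staged rows together with the (partitionId, epochId) marker.
      commitAtomically(partitionId, epochId, buffer)
    }
    // Clean up any connection or transaction opened in open() here.
  }

  // Hypothetical helpers: replace with your sink's own transactional metadata.
  private def alreadyCommitted(partitionId: Long, epochId: Long): Boolean = false
  private def commitAtomically(partitionId: Long, epochId: Long, rows: Seq[String]): Unit = ()
}

// Usage sketch: streamingDatasetOfString.writeStream.foreach(new IdempotentStringWriter).start()
{% endhighlight %}

Because each task receives its own deserialized copy of the writer, all connection or transaction setup is deferred until `open` is called, matching the lifecycle described below.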
    + + +**Execution semantics** +When the streaming query is started, Spark calls the function or the object’s methods in the following way: + +- A single copy of this object is responsible for all the data generated by a single task in a query. + In other words, one instance is responsible for processing one partition of the data generated in a distributed manner. + +- This object must be serializable, because each task will get a fresh serialized-deserialized copy + of the provided object. Hence, it is strongly recommended that any initialization for writing data + (for example. opening a connection or starting a transaction) is done after the open() method has + been called, which signifies that the task is ready to generate data. + +- The lifecycle of the methods are as follows: + + - For each partition with partition_id: -- The writer must be serializable, as it will be serialized and sent to the executors for execution. + - For each batch/epoch of streaming data with epoch_id: -- All the three methods, `open`, `process` and `close` will be called on the executors. + - Method open(partitionId, epochId) is called. -- The writer must do all the initialization (e.g. opening connections, starting a transaction, etc.) only when the `open` method is called. Be aware that, if there is any initialization in the class as soon as the object is created, then that initialization will happen in the driver (because that is where the instance is being created), which may not be what you intend. + - If open(...) returns true, for each row in the partition and batch/epoch, method process(row) is called. -- `version` and `partition` are two parameters in `open` that uniquely represent a set of rows that needs to be pushed out. `version` is a monotonically increasing id that increases with every trigger. `partition` is an id that represents a partition of the output, since the output is distributed and will be processed on multiple executors. + - Method close(error) is called with error (if any) seen while processing rows. -- `open` can use the `version` and `partition` to choose whether it needs to write the sequence of rows. Accordingly, it can return `true` (proceed with writing), or `false` (no need to write). If `false` is returned, then `process` will not be called on any row. For example, after a partial failure, some of the output partitions of the failed trigger may have already been committed to a database. Based on metadata stored in the database, the writer can identify partitions that have already been committed and accordingly return false to skip committing them again. +- The close() method (if it exists) is called if an open() method exists and returns successfully (irrespective of the return value), except if the JVM or Python process crashes in the middle. -- Whenever `open` is called, `close` will also be called (unless the JVM exits due to some error). This is true even if `open` returns false. If there is any error in processing and writing the data, `close` will be called with the error. It is your responsibility to clean up state (e.g. connections, transactions, etc.) that have been created in `open` such that there are no resource leaks. +- **Note:** The partitionId and epochId in the open() method can be used to deduplicate generated data + when failures cause reprocessing of some input data. This depends on the execution mode of the query. 
+ If the streaming query is being executed in the micro-batch mode, then every partition represented + by a unique tuple (partition_id, epoch_id) is guaranteed to have the same data. + Hence, (partition_id, epoch_id) can be used to deduplicate and/or transactionally commit + data and achieve exactly-once guarantees. However, if the streaming query is being executed + in the continuous mode, then this guarantee does not hold and therefore should not be used for deduplication. #### Triggers The trigger settings of a streaming query defines the timing of streaming data processing, whether @@ -2709,6 +2938,78 @@ write.stream(aggDF, "memory", outputMode = "complete", checkpointLocation = "pat + +## Recovery Semantics after Changes in a Streaming Query +There are limitations on what changes in a streaming query are allowed between restarts from the +same checkpoint location. Here are a few kinds of changes that are either not allowed, or +the effect of the change is not well-defined. For all of them: + +- The term *allowed* means you can do the specified change but whether the semantics of its effect + is well-defined depends on the query and the change. + +- The term *not allowed* means you should not do the specified change as the restarted query is likely + to fail with unpredictable errors. `sdf` represents a streaming DataFrame/Dataset + generated with sparkSession.readStream. + +**Types of changes** + +- *Changes in the number or type (i.e. different source) of input sources*: This is not allowed. + +- *Changes in the parameters of input sources*: Whether this is allowed and whether the semantics + of the change are well-defined depends on the source and the query. Here are a few examples. + + - Addition/deletion/modification of rate limits is allowed: `spark.readStream.format("kafka").option("subscribe", "topic")` to `spark.readStream.format("kafka").option("subscribe", "topic").option("maxOffsetsPerTrigger", ...)` + + - Changes to subscribed topics/files is generally not allowed as the results are unpredictable: `spark.readStream.format("kafka").option("subscribe", "topic")` to `spark.readStream.format("kafka").option("subscribe", "newTopic")` + +- *Changes in the type of output sink*: Changes between a few specific combinations of sinks + are allowed. This needs to be verified on a case-by-case basis. Here are a few examples. + + - File sink to Kafka sink is allowed. Kafka will see only the new data. + + - Kafka sink to file sink is not allowed. + + - Kafka sink changed to foreach, or vice versa is allowed. + +- *Changes in the parameters of output sink*: Whether this is allowed and whether the semantics of + the change are well-defined depends on the sink and the query. Here are a few examples. + + - Changes to output directory of a file sink is not allowed: `sdf.writeStream.format("parquet").option("path", "/somePath")` to `sdf.writeStream.format("parquet").option("path", "/anotherPath")` + + - Changes to output topic is allowed: `sdf.writeStream.format("kafka").option("topic", "someTopic")` to `sdf.writeStream.format("kafka").option("topic", "anotherTopic")` + + - Changes to the user-defined foreach sink (that is, the `ForeachWriter` code) is allowed, but the semantics of the change depends on the code. + +- *Changes in projection / filter / map-like operations**: Some cases are allowed. For example: + + - Addition / deletion of filters is allowed: `sdf.selectExpr("a")` to `sdf.where(...).selectExpr("a").filter(...)`. 
+ + - Changes in projections with same output schema is allowed: `sdf.selectExpr("stringColumn AS json").writeStream` to `sdf.selectExpr("anotherStringColumn AS json").writeStream` + + - Changes in projections with different output schema are conditionally allowed: `sdf.selectExpr("a").writeStream` to `sdf.selectExpr("b").writeStream` is allowed only if the output sink allows the schema change from `"a"` to `"b"`. + +- *Changes in stateful operations*: Some operations in streaming queries need to maintain + state data in order to continuously update the result. Structured Streaming automatically checkpoints + the state data to fault-tolerant storage (for example, HDFS, AWS S3, Azure Blob storage) and restores it after restart. + However, this assumes that the schema of the state data remains same across restarts. This means that + *any changes (that is, additions, deletions, or schema modifications) to the stateful operations of a streaming query are not allowed between restarts*. + Here is the list of stateful operations whose schema should not be changed between restarts in order to ensure state recovery: + + - *Streaming aggregation*: For example, `sdf.groupBy("a").agg(...)`. Any change in number or type of grouping keys or aggregates is not allowed. + + - *Streaming deduplication*: For example, `sdf.dropDuplicates("a")`. Any change in number or type of grouping keys or aggregates is not allowed. + + - *Stream-stream join*: For example, `sdf1.join(sdf2, ...)` (i.e. both inputs are generated with `sparkSession.readStream`). Changes + in the schema or equi-joining columns are not allowed. Changes in join type (outer or inner) not allowed. Other changes in the join condition are ill-defined. + + - *Arbitrary stateful operation*: For example, `sdf.groupByKey(...).mapGroupsWithState(...)` or `sdf.groupByKey(...).flatMapGroupsWithState(...)`. + Any change to the schema of the user-defined state and the type of timeout is not allowed. + Any change within the user-defined state-mapping function are allowed, but the semantic effect of the change depends on the user-defined logic. + If you really want to support state schema changes, then you can explicitly encode/decode your complex state data + structures into bytes using an encoding/decoding scheme that supports schema migration. For example, + if you save your state as Avro-encoded bytes, then you are free to change the Avro-state-schema between query + restarts as the binary state will always be restored successfully. + # Continuous Processing ## [Experimental] {:.no_toc} diff --git a/docs/tuning.md b/docs/tuning.md index f60971aa2e0a..cd0f9cd08136 100644 --- a/docs/tuning.md +++ b/docs/tuning.md @@ -26,12 +26,12 @@ Often, this will be the first thing you should tune to optimize a Spark applicat Spark aims to strike a balance between convenience (allowing you to work with any Java type in your operations) and performance. It provides two serialization libraries: -* [Java serialization](http://docs.oracle.com/javase/6/docs/api/java/io/Serializable.html): +* [Java serialization](https://docs.oracle.com/javase/8/docs/api/java/io/Serializable.html): By default, Spark serializes objects using Java's `ObjectOutputStream` framework, and can work with any class you create that implements - [`java.io.Serializable`](http://docs.oracle.com/javase/6/docs/api/java/io/Serializable.html). + [`java.io.Serializable`](https://docs.oracle.com/javase/8/docs/api/java/io/Serializable.html). 
You can also control the performance of your serialization more closely by extending - [`java.io.Externalizable`](http://docs.oracle.com/javase/6/docs/api/java/io/Externalizable.html). + [`java.io.Externalizable`](https://docs.oracle.com/javase/8/docs/api/java/io/Externalizable.html). Java serialization is flexible but often quite slow, and leads to large serialized formats for many classes. * [Kryo serialization](https://github.com/EsotericSoftware/kryo): Spark can also use @@ -230,7 +230,7 @@ temporary objects created during task execution. Some steps which may be useful * Monitor how the frequency and time taken by garbage collection changes with the new settings. Our experience suggests that the effect of GC tuning depends on your application and the amount of memory available. -There are [many more tuning options](http://www.oracle.com/technetwork/java/javase/gc-tuning-6-140523.html) described online, +There are [many more tuning options](https://docs.oracle.com/javase/8/docs/technotes/guides/vm/gctuning/index.html) described online, but at a high level, managing how frequently full GC takes place can help in reducing the overhead. GC tuning flags for executors can be specified by setting `spark.executor.extraJavaOptions` in diff --git a/examples/pom.xml b/examples/pom.xml index 868110b8e35e..756c475b4748 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../pom.xml diff --git a/external/avro/pom.xml b/external/avro/pom.xml index 8f118ba48201..9d8f319cc939 100644 --- a/external/avro/pom.xml +++ b/external/avro/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../../pom.xml diff --git a/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroFileFormat.scala b/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroFileFormat.scala index 6df23c93e4c5..e60fa88cbeba 100755 --- a/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroFileFormat.scala +++ b/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroFileFormat.scala @@ -32,14 +32,14 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.hadoop.mapreduce.Job -import org.apache.spark.TaskContext +import org.apache.spark.{SparkException, TaskContext} import org.apache.spark.internal.Logging import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.{FileFormat, OutputWriterFactory, PartitionedFile} import org.apache.spark.sql.sources.{DataSourceRegister, Filter} import org.apache.spark.sql.types.StructType -import org.apache.spark.util.SerializableConfiguration +import org.apache.spark.util.{SerializableConfiguration, Utils} private[avro] class AvroFileFormat extends FileFormat with DataSourceRegister with Logging with Serializable { @@ -59,36 +59,13 @@ private[avro] class AvroFileFormat extends FileFormat val conf = spark.sessionState.newHadoopConf() val parsedOptions = new AvroOptions(options, conf) - // Schema evolution is not supported yet. Here we only pick a single random sample file to - // figure out the schema of the whole dataset. - val sampleFile = - if (parsedOptions.ignoreExtension) { - files.headOption.getOrElse { - throw new FileNotFoundException("Files for schema inferring have been not found.") - } - } else { - files.find(_.getPath.getName.endsWith(".avro")).getOrElse { - throw new FileNotFoundException( - "No Avro files found. 
If files don't have .avro extension, set ignoreExtension to true") - } - } - // User can specify an optional avro json schema. val avroSchema = parsedOptions.schema .map(new Schema.Parser().parse) .getOrElse { - val in = new FsInput(sampleFile.getPath, conf) - try { - val reader = DataFileReader.openReader(in, new GenericDatumReader[GenericRecord]()) - try { - reader.getSchema - } finally { - reader.close() - } - } finally { - in.close() - } - } + inferAvroSchemaFromFiles(files, conf, parsedOptions.ignoreExtension, + spark.sessionState.conf.ignoreCorruptFiles) + } SchemaConverters.toSqlType(avroSchema).dataType match { case t: StructType => Some(t) @@ -100,6 +77,51 @@ private[avro] class AvroFileFormat extends FileFormat } } + private def inferAvroSchemaFromFiles( + files: Seq[FileStatus], + conf: Configuration, + ignoreExtension: Boolean, + ignoreCorruptFiles: Boolean): Schema = { + // Schema evolution is not supported yet. Here we only pick first random readable sample file to + // figure out the schema of the whole dataset. + val avroReader = files.iterator.map { f => + val path = f.getPath + if (!ignoreExtension && !path.getName.endsWith(".avro")) { + None + } else { + Utils.tryWithResource { + new FsInput(path, conf) + } { in => + try { + Some(DataFileReader.openReader(in, new GenericDatumReader[GenericRecord]())) + } catch { + case e: IOException => + if (ignoreCorruptFiles) { + logWarning(s"Skipped the footer in the corrupted file: $path", e) + None + } else { + throw new SparkException(s"Could not read file: $path", e) + } + } + } + } + }.collectFirst { + case Some(reader) => reader + } + + avroReader match { + case Some(reader) => + try { + reader.getSchema + } finally { + reader.close() + } + case None => + throw new FileNotFoundException( + "No Avro files found. 
If files don't have .avro extension, set ignoreExtension to true") + } + } + override def shortName(): String = "avro" override def isSplitable( diff --git a/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala b/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala index 9ad4388414ea..1e08f7b50b11 100644 --- a/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala +++ b/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala @@ -39,6 +39,7 @@ import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.{SharedSQLContext, SQLTestUtils} import org.apache.spark.sql.types._ +import org.apache.spark.util.Utils class AvroSuite extends QueryTest with SharedSQLContext with SQLTestUtils { import testImplicits._ @@ -342,6 +343,48 @@ class AvroSuite extends QueryTest with SharedSQLContext with SQLTestUtils { } } + private def createDummyCorruptFile(dir: File): Unit = { + Utils.tryWithResource { + FileUtils.forceMkdir(dir) + val corruptFile = new File(dir, "corrupt.avro") + new BufferedWriter(new FileWriter(corruptFile)) + } { writer => + writer.write("corrupt") + } + } + + test("Ignore corrupt Avro file if flag IGNORE_CORRUPT_FILES enabled") { + withSQLConf(SQLConf.IGNORE_CORRUPT_FILES.key -> "true") { + withTempPath { dir => + createDummyCorruptFile(dir) + val message = intercept[FileNotFoundException] { + spark.read.format("avro").load(dir.getAbsolutePath).schema + }.getMessage + assert(message.contains("No Avro files found.")) + + val srcFile = new File("src/test/resources/episodes.avro") + val destFile = new File(dir, "episodes.avro") + FileUtils.copyFile(srcFile, destFile) + + val result = spark.read.format("avro").load(srcFile.getAbsolutePath).collect() + checkAnswer(spark.read.format("avro").load(dir.getAbsolutePath), result) + } + } + } + + test("Throws IOException on reading corrupt Avro file if flag IGNORE_CORRUPT_FILES disabled") { + withSQLConf(SQLConf.IGNORE_CORRUPT_FILES.key -> "false") { + withTempPath { dir => + createDummyCorruptFile(dir) + val message = intercept[org.apache.spark.SparkException] { + spark.read.format("avro").load(dir.getAbsolutePath) + }.getMessage + + assert(message.contains("Could not read file")) + } + } + } + test("Date field type") { withTempPath { dir => val schema = StructType(Seq( diff --git a/external/avro/src/test/scala/org/apache/spark/sql/execution/benchmark/AvroWriteBenchmark.scala b/external/avro/src/test/scala/org/apache/spark/sql/execution/benchmark/AvroWriteBenchmark.scala new file mode 100644 index 000000000000..df13b4a1c2d3 --- /dev/null +++ b/external/avro/src/test/scala/org/apache/spark/sql/execution/benchmark/AvroWriteBenchmark.scala @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.benchmark + +/** + * Benchmark to measure Avro data sources write performance. + * Usage: + * 1. with spark-submit: bin/spark-submit --class + * 2. with sbt: build/sbt "avro/test:runMain " + */ +object AvroWriteBenchmark extends DataSourceWriteBenchmark { + def main(args: Array[String]): Unit = { + /* + Intel(R) Core(TM) i7-6920HQ CPU @ 2.90GHz + Avro writer benchmark: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Output Single Int Column 2481 / 2499 6.3 157.8 1.0X + Output Single Double Column 2705 / 2710 5.8 172.0 0.9X + Output Int and String Column 5539 / 5639 2.8 352.2 0.4X + Output Partitions 4613 / 5004 3.4 293.3 0.5X + Output Buckets 5554 / 5561 2.8 353.1 0.4X + */ + runBenchmark("Avro") + } +} diff --git a/external/docker-integration-tests/pom.xml b/external/docker-integration-tests/pom.xml index 431339d41219..f24254b69808 100644 --- a/external/docker-integration-tests/pom.xml +++ b/external/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../../pom.xml diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala index 09a2cd83aed6..70d294d0ca65 100644 --- a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala +++ b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala @@ -442,6 +442,12 @@ class OracleIntegrationSuite extends DockerJDBCIntegrationSuite with SharedSQLCo .option("lowerBound", "2018-07-06") .option("upperBound", "2018-07-20") .option("numPartitions", 3) + // oracle.jdbc.mapDateToTimestamp defaults to true. If this flag is not disabled, column d + // (Oracle DATE) will be resolved as Catalyst Timestamp, which will fail bound evaluation of + // the partition column. E.g. 2018-07-06 cannot be evaluated as Timestamp, and the error + // message says: Timestamp format must be yyyy-mm-dd hh:mm:ss[.fffffffff]. 
+ .option("oracle.jdbc.mapDateToTimestamp", "false") + .option("sessionInitStatement", "ALTER SESSION SET NLS_DATE_FORMAT = 'YYYY-MM-DD'") .load() df1.logicalPlan match { @@ -462,6 +468,9 @@ class OracleIntegrationSuite extends DockerJDBCIntegrationSuite with SharedSQLCo .option("lowerBound", "2018-07-04 03:30:00.0") .option("upperBound", "2018-07-27 14:11:05.0") .option("numPartitions", 2) + .option("oracle.jdbc.mapDateToTimestamp", "false") + .option("sessionInitStatement", + "ALTER SESSION SET NLS_TIMESTAMP_FORMAT = 'YYYY-MM-DD HH24:MI:SS.FF'") .load() df2.logicalPlan match { diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 7cd1ec4c9c09..002bd6fb7f29 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index f810aa80e878..168d9d3b2ae0 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 498e88f665eb..1410ef7f4702 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../../pom.xml diff --git a/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala b/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala index 4324cc6d0f80..9241b13c100f 100644 --- a/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala +++ b/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala @@ -50,13 +50,18 @@ class FlumePollingStreamSuite extends SparkFunSuite with BeforeAndAfterAll with val utils = new PollingFlumeTestUtils override def beforeAll(): Unit = { + super.beforeAll() _sc = new SparkContext(conf) } override def afterAll(): Unit = { - if (_sc != null) { - _sc.stop() - _sc = null + try { + if (_sc != null) { + _sc.stop() + _sc = null + } + } finally { + super.afterAll() } } diff --git a/external/kafka-0-10-assembly/pom.xml b/external/kafka-0-10-assembly/pom.xml index f80f8e3a0183..4f9c3163b240 100644 --- a/external/kafka-0-10-assembly/pom.xml +++ b/external/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10-sql/pom.xml b/external/kafka-0-10-sql/pom.xml index 8588e8be052e..efd0862fb58e 100644 --- a/external/kafka-0-10-sql/pom.xml +++ b/external/kafka-0-10-sql/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaDataConsumer.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaDataConsumer.scala index ceb9e318b283..7b1314bc8c3c 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaDataConsumer.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaDataConsumer.scala @@ -134,6 +134,8 @@ private[kafka010] case class InternalKafkaConsumer( /** Reset the internal pre-fetched data. 
*/ def reset(): Unit = { _records = ju.Collections.emptyListIterator() + _nextOffsetInFetchedData = UNKNOWN_OFFSET + _offsetAfterPoll = UNKNOWN_OFFSET } /** @@ -361,8 +363,9 @@ private[kafka010] case class InternalKafkaConsumer( if (offset < fetchedData.offsetAfterPoll) { // Offsets in [offset, fetchedData.offsetAfterPoll) are invisible. Return a record to ask // the next call to start from `fetchedData.offsetAfterPoll`. + val nextOffsetToFetch = fetchedData.offsetAfterPoll fetchedData.reset() - return fetchedRecord.withRecord(null, fetchedData.offsetAfterPoll) + return fetchedRecord.withRecord(null, nextOffsetToFetch) } else { // Fetch records from Kafka and update `fetchedData`. fetchData(offset, pollTimeoutMs) diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetRangeCalculator.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetRangeCalculator.scala index 6631ae84167c..fb209c724afb 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetRangeCalculator.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetRangeCalculator.scala @@ -29,7 +29,6 @@ import org.apache.spark.sql.sources.v2.DataSourceOptions private[kafka010] class KafkaOffsetRangeCalculator(val minPartitions: Option[Int]) { require(minPartitions.isEmpty || minPartitions.get > 0) - import KafkaOffsetRangeCalculator._ /** * Calculate the offset ranges that we are going to process this batch. If `minPartitions` * is not set or is set less than or equal the number of `topicPartitions` that we're going to diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaRelation.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaRelation.scala index 9d856c9494e1..e6f9d1259e43 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaRelation.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaRelation.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql.kafka010 -import java.{util => ju} import java.util.UUID import org.apache.kafka.common.TopicPartition diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaStreamingWriteSupport.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaStreamingWriteSupport.scala index dc19312f79a2..927c56d9ce82 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaStreamingWriteSupport.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaStreamingWriteSupport.scala @@ -54,8 +54,8 @@ class KafkaStreamingWriteSupport( } /** - * A [[DataWriterFactory]] for Kafka writing. Will be serialized and sent to executors to generate - * the per-task data writers. + * A [[StreamingDataWriterFactory]] for Kafka writing. Will be serialized and sent to executors to + * generate the per-task data writers. * @param topic The topic that should be written to. If None, topic will be inferred from * a `topic` field in the incoming data. * @param producerParams Parameters for Kafka producers in each task. 
diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala index 8e246dbbf5d7..5ee76990b54f 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala @@ -33,11 +33,13 @@ import org.apache.kafka.common.TopicPartition import org.scalatest.concurrent.PatienceConfiguration.Timeout import org.scalatest.time.SpanSugar._ -import org.apache.spark.sql.{ForeachWriter, SparkSession} +import org.apache.spark.sql.{Dataset, ForeachWriter, SparkSession} import org.apache.spark.sql.execution.datasources.v2.StreamingDataSourceV2Relation +import org.apache.spark.sql.execution.exchange.ReusedExchangeExec import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution import org.apache.spark.sql.functions.{count, window} +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.kafka010.KafkaSourceProvider._ import org.apache.spark.sql.sources.v2.DataSourceOptions import org.apache.spark.sql.streaming.{ProcessingTime, StreamTest} @@ -598,18 +600,37 @@ abstract class KafkaMicroBatchSourceSuiteBase extends KafkaSourceSuiteBase { val join = values.join(values, "key") - testStream(join)( - makeSureGetOffsetCalled, - AddKafkaData(Set(topic), 1, 2), - CheckAnswer((1, 1, 1), (2, 2, 2)), - AddKafkaData(Set(topic), 6, 3), - CheckAnswer((1, 1, 1), (2, 2, 2), (3, 3, 3), (1, 6, 1), (1, 1, 6), (1, 6, 6)), - AssertOnQuery { q => + def checkQuery(check: AssertOnQuery): Unit = { + testStream(join)( + makeSureGetOffsetCalled, + AddKafkaData(Set(topic), 1, 2), + CheckAnswer((1, 1, 1), (2, 2, 2)), + AddKafkaData(Set(topic), 6, 3), + CheckAnswer((1, 1, 1), (2, 2, 2), (3, 3, 3), (1, 6, 1), (1, 1, 6), (1, 6, 6)), + check + ) + } + + withSQLConf(SQLConf.EXCHANGE_REUSE_ENABLED.key -> "false") { + checkQuery(AssertOnQuery { q => + assert(q.availableOffsets.iterator.size == 1) + // The kafka source is scanned twice because of self-join + assert(q.recentProgress.map(_.numInputRows).sum == 8) + true + }) + } + + withSQLConf(SQLConf.EXCHANGE_REUSE_ENABLED.key -> "true") { + checkQuery(AssertOnQuery { q => assert(q.availableOffsets.iterator.size == 1) + assert(q.lastExecution.executedPlan.collect { + case r: ReusedExchangeExec => r + }.length == 1) + // The kafka source is scanned only once because of exchange reuse. 
assert(q.recentProgress.map(_.numInputRows).sum == 4) true - } - ) + }) + } } test("read Kafka transactional messages: read_committed") { @@ -853,6 +874,58 @@ abstract class KafkaMicroBatchSourceSuiteBase extends KafkaSourceSuiteBase { ) } } + + test("SPARK-25495: FetchedData.reset should reset all fields") { + val topic = newTopic() + val topicPartition = new TopicPartition(topic, 0) + testUtils.createTopic(topic, partitions = 1) + + val ds = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("kafka.metadata.max.age.ms", "1") + .option("kafka.isolation.level", "read_committed") + .option("subscribe", topic) + .option("startingOffsets", "earliest") + .load() + .select($"value".as[String]) + + testUtils.withTranscationalProducer { producer => + producer.beginTransaction() + (0 to 3).foreach { i => + producer.send(new ProducerRecord[String, String](topic, i.toString)).get() + } + producer.commitTransaction() + } + testUtils.waitUntilOffsetAppears(topicPartition, 5) + + val q = ds.writeStream.foreachBatch { (ds: Dataset[String], epochId: Long) => + if (epochId == 0) { + // Send more message before the tasks of the current batch start reading the current batch + // data, so that the executors will prefetch messages in the next batch and drop them. In + // this case, if we forget to reset `FetchedData._nextOffsetInFetchedData` or + // `FetchedData._offsetAfterPoll` (See SPARK-25495), the next batch will see incorrect + // values and return wrong results hence fail the test. + testUtils.withTranscationalProducer { producer => + producer.beginTransaction() + (4 to 7).foreach { i => + producer.send(new ProducerRecord[String, String](topic, i.toString)).get() + } + producer.commitTransaction() + } + testUtils.waitUntilOffsetAppears(topicPartition, 10) + checkDatasetUnorderly(ds, (0 to 3).map(_.toString): _*) + } else { + checkDatasetUnorderly(ds, (4 to 7).map(_.toString): _*) + } + }.start() + try { + q.processAllAvailable() + } finally { + q.stop() + } + } } diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaRelationSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaRelationSuite.scala index eb186970fc25..8cfca56433f5 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaRelationSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaRelationSuite.scala @@ -48,9 +48,12 @@ class KafkaRelationSuite extends QueryTest with SharedSQLContext with KafkaTest } override def afterAll(): Unit = { - if (testUtils != null) { - testUtils.teardown() - testUtils = null + try { + if (testUtils != null) { + testUtils.teardown() + testUtils = null + } + } finally { super.afterAll() } } diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSinkSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSinkSuite.scala index a2213e024bd9..81832fbdcd7e 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSinkSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSinkSuite.scala @@ -48,9 +48,12 @@ class KafkaSinkSuite extends StreamTest with SharedSQLContext with KafkaTest { } override def afterAll(): Unit = { - if (testUtils != null) { - testUtils.teardown() - testUtils = null + try { + if (testUtils != null) { + testUtils.teardown() + testUtils = null + } + } finally { super.afterAll() } } 
diff --git a/external/kafka-0-10/pom.xml b/external/kafka-0-10/pom.xml index a97fd35bfbb7..f59f07265a0f 100644 --- a/external/kafka-0-10/pom.xml +++ b/external/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/DirectKafkaStreamSuite.scala b/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/DirectKafkaStreamSuite.scala index 661b67a8ab68..1974bb1e12e1 100644 --- a/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/DirectKafkaStreamSuite.scala +++ b/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/DirectKafkaStreamSuite.scala @@ -59,14 +59,19 @@ class DirectKafkaStreamSuite private var kafkaTestUtils: KafkaTestUtils = _ override def beforeAll { + super.beforeAll() kafkaTestUtils = new KafkaTestUtils kafkaTestUtils.setup() } override def afterAll { - if (kafkaTestUtils != null) { - kafkaTestUtils.teardown() - kafkaTestUtils = null + try { + if (kafkaTestUtils != null) { + kafkaTestUtils.teardown() + kafkaTestUtils = null + } + } finally { + super.afterAll() } } diff --git a/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/KafkaRDDSuite.scala b/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/KafkaRDDSuite.scala index 3ac6509b0470..561bca5f5537 100644 --- a/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/KafkaRDDSuite.scala +++ b/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/KafkaRDDSuite.scala @@ -44,20 +44,27 @@ class KafkaRDDSuite extends SparkFunSuite with BeforeAndAfterAll { private var sc: SparkContext = _ override def beforeAll { + super.beforeAll() sc = new SparkContext(sparkConf) kafkaTestUtils = new KafkaTestUtils kafkaTestUtils.setup() } override def afterAll { - if (sc != null) { - sc.stop - sc = null - } - - if (kafkaTestUtils != null) { - kafkaTestUtils.teardown() - kafkaTestUtils = null + try { + try { + if (sc != null) { + sc.stop + sc = null + } + } finally { + if (kafkaTestUtils != null) { + kafkaTestUtils.teardown() + kafkaTestUtils = null + } + } + } finally { + super.afterAll() } } diff --git a/external/kafka-0-8-assembly/pom.xml b/external/kafka-0-8-assembly/pom.xml index 6be17a81f3fe..83edb11f296a 100644 --- a/external/kafka-0-8-assembly/pom.xml +++ b/external/kafka-0-8-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-8/pom.xml b/external/kafka-0-8/pom.xml index 6d1c4789f382..4545877a9d83 100644 --- a/external/kafka-0-8/pom.xml +++ b/external/kafka-0-8/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-8/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala b/external/kafka-0-8/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala index ecca38784e77..3fd37f4c8ac9 100644 --- a/external/kafka-0-8/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala +++ b/external/kafka-0-8/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala @@ -57,14 +57,19 @@ class DirectKafkaStreamSuite private var kafkaTestUtils: KafkaTestUtils = _ override def beforeAll { + super.beforeAll() kafkaTestUtils = new KafkaTestUtils kafkaTestUtils.setup() } override def afterAll { - if (kafkaTestUtils != null) { - 
kafkaTestUtils.teardown() - kafkaTestUtils = null + try { + if (kafkaTestUtils != null) { + kafkaTestUtils.teardown() + kafkaTestUtils = null + } + } finally { + super.afterAll() } } diff --git a/external/kafka-0-8/src/test/scala/org/apache/spark/streaming/kafka/KafkaClusterSuite.scala b/external/kafka-0-8/src/test/scala/org/apache/spark/streaming/kafka/KafkaClusterSuite.scala index d66830cbacde..73d528518d48 100644 --- a/external/kafka-0-8/src/test/scala/org/apache/spark/streaming/kafka/KafkaClusterSuite.scala +++ b/external/kafka-0-8/src/test/scala/org/apache/spark/streaming/kafka/KafkaClusterSuite.scala @@ -32,6 +32,7 @@ class KafkaClusterSuite extends SparkFunSuite with BeforeAndAfterAll { private var kafkaTestUtils: KafkaTestUtils = _ override def beforeAll() { + super.beforeAll() kafkaTestUtils = new KafkaTestUtils kafkaTestUtils.setup() @@ -41,9 +42,13 @@ class KafkaClusterSuite extends SparkFunSuite with BeforeAndAfterAll { } override def afterAll() { - if (kafkaTestUtils != null) { - kafkaTestUtils.teardown() - kafkaTestUtils = null + try { + if (kafkaTestUtils != null) { + kafkaTestUtils.teardown() + kafkaTestUtils = null + } + } finally { + super.afterAll() } } diff --git a/external/kafka-0-8/src/test/scala/org/apache/spark/streaming/kafka/KafkaRDDSuite.scala b/external/kafka-0-8/src/test/scala/org/apache/spark/streaming/kafka/KafkaRDDSuite.scala index 809699a73996..72f954149fef 100644 --- a/external/kafka-0-8/src/test/scala/org/apache/spark/streaming/kafka/KafkaRDDSuite.scala +++ b/external/kafka-0-8/src/test/scala/org/apache/spark/streaming/kafka/KafkaRDDSuite.scala @@ -35,20 +35,27 @@ class KafkaRDDSuite extends SparkFunSuite with BeforeAndAfterAll { private var sc: SparkContext = _ override def beforeAll { + super.beforeAll() sc = new SparkContext(sparkConf) kafkaTestUtils = new KafkaTestUtils kafkaTestUtils.setup() } override def afterAll { - if (sc != null) { - sc.stop - sc = null - } - - if (kafkaTestUtils != null) { - kafkaTestUtils.teardown() - kafkaTestUtils = null + try { + try { + if (sc != null) { + sc.stop + sc = null + } + } finally { + if (kafkaTestUtils != null) { + kafkaTestUtils.teardown() + kafkaTestUtils = null + } + } + } finally { + super.afterAll() } } diff --git a/external/kafka-0-8/src/test/scala/org/apache/spark/streaming/kafka/KafkaStreamSuite.scala b/external/kafka-0-8/src/test/scala/org/apache/spark/streaming/kafka/KafkaStreamSuite.scala index 426cd83b4ddf..ed130f599095 100644 --- a/external/kafka-0-8/src/test/scala/org/apache/spark/streaming/kafka/KafkaStreamSuite.scala +++ b/external/kafka-0-8/src/test/scala/org/apache/spark/streaming/kafka/KafkaStreamSuite.scala @@ -35,19 +35,26 @@ class KafkaStreamSuite extends SparkFunSuite with Eventually with BeforeAndAfter private var kafkaTestUtils: KafkaTestUtils = _ override def beforeAll(): Unit = { + super.beforeAll() kafkaTestUtils = new KafkaTestUtils kafkaTestUtils.setup() } override def afterAll(): Unit = { - if (ssc != null) { - ssc.stop() - ssc = null - } - - if (kafkaTestUtils != null) { - kafkaTestUtils.teardown() - kafkaTestUtils = null + try { + try { + if (ssc != null) { + ssc.stop() + ssc = null + } + } finally { + if (kafkaTestUtils != null) { + kafkaTestUtils.teardown() + kafkaTestUtils = null + } + } + } finally { + super.afterAll() } } diff --git a/external/kafka-0-8/src/test/scala/org/apache/spark/streaming/kafka/ReliableKafkaStreamSuite.scala b/external/kafka-0-8/src/test/scala/org/apache/spark/streaming/kafka/ReliableKafkaStreamSuite.scala index 57f89cc7dbc6..5da5ea49d77e 100644 --- 
a/external/kafka-0-8/src/test/scala/org/apache/spark/streaming/kafka/ReliableKafkaStreamSuite.scala +++ b/external/kafka-0-8/src/test/scala/org/apache/spark/streaming/kafka/ReliableKafkaStreamSuite.scala @@ -51,6 +51,7 @@ class ReliableKafkaStreamSuite extends SparkFunSuite private var tempDirectory: File = null override def beforeAll(): Unit = { + super.beforeAll() kafkaTestUtils = new KafkaTestUtils kafkaTestUtils.setup() @@ -65,11 +66,15 @@ class ReliableKafkaStreamSuite extends SparkFunSuite } override def afterAll(): Unit = { - Utils.deleteRecursively(tempDirectory) + try { + Utils.deleteRecursively(tempDirectory) - if (kafkaTestUtils != null) { - kafkaTestUtils.teardown() - kafkaTestUtils = null + if (kafkaTestUtils != null) { + kafkaTestUtils.teardown() + kafkaTestUtils = null + } + } finally { + super.afterAll() } } diff --git a/external/kinesis-asl-assembly/pom.xml b/external/kinesis-asl-assembly/pom.xml index 68fded515626..0bf4c265939e 100644 --- a/external/kinesis-asl-assembly/pom.xml +++ b/external/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../../pom.xml diff --git a/external/kinesis-asl/pom.xml b/external/kinesis-asl/pom.xml index 491589396559..032aca9077e2 100644 --- a/external/kinesis-asl/pom.xml +++ b/external/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../../pom.xml diff --git a/external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisInputDStreamBuilderSuite.scala b/external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisInputDStreamBuilderSuite.scala index e0e26847aa0e..361520e29226 100644 --- a/external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisInputDStreamBuilderSuite.scala +++ b/external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisInputDStreamBuilderSuite.scala @@ -40,7 +40,11 @@ class KinesisInputDStreamBuilderSuite extends TestSuiteBase with BeforeAndAfterE .checkpointAppName(checkpointAppName) override def afterAll(): Unit = { - ssc.stop() + try { + ssc.stop() + } finally { + super.afterAll() + } } test("should raise an exception if the StreamingContext is missing") { diff --git a/external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala b/external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala index a7a68eba910b..6d27445c5b60 100644 --- a/external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala +++ b/external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala @@ -71,17 +71,21 @@ abstract class KinesisStreamTests(aggregateTestData: Boolean) extends KinesisFun } override def afterAll(): Unit = { - if (ssc != null) { - ssc.stop() - } - if (sc != null) { - sc.stop() - } - if (testUtils != null) { - // Delete the Kinesis stream as well as the DynamoDB table generated by - // Kinesis Client Library when consuming the stream - testUtils.deleteStream() - testUtils.deleteDynamoDBTable(appName) + try { + if (ssc != null) { + ssc.stop() + } + if (sc != null) { + sc.stop() + } + if (testUtils != null) { + // Delete the Kinesis stream as well as the DynamoDB table generated by + // Kinesis Client Library when consuming the stream + testUtils.deleteStream() + testUtils.deleteDynamoDBTable(appName) + } + } finally { + super.afterAll() } } diff --git a/external/spark-ganglia-lgpl/pom.xml 
b/external/spark-ganglia-lgpl/pom.xml index 027157e53d51..35a55b70baf3 100644 --- a/external/spark-ganglia-lgpl/pom.xml +++ b/external/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../../pom.xml diff --git a/external/spark-ganglia-lgpl/src/main/scala/org/apache/spark/metrics/sink/GangliaSink.scala b/external/spark-ganglia-lgpl/src/main/scala/org/apache/spark/metrics/sink/GangliaSink.scala index 0cd795f63887..4fb9f2f84908 100644 --- a/external/spark-ganglia-lgpl/src/main/scala/org/apache/spark/metrics/sink/GangliaSink.scala +++ b/external/spark-ganglia-lgpl/src/main/scala/org/apache/spark/metrics/sink/GangliaSink.scala @@ -17,7 +17,7 @@ package org.apache.spark.metrics.sink -import java.util.Properties +import java.util.{Locale, Properties} import java.util.concurrent.TimeUnit import com.codahale.metrics.MetricRegistry @@ -64,11 +64,12 @@ class GangliaSink(val property: Properties, val registry: MetricRegistry, val ttl = propertyToOption(GANGLIA_KEY_TTL).map(_.toInt).getOrElse(GANGLIA_DEFAULT_TTL) val dmax = propertyToOption(GANGLIA_KEY_DMAX).map(_.toInt).getOrElse(GANGLIA_DEFAULT_DMAX) val mode: UDPAddressingMode = propertyToOption(GANGLIA_KEY_MODE) - .map(u => GMetric.UDPAddressingMode.valueOf(u.toUpperCase)).getOrElse(GANGLIA_DEFAULT_MODE) + .map(u => GMetric.UDPAddressingMode.valueOf(u.toUpperCase(Locale.ROOT))) + .getOrElse(GANGLIA_DEFAULT_MODE) val pollPeriod = propertyToOption(GANGLIA_KEY_PERIOD).map(_.toInt) .getOrElse(GANGLIA_DEFAULT_PERIOD) val pollUnit: TimeUnit = propertyToOption(GANGLIA_KEY_UNIT) - .map(u => TimeUnit.valueOf(u.toUpperCase)) + .map(u => TimeUnit.valueOf(u.toUpperCase(Locale.ROOT))) .getOrElse(GANGLIA_DEFAULT_UNIT) MetricsSystem.checkMinimalPollingPeriod(pollUnit, pollPeriod) diff --git a/graphx/pom.xml b/graphx/pom.xml index 0f5dc548600b..d65a8ceb62b9 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../pom.xml diff --git a/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala b/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala index 96b635f9a144..1305c059b89c 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala @@ -198,9 +198,11 @@ object PageRank extends Logging { val zero = Vectors.sparse(sources.size, List()).asBreeze // map of vid -> vector where for each vid, the _position of vid in source_ is set to 1.0 - val sourcesInitMap = sources.zipWithIndex.toMap.mapValues { i => - Vectors.sparse(sources.size, Array(i), Array(1.0)).asBreeze - } + val sourcesInitMap = sources.zipWithIndex.map { case (vid, i) => + val v = Vectors.sparse(sources.size, Array(i), Array(1.0)).asBreeze + (vid, v) + }.toMap + val sc = graph.vertices.sparkContext val sourcesInitMapBC = sc.broadcast(sourcesInitMap) // Initialize the PageRank graph with each edge attribute having diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index 2c39a7df0146..d48162007e67 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index 912eb6b6d2a0..b1b6126ea593 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../pom.xml diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index 
53286fe93478..ec5f9b0e92c8 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../pom.xml diff --git a/mllib/benchmarks/UDTSerializationBenchmark-results.txt b/mllib/benchmarks/UDTSerializationBenchmark-results.txt new file mode 100644 index 000000000000..169f4c60c748 --- /dev/null +++ b/mllib/benchmarks/UDTSerializationBenchmark-results.txt @@ -0,0 +1,13 @@ +================================================================================================ +VectorUDT de/serialization +================================================================================================ + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_131-b11 on Mac OS X 10.13.6 +Intel(R) Core(TM) i7-6820HQ CPU @ 2.70GHz + +VectorUDT de/serialization: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +serialize 144 / 206 0.0 143979.7 1.0X +deserialize 114 / 135 0.0 113802.6 1.3X + + diff --git a/mllib/pom.xml b/mllib/pom.xml index f07d7f24fd31..17ddb87c4d86 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../pom.xml diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala index 8a57bfc029d1..6648e78d8eaf 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala @@ -168,7 +168,7 @@ object DecisionTreeClassifier extends DefaultParamsReadable[DecisionTreeClassifi @Since("1.4.0") class DecisionTreeClassificationModel private[ml] ( @Since("1.4.0")override val uid: String, - @Since("1.4.0")override val rootNode: ClassificationNode, + @Since("1.4.0")override val rootNode: Node, @Since("1.6.0")override val numFeatures: Int, @Since("1.5.0")override val numClasses: Int) extends ProbabilisticClassificationModel[Vector, DecisionTreeClassificationModel] @@ -181,7 +181,7 @@ class DecisionTreeClassificationModel private[ml] ( * Construct a decision tree classification model. * @param rootNode Root node of tree, with other nodes attached. 
*/ - private[ml] def this(rootNode: ClassificationNode, numFeatures: Int, numClasses: Int) = + private[ml] def this(rootNode: Node, numFeatures: Int, numClasses: Int) = this(Identifiable.randomUID("dtc"), rootNode, numFeatures, numClasses) override def predict(features: Vector): Double = { @@ -279,9 +279,8 @@ object DecisionTreeClassificationModel extends MLReadable[DecisionTreeClassifica val metadata = DefaultParamsReader.loadMetadata(path, sc, className) val numFeatures = (metadata.metadata \ "numFeatures").extract[Int] val numClasses = (metadata.metadata \ "numClasses").extract[Int] - val root = loadTreeNodes(path, metadata, sparkSession, isClassification = true) - val model = new DecisionTreeClassificationModel(metadata.uid, - root.asInstanceOf[ClassificationNode], numFeatures, numClasses) + val root = loadTreeNodes(path, metadata, sparkSession) + val model = new DecisionTreeClassificationModel(metadata.uid, root, numFeatures, numClasses) metadata.getAndSetParams(model) model } @@ -296,10 +295,9 @@ object DecisionTreeClassificationModel extends MLReadable[DecisionTreeClassifica require(oldModel.algo == OldAlgo.Classification, s"Cannot convert non-classification DecisionTreeModel (old API) to" + s" DecisionTreeClassificationModel (new API). Algo is: ${oldModel.algo}") - val rootNode = Node.fromOld(oldModel.topNode, categoricalFeatures, isClassification = true) + val rootNode = Node.fromOld(oldModel.topNode, categoricalFeatures) val uid = if (parent != null) parent.uid else Identifiable.randomUID("dtc") // Can't infer number of features from old model, so default to -1 - new DecisionTreeClassificationModel(uid, - rootNode.asInstanceOf[ClassificationNode], numFeatures, -1) + new DecisionTreeClassificationModel(uid, rootNode, numFeatures, -1) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala index 33acd9914073..62cfa39746ff 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala @@ -412,14 +412,14 @@ object GBTClassificationModel extends MLReadable[GBTClassificationModel] { override def load(path: String): GBTClassificationModel = { implicit val format = DefaultFormats val (metadata: Metadata, treesData: Array[(Metadata, Node)], treeWeights: Array[Double]) = - EnsembleModelReadWrite.loadImpl(path, sparkSession, className, treeClassName, false) + EnsembleModelReadWrite.loadImpl(path, sparkSession, className, treeClassName) val numFeatures = (metadata.metadata \ numFeaturesKey).extract[Int] val numTrees = (metadata.metadata \ numTreesKey).extract[Int] val trees: Array[DecisionTreeRegressionModel] = treesData.map { case (treeMetadata, root) => - val tree = new DecisionTreeRegressionModel(treeMetadata.uid, - root.asInstanceOf[RegressionNode], numFeatures) + val tree = + new DecisionTreeRegressionModel(treeMetadata.uid, root, numFeatures) treeMetadata.getAndSetParams(tree) tree } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala index 94887ac346fe..57132381b647 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala @@ -313,15 +313,15 @@ object RandomForestClassificationModel extends 
MLReadable[RandomForestClassifica override def load(path: String): RandomForestClassificationModel = { implicit val format = DefaultFormats val (metadata: Metadata, treesData: Array[(Metadata, Node)], _) = - EnsembleModelReadWrite.loadImpl(path, sparkSession, className, treeClassName, true) + EnsembleModelReadWrite.loadImpl(path, sparkSession, className, treeClassName) val numFeatures = (metadata.metadata \ "numFeatures").extract[Int] val numClasses = (metadata.metadata \ "numClasses").extract[Int] val numTrees = (metadata.metadata \ "numTrees").extract[Int] val trees: Array[DecisionTreeClassificationModel] = treesData.map { case (treeMetadata, root) => - val tree = new DecisionTreeClassificationModel(treeMetadata.uid, - root.asInstanceOf[ClassificationNode], numFeatures, numClasses) + val tree = + new DecisionTreeClassificationModel(treeMetadata.uid, root, numFeatures, numClasses) treeMetadata.getAndSetParams(tree) tree } diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala index 8904193cae94..5cb16cc76588 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala @@ -104,10 +104,6 @@ class BisectingKMeansModel private[ml] ( @Since("2.1.0") def setPredictionCol(value: String): this.type = set(predictionCol, value) - /** @group expertSetParam */ - @Since("2.4.0") - def setDistanceMeasure(value: String): this.type = set(distanceMeasure, value) - @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala index 50867f776c52..84e73dc19a39 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala @@ -570,13 +570,11 @@ abstract class LDAModel private[ml] ( class LocalLDAModel private[ml] ( uid: String, vocabSize: Int, - private[clustering] val oldLocalModel_ : OldLocalLDAModel, + private[clustering] val oldLocalModel : OldLocalLDAModel, sparkSession: SparkSession) extends LDAModel(uid, vocabSize, sparkSession) { - override private[clustering] def oldLocalModel: OldLocalLDAModel = { - oldLocalModel_.setSeed(getSeed) - } + oldLocalModel.setSeed(getSeed) @Since("1.6.0") override def copy(extra: ParamMap): LocalLDAModel = { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala index dc38ee326e5e..dc18e1d34880 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala @@ -29,7 +29,7 @@ import org.apache.spark.mllib.feature.{HashingTF => OldHashingTF} import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ -import org.apache.spark.unsafe.hash.Murmur3_x86_32.{hashInt, hashLong, hashUnsafeBytes2Block} +import org.apache.spark.unsafe.hash.Murmur3_x86_32.{hashInt, hashLong, hashUnsafeBytes2} import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.Utils import org.apache.spark.util.collection.OpenHashMap @@ -244,7 +244,8 @@ object FeatureHasher extends DefaultParamsReadable[FeatureHasher] { case f: Float => 
hashInt(java.lang.Float.floatToIntBits(f), seed) case d: Double => hashLong(java.lang.Double.doubleToLongBits(d), seed) case s: String => - hashUnsafeBytes2Block(UTF8String.fromString(s).getMemoryBlock, seed) + val utf8 = UTF8String.fromString(s) + hashUnsafeBytes2(utf8.getBaseObject, utf8.getBaseOffset, utf8.numBytes(), seed) case _ => throw new SparkException("FeatureHasher with murmur3 algorithm does not " + s"support type ${term.getClass.getCanonicalName} of input data.") } diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala index 94640a5cbe31..6669d402cd99 100755 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala @@ -118,7 +118,9 @@ class StopWordsRemover @Since("1.5.0") (@Since("1.5.0") override val uid: String } } else { val lc = new Locale($(locale)) + // scalastyle:off caselocale val toLower = (s: String) => if (s != null) s.toLowerCase(lc) else s + // scalastyle:on caselocale val lowerStopWords = $(stopWords).map(toLower(_)).toSet udf { terms: Seq[String] => terms.filter(s => !lowerStopWords.contains(toLower(s))) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala index aede1f812a55..748c869af411 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala @@ -36,7 +36,9 @@ class Tokenizer @Since("1.4.0") (@Since("1.4.0") override val uid: String) def this() = this(Identifiable.randomUID("tok")) override protected def createTransformFunc: String => Seq[String] = { + // scalastyle:off caselocale _.toLowerCase.split("\\s") + // scalastyle:on caselocale } override protected def validateInputType(inputType: DataType): Unit = { @@ -140,7 +142,9 @@ class RegexTokenizer @Since("1.4.0") (@Since("1.4.0") override val uid: String) override protected def createTransformFunc: String => Seq[String] = { originStr => val re = $(pattern).r + // scalastyle:off caselocale val str = if ($(toLowercase)) originStr.toLowerCase() else originStr + // scalastyle:on caselocale val tokens = if ($(gaps)) re.split(str).toSeq else re.findAllIn(str).toSeq val minLength = $(minTokenLength) tokens.filter(_.length >= minLength) diff --git a/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala b/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala index dcc40b6668c7..0b13eefdf3f5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala @@ -198,6 +198,8 @@ object ImageSchema { * @return DataFrame with a single column "image" of images; * see ImageSchema for the details */ + @deprecated("use `spark.read.format(\"image\").load(path)` and this `readImages` will be " + + "removed in 3.0.0.", "2.4.0") def readImages(path: String): DataFrame = readImages(path, null, false, -1, false, 1.0, 0) /** @@ -218,6 +220,8 @@ object ImageSchema { * @return DataFrame with a single column "image" of images; * see ImageSchema for the details */ + @deprecated("use `spark.read.format(\"image\").load(path)` and this `readImages` will be " + + "removed in 3.0.0.", "2.4.0") def readImages( path: String, sparkSession: SparkSession, diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala 
b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala index 018290f81842..6fa656275c1f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala @@ -160,7 +160,7 @@ object DecisionTreeRegressor extends DefaultParamsReadable[DecisionTreeRegressor @Since("1.4.0") class DecisionTreeRegressionModel private[ml] ( override val uid: String, - override val rootNode: RegressionNode, + override val rootNode: Node, override val numFeatures: Int) extends PredictionModel[Vector, DecisionTreeRegressionModel] with DecisionTreeModel with DecisionTreeRegressorParams with MLWritable with Serializable { @@ -175,7 +175,7 @@ class DecisionTreeRegressionModel private[ml] ( * Construct a decision tree regression model. * @param rootNode Root node of tree, with other nodes attached. */ - private[ml] def this(rootNode: RegressionNode, numFeatures: Int) = + private[ml] def this(rootNode: Node, numFeatures: Int) = this(Identifiable.randomUID("dtr"), rootNode, numFeatures) override def predict(features: Vector): Double = { @@ -279,9 +279,8 @@ object DecisionTreeRegressionModel extends MLReadable[DecisionTreeRegressionMode implicit val format = DefaultFormats val metadata = DefaultParamsReader.loadMetadata(path, sc, className) val numFeatures = (metadata.metadata \ "numFeatures").extract[Int] - val root = loadTreeNodes(path, metadata, sparkSession, isClassification = false) - val model = new DecisionTreeRegressionModel(metadata.uid, - root.asInstanceOf[RegressionNode], numFeatures) + val root = loadTreeNodes(path, metadata, sparkSession) + val model = new DecisionTreeRegressionModel(metadata.uid, root, numFeatures) metadata.getAndSetParams(model) model } @@ -296,8 +295,8 @@ object DecisionTreeRegressionModel extends MLReadable[DecisionTreeRegressionMode require(oldModel.algo == OldAlgo.Regression, s"Cannot convert non-regression DecisionTreeModel (old API) to" + s" DecisionTreeRegressionModel (new API). 
Algo is: ${oldModel.algo}") - val rootNode = Node.fromOld(oldModel.topNode, categoricalFeatures, isClassification = false) + val rootNode = Node.fromOld(oldModel.topNode, categoricalFeatures) val uid = if (parent != null) parent.uid else Identifiable.randomUID("dtr") - new DecisionTreeRegressionModel(uid, rootNode.asInstanceOf[RegressionNode], numFeatures) + new DecisionTreeRegressionModel(uid, rootNode, numFeatures) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala index 3305881b0ccc..07f88d8d5f84 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala @@ -338,15 +338,15 @@ object GBTRegressionModel extends MLReadable[GBTRegressionModel] { override def load(path: String): GBTRegressionModel = { implicit val format = DefaultFormats val (metadata: Metadata, treesData: Array[(Metadata, Node)], treeWeights: Array[Double]) = - EnsembleModelReadWrite.loadImpl(path, sparkSession, className, treeClassName, false) + EnsembleModelReadWrite.loadImpl(path, sparkSession, className, treeClassName) val numFeatures = (metadata.metadata \ "numFeatures").extract[Int] val numTrees = (metadata.metadata \ "numTrees").extract[Int] val trees: Array[DecisionTreeRegressionModel] = treesData.map { case (treeMetadata, root) => - val tree = new DecisionTreeRegressionModel(treeMetadata.uid, - root.asInstanceOf[RegressionNode], numFeatures) + val tree = + new DecisionTreeRegressionModel(treeMetadata.uid, root, numFeatures) treeMetadata.getAndSetParams(tree) tree } diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala index 35875724b3cf..82bf66ff66d8 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala @@ -271,13 +271,13 @@ object RandomForestRegressionModel extends MLReadable[RandomForestRegressionMode override def load(path: String): RandomForestRegressionModel = { implicit val format = DefaultFormats val (metadata: Metadata, treesData: Array[(Metadata, Node)], treeWeights: Array[Double]) = - EnsembleModelReadWrite.loadImpl(path, sparkSession, className, treeClassName, false) + EnsembleModelReadWrite.loadImpl(path, sparkSession, className, treeClassName) val numFeatures = (metadata.metadata \ "numFeatures").extract[Int] val numTrees = (metadata.metadata \ "numTrees").extract[Int] val trees: Array[DecisionTreeRegressionModel] = treesData.map { case (treeMetadata, root) => - val tree = new DecisionTreeRegressionModel(treeMetadata.uid, - root.asInstanceOf[RegressionNode], numFeatures) + val tree = + new DecisionTreeRegressionModel(treeMetadata.uid, root, numFeatures) treeMetadata.getAndSetParams(tree) tree } diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala index 0242bc76698d..d30be452a436 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala @@ -17,16 +17,14 @@ package org.apache.spark.ml.tree -import org.apache.spark.annotation.Since import org.apache.spark.ml.linalg.Vector import org.apache.spark.mllib.tree.impurity.ImpurityCalculator -import org.apache.spark.mllib.tree.model.{ImpurityStats, 
InformationGainStats => OldInformationGainStats, - Node => OldNode, Predict => OldPredict} +import org.apache.spark.mllib.tree.model.{ImpurityStats, InformationGainStats => OldInformationGainStats, Node => OldNode, Predict => OldPredict} /** * Decision tree node interface. */ -sealed trait Node extends Serializable { +sealed abstract class Node extends Serializable { // TODO: Add aggregate stats (once available). This will happen after we move the DecisionTree // code into the new API and deprecate the old API. SPARK-3727 @@ -86,86 +84,35 @@ private[ml] object Node { /** * Create a new Node from the old Node format, recursively creating child nodes as needed. */ - def fromOld( - oldNode: OldNode, - categoricalFeatures: Map[Int, Int], - isClassification: Boolean): Node = { + def fromOld(oldNode: OldNode, categoricalFeatures: Map[Int, Int]): Node = { if (oldNode.isLeaf) { // TODO: Once the implementation has been moved to this API, then include sufficient // statistics here. - if (isClassification) { - new ClassificationLeafNode(prediction = oldNode.predict.predict, - impurity = oldNode.impurity, impurityStats = null) - } else { - new RegressionLeafNode(prediction = oldNode.predict.predict, - impurity = oldNode.impurity, impurityStats = null) - } + new LeafNode(prediction = oldNode.predict.predict, + impurity = oldNode.impurity, impurityStats = null) } else { val gain = if (oldNode.stats.nonEmpty) { oldNode.stats.get.gain } else { 0.0 } - if (isClassification) { - new ClassificationInternalNode(prediction = oldNode.predict.predict, - impurity = oldNode.impurity, gain = gain, - leftChild = fromOld(oldNode.leftNode.get, categoricalFeatures, true) - .asInstanceOf[ClassificationNode], - rightChild = fromOld(oldNode.rightNode.get, categoricalFeatures, true) - .asInstanceOf[ClassificationNode], - split = Split.fromOld(oldNode.split.get, categoricalFeatures), impurityStats = null) - } else { - new RegressionInternalNode(prediction = oldNode.predict.predict, - impurity = oldNode.impurity, gain = gain, - leftChild = fromOld(oldNode.leftNode.get, categoricalFeatures, false) - .asInstanceOf[RegressionNode], - rightChild = fromOld(oldNode.rightNode.get, categoricalFeatures, false) - .asInstanceOf[RegressionNode], - split = Split.fromOld(oldNode.split.get, categoricalFeatures), impurityStats = null) - } + new InternalNode(prediction = oldNode.predict.predict, impurity = oldNode.impurity, + gain = gain, leftChild = fromOld(oldNode.leftNode.get, categoricalFeatures), + rightChild = fromOld(oldNode.rightNode.get, categoricalFeatures), + split = Split.fromOld(oldNode.split.get, categoricalFeatures), impurityStats = null) } } } -@Since("2.4.0") -sealed trait ClassificationNode extends Node { - - /** - * Get count of training examples for specified label in this node - * @param label label number in the range [0, numClasses) - */ - @Since("2.4.0") - def getLabelCount(label: Int): Double = { - require(label >= 0 && label < impurityStats.stats.length, - "label should be in the range between 0 (inclusive) " + - s"and ${impurityStats.stats.length} (exclusive).") - impurityStats.stats(label) - } -} - -@Since("2.4.0") -sealed trait RegressionNode extends Node { - - /** Number of training data points in this node */ - @Since("2.4.0") - def getCount: Double = impurityStats.stats(0) - - /** Sum over training data points of the labels in this node */ - @Since("2.4.0") - def getSum: Double = impurityStats.stats(1) - - /** Sum over training data points of the square of the labels in this node */ - @Since("2.4.0") - def 
getSumOfSquares: Double = impurityStats.stats(2) -} - -@Since("2.4.0") -sealed trait LeafNode extends Node { - - /** Prediction this node makes. */ - def prediction: Double - - def impurity: Double +/** + * Decision tree leaf node. + * @param prediction Prediction this node makes + * @param impurity Impurity measure at this node (for training data) + */ +class LeafNode private[ml] ( + override val prediction: Double, + override val impurity: Double, + override private[ml] val impurityStats: ImpurityCalculator) extends Node { override def toString: String = s"LeafNode(prediction = $prediction, impurity = $impurity)" @@ -188,58 +135,32 @@ sealed trait LeafNode extends Node { override private[ml] def maxSplitFeatureIndex(): Int = -1 -} - -/** - * Decision tree leaf node for classification. - */ -@Since("2.4.0") -class ClassificationLeafNode private[ml] ( - override val prediction: Double, - override val impurity: Double, - override private[ml] val impurityStats: ImpurityCalculator) - extends ClassificationNode with LeafNode { - override private[tree] def deepCopy(): Node = { - new ClassificationLeafNode(prediction, impurity, impurityStats) + new LeafNode(prediction, impurity, impurityStats) } } /** - * Decision tree leaf node for regression. + * Internal Decision Tree node. + * @param prediction Prediction this node would make if it were a leaf node + * @param impurity Impurity measure at this node (for training data) + * @param gain Information gain value. Values less than 0 indicate missing values; + * this quirk will be removed with future updates. + * @param leftChild Left-hand child node + * @param rightChild Right-hand child node + * @param split Information about the test used to split to the left or right child. */ -@Since("2.4.0") -class RegressionLeafNode private[ml] ( +class InternalNode private[ml] ( override val prediction: Double, override val impurity: Double, - override private[ml] val impurityStats: ImpurityCalculator) - extends RegressionNode with LeafNode { - - override private[tree] def deepCopy(): Node = { - new RegressionLeafNode(prediction, impurity, impurityStats) - } -} - -/** - * Internal Decision Tree node. - */ -@Since("2.4.0") -sealed trait InternalNode extends Node { - - /** - * Information gain value. Values less than 0 indicate missing values; - * this quirk will be removed with future updates. - */ - def gain: Double - - /** Left-hand child node */ - def leftChild: Node - - /** Right-hand child node */ - def rightChild: Node + val gain: Double, + val leftChild: Node, + val rightChild: Node, + val split: Split, + override private[ml] val impurityStats: ImpurityCalculator) extends Node { - /** Information about the test used to split to the left or right child. */ - def split: Split + // Note to developers: The constructor argument impurityStats should be reconsidered before we + // make the constructor public. We may be able to improve the representation. override def toString: String = { s"InternalNode(prediction = $prediction, impurity = $impurity, split = $split)" @@ -284,6 +205,11 @@ sealed trait InternalNode extends Node { math.max(split.featureIndex, math.max(leftChild.maxSplitFeatureIndex(), rightChild.maxSplitFeatureIndex())) } + + override private[tree] def deepCopy(): Node = { + new InternalNode(prediction, impurity, gain, leftChild.deepCopy(), rightChild.deepCopy(), + split, impurityStats) + } } private object InternalNode { @@ -314,57 +240,6 @@ private object InternalNode { } } -/** - * Internal Decision Tree node for regression. 
- */ -@Since("2.4.0") -class ClassificationInternalNode private[ml] ( - override val prediction: Double, - override val impurity: Double, - override val gain: Double, - override val leftChild: ClassificationNode, - override val rightChild: ClassificationNode, - override val split: Split, - override private[ml] val impurityStats: ImpurityCalculator) - extends ClassificationNode with InternalNode { - - // Note to developers: The constructor argument impurityStats should be reconsidered before we - // make the constructor public. We may be able to improve the representation. - - override private[tree] def deepCopy(): Node = { - new ClassificationInternalNode(prediction, impurity, gain, - leftChild.deepCopy().asInstanceOf[ClassificationNode], - rightChild.deepCopy().asInstanceOf[ClassificationNode], - split, impurityStats) - } -} - -/** - * Internal Decision Tree node for regression. - */ -@Since("2.4.0") -class RegressionInternalNode private[ml] ( - override val prediction: Double, - override val impurity: Double, - override val gain: Double, - override val leftChild: RegressionNode, - override val rightChild: RegressionNode, - override val split: Split, - override private[ml] val impurityStats: ImpurityCalculator) - extends RegressionNode with InternalNode { - - // Note to developers: The constructor argument impurityStats should be reconsidered before we - // make the constructor public. We may be able to improve the representation. - - override private[tree] def deepCopy(): Node = { - new RegressionInternalNode(prediction, impurity, gain, - leftChild.deepCopy().asInstanceOf[RegressionNode], - rightChild.deepCopy().asInstanceOf[RegressionNode], - split, impurityStats) - } -} - - /** * Version of a node used in learning. This uses vars so that we can modify nodes as we split the * tree by adding children, etc. @@ -390,52 +265,30 @@ private[tree] class LearningNode( var isLeaf: Boolean, var stats: ImpurityStats) extends Serializable { - def toNode(isClassification: Boolean): Node = toNode(isClassification, prune = true) - - def toClassificationNode(prune: Boolean = true): ClassificationNode = { - toNode(true, prune).asInstanceOf[ClassificationNode] - } - - def toRegressionNode(prune: Boolean = true): RegressionNode = { - toNode(false, prune).asInstanceOf[RegressionNode] - } + def toNode: Node = toNode(prune = true) /** * Convert this [[LearningNode]] to a regular [[Node]], and recurse on any children. */ - def toNode(isClassification: Boolean, prune: Boolean): Node = { + def toNode(prune: Boolean = true): Node = { if (!leftChild.isEmpty || !rightChild.isEmpty) { assert(leftChild.nonEmpty && rightChild.nonEmpty && split.nonEmpty && stats != null, "Unknown error during Decision Tree learning. 
Could not convert LearningNode to Node.") - (leftChild.get.toNode(isClassification, prune), - rightChild.get.toNode(isClassification, prune)) match { + (leftChild.get.toNode(prune), rightChild.get.toNode(prune)) match { case (l: LeafNode, r: LeafNode) if prune && l.prediction == r.prediction => - if (isClassification) { - new ClassificationLeafNode(l.prediction, stats.impurity, stats.impurityCalculator) - } else { - new RegressionLeafNode(l.prediction, stats.impurity, stats.impurityCalculator) - } + new LeafNode(l.prediction, stats.impurity, stats.impurityCalculator) case (l, r) => - if (isClassification) { - new ClassificationInternalNode(stats.impurityCalculator.predict, stats.impurity, - stats.gain, l.asInstanceOf[ClassificationNode], r.asInstanceOf[ClassificationNode], - split.get, stats.impurityCalculator) - } else { - new RegressionInternalNode(stats.impurityCalculator.predict, stats.impurity, stats.gain, - l.asInstanceOf[RegressionNode], r.asInstanceOf[RegressionNode], - split.get, stats.impurityCalculator) - } + new InternalNode(stats.impurityCalculator.predict, stats.impurity, stats.gain, + l, r, split.get, stats.impurityCalculator) } } else { - // Here we want to keep same behavior with the old mllib.DecisionTreeModel - val impurity = if (stats.valid) stats.impurity else -1.0 - if (isClassification) { - new ClassificationLeafNode(stats.impurityCalculator.predict, impurity, + if (stats.valid) { + new LeafNode(stats.impurityCalculator.predict, stats.impurity, stats.impurityCalculator) } else { - new RegressionLeafNode(stats.impurityCalculator.predict, impurity, - stats.impurityCalculator) + // Here we want to keep same behavior with the old mllib.DecisionTreeModel + new LeafNode(stats.impurityCalculator.predict, -1.0, stats.impurityCalculator) } } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala index 4cdd17266b77..822abd2d3522 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala @@ -226,23 +226,23 @@ private[spark] object RandomForest extends Logging with Serializable { case Some(uid) => if (strategy.algo == OldAlgo.Classification) { topNodes.map { rootNode => - new DecisionTreeClassificationModel(uid, rootNode.toClassificationNode(prune), - numFeatures, strategy.getNumClasses) + new DecisionTreeClassificationModel(uid, rootNode.toNode(prune), numFeatures, + strategy.getNumClasses) } } else { topNodes.map { rootNode => - new DecisionTreeRegressionModel(uid, rootNode.toRegressionNode(prune), numFeatures) + new DecisionTreeRegressionModel(uid, rootNode.toNode(prune), numFeatures) } } case None => if (strategy.algo == OldAlgo.Classification) { topNodes.map { rootNode => - new DecisionTreeClassificationModel(rootNode.toClassificationNode(prune), numFeatures, + new DecisionTreeClassificationModel(rootNode.toNode(prune), numFeatures, strategy.getNumClasses) } } else { topNodes.map(rootNode => - new DecisionTreeRegressionModel(rootNode.toRegressionNode(prune), numFeatures)) + new DecisionTreeRegressionModel(rootNode.toNode(prune), numFeatures)) } } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala index f027b14f1d47..4aa4c3617e7f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala @@ -219,10 
+219,8 @@ private[ml] object TreeEnsembleModel { importances.changeValue(feature, scaledGain, _ + scaledGain) computeFeatureImportance(n.leftChild, importances) computeFeatureImportance(n.rightChild, importances) - case _: LeafNode => + case n: LeafNode => // do nothing - case _ => - throw new IllegalArgumentException(s"Unknown node type: ${node.getClass.toString}") } } @@ -319,8 +317,6 @@ private[ml] object DecisionTreeModelReadWrite { (Seq(NodeData(id, node.prediction, node.impurity, node.impurityStats.stats, -1.0, -1, -1, SplitData(-1, Array.empty[Double], -1))), id) - case _ => - throw new IllegalArgumentException(s"Unknown node type: ${node.getClass.toString}") } } @@ -331,7 +327,7 @@ private[ml] object DecisionTreeModelReadWrite { def loadTreeNodes( path: String, metadata: DefaultParamsReader.Metadata, - sparkSession: SparkSession, isClassification: Boolean): Node = { + sparkSession: SparkSession): Node = { import sparkSession.implicits._ implicit val format = DefaultFormats @@ -343,7 +339,7 @@ private[ml] object DecisionTreeModelReadWrite { val dataPath = new Path(path, "data").toString val data = sparkSession.read.parquet(dataPath).as[NodeData] - buildTreeFromNodes(data.collect(), impurityType, isClassification) + buildTreeFromNodes(data.collect(), impurityType) } /** @@ -352,8 +348,7 @@ private[ml] object DecisionTreeModelReadWrite { * @param impurityType Impurity type for this tree * @return Root node of reconstructed tree */ - def buildTreeFromNodes(data: Array[NodeData], impurityType: String, - isClassification: Boolean): Node = { + def buildTreeFromNodes(data: Array[NodeData], impurityType: String): Node = { // Load all nodes, sorted by ID. val nodes = data.sortBy(_.id) // Sanity checks; could remove @@ -369,21 +364,10 @@ private[ml] object DecisionTreeModelReadWrite { val node = if (n.leftChild != -1) { val leftChild = finalNodes(n.leftChild) val rightChild = finalNodes(n.rightChild) - if (isClassification) { - new ClassificationInternalNode(n.prediction, n.impurity, n.gain, - leftChild.asInstanceOf[ClassificationNode], rightChild.asInstanceOf[ClassificationNode], - n.split.getSplit, impurityStats) - } else { - new RegressionInternalNode(n.prediction, n.impurity, n.gain, - leftChild.asInstanceOf[RegressionNode], rightChild.asInstanceOf[RegressionNode], - n.split.getSplit, impurityStats) - } + new InternalNode(n.prediction, n.impurity, n.gain, leftChild, rightChild, + n.split.getSplit, impurityStats) } else { - if (isClassification) { - new ClassificationLeafNode(n.prediction, n.impurity, impurityStats) - } else { - new RegressionLeafNode(n.prediction, n.impurity, impurityStats) - } + new LeafNode(n.prediction, n.impurity, impurityStats) } finalNodes(n.id) = node } @@ -437,8 +421,7 @@ private[ml] object EnsembleModelReadWrite { path: String, sql: SparkSession, className: String, - treeClassName: String, - isClassification: Boolean): (Metadata, Array[(Metadata, Node)], Array[Double]) = { + treeClassName: String): (Metadata, Array[(Metadata, Node)], Array[Double]) = { import sql.implicits._ implicit val format = DefaultFormats val metadata = DefaultParamsReader.loadMetadata(path, sql.sparkContext, className) @@ -466,8 +449,7 @@ private[ml] object EnsembleModelReadWrite { val rootNodesRDD: RDD[(Int, Node)] = nodeData.rdd.map(d => (d.treeID, d.nodeData)).groupByKey().map { case (treeID: Int, nodeData: Iterable[NodeData]) => - treeID -> DecisionTreeModelReadWrite.buildTreeFromNodes( - nodeData.toArray, impurityType, isClassification) + treeID -> 
DecisionTreeModelReadWrite.buildTreeFromNodes(nodeData.toArray, impurityType) } val rootNodes: Array[Node] = rootNodesRDD.sortByKey().values.collect() (metadata, treesMetadata.zip(rootNodes), treesWeights) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala index 7b73b286fb91..8935c8496cdb 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala @@ -160,7 +160,7 @@ object HashingTF { case d: Double => hashLong(java.lang.Double.doubleToLongBits(d), seed) case s: String => val utf8 = UTF8String.fromString(s) - hashUnsafeBytesBlock(utf8.getMemoryBlock(), seed) + hashUnsafeBytes(utf8.getBaseObject, utf8.getBaseOffset, utf8.numBytes(), seed) case _ => throw new SparkException("HashingTF with murmur3 algorithm does not " + s"support type ${term.getClass.getCanonicalName} of input data.") } diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala index d3dbb4e754d3..2930f4900d50 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala @@ -21,7 +21,7 @@ import org.apache.spark.SparkFunSuite import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite -import org.apache.spark.ml.tree.ClassificationLeafNode +import org.apache.spark.ml.tree.LeafNode import org.apache.spark.ml.tree.impl.TreeTests import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint} @@ -61,8 +61,7 @@ class DecisionTreeClassifierSuite extends MLTest with DefaultReadWriteTest { test("params") { ParamsSuite.checkParams(new DecisionTreeClassifier) - val model = new DecisionTreeClassificationModel("dtc", - new ClassificationLeafNode(0.0, 0.0, null), 1, 2) + val model = new DecisionTreeClassificationModel("dtc", new LeafNode(0.0, 0.0, null), 1, 2) ParamsSuite.checkParams(model) } @@ -376,32 +375,6 @@ class DecisionTreeClassifierSuite extends MLTest with DefaultReadWriteTest { testDefaultReadWrite(model) } - - test("label/impurity stats") { - val arr = Array( - LabeledPoint(0.0, Vectors.sparse(2, Seq((0, 0.0)))), - LabeledPoint(1.0, Vectors.sparse(2, Seq((1, 1.0)))), - LabeledPoint(0.0, Vectors.sparse(2, Seq((0, 1.0))))) - val rdd = sc.parallelize(arr) - val df = TreeTests.setMetadata(rdd, Map.empty[Int, Int], 2) - val dt1 = new DecisionTreeClassifier() - .setImpurity("entropy") - .setMaxDepth(2) - .setMinInstancesPerNode(2) - val model1 = dt1.fit(df) - - val rootNode1 = model1.rootNode - assert(Array(rootNode1.getLabelCount(0), rootNode1.getLabelCount(1)) === Array(2.0, 1.0)) - - val dt2 = new DecisionTreeClassifier() - .setImpurity("gini") - .setMaxDepth(2) - .setMinInstancesPerNode(2) - val model2 = dt2.fit(df) - - val rootNode2 = model2.rootNode - assert(Array(rootNode2.getLabelCount(0), rootNode2.getLabelCount(1)) === Array(2.0, 1.0)) - } } private[ml] object DecisionTreeClassifierSuite extends SparkFunSuite { diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala 
index e6d2a8e2b900..304977634189 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala @@ -24,7 +24,7 @@ import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.regression.DecisionTreeRegressionModel -import org.apache.spark.ml.tree.RegressionLeafNode +import org.apache.spark.ml.tree.LeafNode import org.apache.spark.ml.tree.impl.{GradientBoostedTrees, TreeTests} import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ @@ -70,7 +70,7 @@ class GBTClassifierSuite extends MLTest with DefaultReadWriteTest { test("params") { ParamsSuite.checkParams(new GBTClassifier) val model = new GBTClassificationModel("gbtc", - Array(new DecisionTreeRegressionModel("dtr", new RegressionLeafNode(0.0, 0.0, null), 1)), + Array(new DecisionTreeRegressionModel("dtr", new LeafNode(0.0, 0.0, null), 1)), Array(1.0), 1, 2) ParamsSuite.checkParams(model) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index 75c2aeb14678..84c10e2f85c8 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -79,7 +79,9 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { generateMultinomialLogisticInput(coefficients, xMean, xVariance, addIntercept = true, nPoints, seed) - sc.parallelize(testData, 4).toDF().withColumn("weight", rand(seed)) + val df = sc.parallelize(testData, 4).toDF().withColumn("weight", rand(seed)) + df.cache() + df } multinomialDataset = { @@ -1130,9 +1132,9 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { } test("binary logistic regression with intercept with ElasticNet regularization") { - val trainer1 = (new LogisticRegression).setFitIntercept(true).setMaxIter(200) + val trainer1 = (new LogisticRegression).setFitIntercept(true).setMaxIter(120) .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(true).setWeightCol("weight") - val trainer2 = (new LogisticRegression).setFitIntercept(true) + val trainer2 = (new LogisticRegression).setFitIntercept(true).setMaxIter(30) .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(false).setWeightCol("weight") val model1 = trainer1.fit(binaryDataset) @@ -1174,7 +1176,7 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { val coefficientsR = Vectors.dense(0.0, 0.0, -0.1846038, -0.0559614) val interceptR = 0.5024256 - assert(model1.intercept ~== interceptRStd relTol 6E-3) + assert(model1.intercept ~== interceptRStd relTol 6E-2) assert(model1.coefficients ~== coefficientsRStd absTol 5E-3) assert(model2.intercept ~== interceptR relTol 6E-3) assert(model2.coefficients ~= coefficientsR absTol 1E-3) @@ -1677,10 +1679,10 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { // use tighter constraints because OWL-QN solver takes longer to converge val trainer1 = (new LogisticRegression).setFitIntercept(true) .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(true) - .setMaxIter(300).setTol(1e-10).setWeightCol("weight") + .setMaxIter(160).setTol(1e-10).setWeightCol("weight") val 
trainer2 = (new LogisticRegression).setFitIntercept(true) .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(false) - .setMaxIter(300).setTol(1e-10).setWeightCol("weight") + .setMaxIter(110).setTol(1e-10).setWeightCol("weight") val model1 = trainer1.fit(multinomialDataset) val model2 = trainer2.fit(multinomialDataset) @@ -1767,7 +1769,7 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { 0.0, 0.0, 0.0, 0.0), isTransposed = true) val interceptsR = Vectors.dense(-0.44215290, 0.76308326, -0.3209304) - assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.02) + assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.05) assert(model1.interceptVector ~== interceptsRStd relTol 0.1) assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps) assert(model2.coefficientMatrix ~== coefficientsR absTol 0.02) @@ -2145,10 +2147,10 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { test("multinomial logistic regression with intercept with elasticnet regularization") { val trainer1 = (new LogisticRegression).setFitIntercept(true).setWeightCol("weight") .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(true) - .setMaxIter(300).setTol(1e-10) + .setMaxIter(220).setTol(1e-10) val trainer2 = (new LogisticRegression).setFitIntercept(true).setWeightCol("weight") .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(false) - .setMaxIter(300).setTol(1e-10) + .setMaxIter(90).setTol(1e-10) val model1 = trainer1.fit(multinomialDataset) val model2 = trainer2.fit(multinomialDataset) @@ -2234,8 +2236,8 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { 0.0, 0.0, 0.0, 0.0), isTransposed = true) val interceptsR = Vectors.dense(-0.38857157, 0.62492165, -0.2363501) - assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01) - assert(model1.interceptVector ~== interceptsRStd absTol 0.01) + assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.05) + assert(model1.interceptVector ~== interceptsRStd absTol 0.1) assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps) assert(model2.coefficientMatrix ~== coefficientsR absTol 0.01) assert(model2.interceptVector ~== interceptsR absTol 0.01) @@ -2245,10 +2247,10 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { test("multinomial logistic regression without intercept with elasticnet regularization") { val trainer1 = (new LogisticRegression).setFitIntercept(false).setWeightCol("weight") .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(true) - .setMaxIter(300).setTol(1e-10) + .setMaxIter(75).setTol(1e-10) val trainer2 = (new LogisticRegression).setFitIntercept(false).setWeightCol("weight") .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(false) - .setMaxIter(300).setTol(1e-10) + .setMaxIter(50).setTol(1e-10) val model1 = trainer1.fit(multinomialDataset) val model2 = trainer2.fit(multinomialDataset) diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala index 3062aa9f3d27..ba4a9cf08278 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala @@ -21,7 +21,7 @@ import org.apache.spark.SparkFunSuite import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.linalg.{Vector, Vectors} import 
org.apache.spark.ml.param.ParamsSuite -import org.apache.spark.ml.tree.ClassificationLeafNode +import org.apache.spark.ml.tree.LeafNode import org.apache.spark.ml.tree.impl.TreeTests import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint} @@ -71,8 +71,7 @@ class RandomForestClassifierSuite extends MLTest with DefaultReadWriteTest { test("params") { ParamsSuite.checkParams(new RandomForestClassifier) val model = new RandomForestClassificationModel("rfc", - Array(new DecisionTreeClassificationModel("dtc", - new ClassificationLeafNode(0.0, 0.0, null), 1, 2)), 2, 2) + Array(new DecisionTreeClassificationModel("dtc", new LeafNode(0.0, 0.0, null), 1, 2)), 2, 2) ParamsSuite.checkParams(model) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala index ed15a1d88a26..a4d388fd321d 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorAssemblerSuite.scala @@ -256,4 +256,9 @@ class VectorAssemblerSuite assert(runWithMetadata("keep", additional_filter = "id1 > 2").count() == 4) } + test("SPARK-25371: VectorAssembler with empty inputCols") { + val vectorAssembler = new VectorAssembler().setInputCols(Array()).setOutputCol("a") + val output = vectorAssembler.transform(dfWithNullsAndNaNs) + assert(output.select("a").limit(1).collect().head == Row(Vectors.sparse(0, Seq.empty))) + } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala index 9ae27339b11d..29a438396516 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala @@ -191,20 +191,6 @@ class DecisionTreeRegressorSuite extends MLTest with DefaultReadWriteTest { TreeTests.allParamSettings ++ Map("maxDepth" -> 0), TreeTests.allParamSettings ++ Map("maxDepth" -> 0), checkModelData) } - - test("label/impurity stats") { - val categoricalFeatures = Map(0 -> 2, 1 -> 2) - val df = TreeTests.setMetadata(categoricalDataPointsRDD, categoricalFeatures, numClasses = 0) - val dtr = new DecisionTreeRegressor() - .setImpurity("variance") - .setMaxDepth(2) - .setMaxBins(8) - val model = dtr.fit(df) - val statInfo = model.rootNode - - assert(statInfo.getCount == 1000.0 && statInfo.getSum == 600.0 - && statInfo.getSumOfSquares == 600.0) - } } private[ml] object DecisionTreeRegressorSuite extends SparkFunSuite { diff --git a/mllib/src/test/scala/org/apache/spark/ml/source/image/ImageFileFormatSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/source/image/ImageFileFormatSuite.scala index 1a6a8d67d8d6..38e25131df86 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/source/image/ImageFileFormatSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/source/image/ImageFileFormatSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.ml.source.image +import java.net.URI import java.nio.file.Paths import org.apache.spark.SparkFunSuite @@ -58,8 +59,14 @@ class ImageFileFormatSuite extends SparkFunSuite with MLlibTestSparkContext { .load(filePath) assert(df2.count() === 1) val result = df2.head() - assert(result === invalidImageRow( - Paths.get(filePath).toAbsolutePath().normalize().toUri().toString)) + + 
val resultOrigin = result.getStruct(0).getString(0) + // convert `origin` to a `java.net.URI` object and then compare, + // because `file:/path` and `file:///path` are both valid URI representations of the same path + assert(new URI(resultOrigin) === Paths.get(filePath).toAbsolutePath().normalize().toUri()) + + // Check that the other columns in the row match those of `invalidImageRow` + assert(result === invalidImageRow(resultOrigin)) } test("image datasource partition test") { diff --git a/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala index 4dbbd75d2466..743dacf146fe 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala @@ -340,8 +340,8 @@ class RandomForestSuite extends SparkFunSuite with MLlibTestSparkContext { assert(topNode.stats.impurity > 0.0) // set impurity and predict for child nodes - assert(topNode.leftChild.get.toNode(isClassification = true).prediction === 0.0) - assert(topNode.rightChild.get.toNode(isClassification = true).prediction === 1.0) + assert(topNode.leftChild.get.toNode.prediction === 0.0) + assert(topNode.rightChild.get.toNode.prediction === 1.0) assert(topNode.leftChild.get.stats.impurity === 0.0) assert(topNode.rightChild.get.stats.impurity === 0.0) } @@ -382,8 +382,8 @@ class RandomForestSuite extends SparkFunSuite with MLlibTestSparkContext { assert(topNode.stats.impurity > 0.0) // set impurity and predict for child nodes - assert(topNode.leftChild.get.toNode(isClassification = true).prediction === 0.0) - assert(topNode.rightChild.get.toNode(isClassification = true).prediction === 1.0) + assert(topNode.leftChild.get.toNode.prediction === 0.0) + assert(topNode.rightChild.get.toNode.prediction === 1.0) assert(topNode.leftChild.get.stats.impurity === 0.0) assert(topNode.rightChild.get.stats.impurity === 0.0) } @@ -582,18 +582,18 @@ class RandomForestSuite extends SparkFunSuite with MLlibTestSparkContext { left right */ val leftImp = new GiniCalculator(Array(3.0, 2.0, 1.0)) - val left = new ClassificationLeafNode(0.0, leftImp.calculate(), leftImp) + val left = new LeafNode(0.0, leftImp.calculate(), leftImp) val rightImp = new GiniCalculator(Array(1.0, 2.0, 5.0)) - val right = new ClassificationLeafNode(2.0, rightImp.calculate(), rightImp) + val right = new LeafNode(2.0, rightImp.calculate(), rightImp) - val parent = TreeTests.buildParentNode(left, right, new ContinuousSplit(0, 0.5), true) + val parent = TreeTests.buildParentNode(left, right, new ContinuousSplit(0, 0.5)) val parentImp = parent.impurityStats val left2Imp = new GiniCalculator(Array(1.0, 6.0, 1.0)) - val left2 = new ClassificationLeafNode(0.0, left2Imp.calculate(), left2Imp) + val left2 = new LeafNode(0.0, left2Imp.calculate(), left2Imp) - val grandParent = TreeTests.buildParentNode(left2, parent, new ContinuousSplit(1, 1.0), true) + val grandParent = TreeTests.buildParentNode(left2, parent, new ContinuousSplit(1, 1.0)) + val grandImp = grandParent.impurityStats // Test feature importance computed at different subtrees.
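The feature-importance test above leans on the same impurity arithmetic that `TreeTests.buildParentNode` computes in this patch: a split's gain is the parent's Gini impurity minus the count-weighted impurities of its children. The following is a minimal, self-contained sketch of that arithmetic only, in plain Scala with no Spark dependencies; `GiniSketch` and its helpers are illustrative names, not part of the patch.

object GiniSketch {
  // Gini impurity for a vector of per-class counts: 1 - sum((c_i / N)^2)
  def giniImpurity(counts: Array[Double]): Double = {
    val total = counts.sum
    if (total == 0.0) 0.0
    else 1.0 - counts.map(c => (c / total) * (c / total)).sum
  }

  // Information gain of a split: parent impurity minus the count-weighted child impurities.
  def gain(left: Array[Double], right: Array[Double]): Double = {
    val parent = left.zip(right).map { case (l, r) => l + r }
    val total = parent.sum
    giniImpurity(parent) -
      (left.sum / total) * giniImpurity(left) -
      (right.sum / total) * giniImpurity(right)
  }

  def main(args: Array[String]): Unit = {
    // Same per-class counts as the left/right leaves in the test above.
    val left = Array(3.0, 2.0, 1.0)
    val right = Array(1.0, 2.0, 5.0)
    println(f"parent gain = ${gain(left, right)}%.4f")
  }
}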
@@ -618,8 +618,8 @@ class RandomForestSuite extends SparkFunSuite with MLlibTestSparkContext { // Forest consisting of (full tree) + (internal node with 2 leafs) val trees = Array(parent, grandParent).map { root => - new DecisionTreeClassificationModel(root.asInstanceOf[ClassificationNode], - numFeatures = 2, numClasses = 3).asInstanceOf[DecisionTreeModel] + new DecisionTreeClassificationModel(root, numFeatures = 2, numClasses = 3) + .asInstanceOf[DecisionTreeModel] } val importances: Vector = TreeEnsembleModel.featureImportances(trees, 2) val tree2norm = feature0importance + feature1importance diff --git a/mllib/src/test/scala/org/apache/spark/ml/tree/impl/TreeTests.scala b/mllib/src/test/scala/org/apache/spark/ml/tree/impl/TreeTests.scala index 3f03d909d4a4..b6894b30b0c2 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tree/impl/TreeTests.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tree/impl/TreeTests.scala @@ -159,7 +159,7 @@ private[ml] object TreeTests extends SparkFunSuite { * @param split Split for parent node * @return Parent node with children attached */ - def buildParentNode(left: Node, right: Node, split: Split, isClassification: Boolean): Node = { + def buildParentNode(left: Node, right: Node, split: Split): Node = { val leftImp = left.impurityStats val rightImp = right.impurityStats val parentImp = leftImp.copy.add(rightImp) @@ -168,15 +168,7 @@ private[ml] object TreeTests extends SparkFunSuite { val gain = parentImp.calculate() - (leftWeight * leftImp.calculate() + rightWeight * rightImp.calculate()) val pred = parentImp.predict - if (isClassification) { - new ClassificationInternalNode(pred, parentImp.calculate(), gain, - left.asInstanceOf[ClassificationNode], right.asInstanceOf[ClassificationNode], - split, parentImp) - } else { - new RegressionInternalNode(pred, parentImp.calculate(), gain, - left.asInstanceOf[RegressionNode], right.asInstanceOf[RegressionNode], - split, parentImp) - } + new InternalNode(pred, parentImp.calculate(), gain, left, right, split, parentImp) } /** diff --git a/mllib/src/test/scala/org/apache/spark/ml/util/MLTest.scala b/mllib/src/test/scala/org/apache/spark/ml/util/MLTest.scala index 76d41f9b2371..acac171346a8 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/util/MLTest.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/util/MLTest.scala @@ -21,12 +21,13 @@ import java.io.File import org.scalatest.Suite -import org.apache.spark.SparkContext +import org.apache.spark.{DebugFilesystem, SparkConf, SparkContext} import org.apache.spark.ml.{PredictionModel, Transformer} import org.apache.spark.ml.linalg.Vector import org.apache.spark.sql.{DataFrame, Dataset, Encoder, Row} import org.apache.spark.sql.execution.streaming.MemoryStream import org.apache.spark.sql.functions.col +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.streaming.StreamTest import org.apache.spark.sql.test.TestSparkSession import org.apache.spark.util.Utils @@ -36,6 +37,13 @@ trait MLTest extends StreamTest with TempDirectory { self: Suite => @transient var sc: SparkContext = _ @transient var checkpointDir: String = _ + protected override def sparkConf = { + new SparkConf() + .set("spark.hadoop.fs.file.impl", classOf[DebugFilesystem].getName) + .set("spark.unsafe.exceptionOnMemoryLeak", "true") + .set(SQLConf.CODEGEN_FALLBACK.key, "false") + } + protected override def createSparkSession: TestSparkSession = { new TestSparkSession(new SparkContext("local[2]", "MLlibUnitTest", sparkConf)) } diff --git 
a/mllib/src/test/scala/org/apache/spark/mllib/linalg/UDTSerializationBenchmark.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/UDTSerializationBenchmark.scala index 5973479dfb5e..6c1d58089867 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/UDTSerializationBenchmark.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/UDTSerializationBenchmark.scala @@ -17,53 +17,55 @@ package org.apache.spark.mllib.linalg +import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder -import org.apache.spark.util.Benchmark /** * Serialization benchmark for VectorUDT. + * To run this benchmark: + * {{{ + * 1. without sbt: bin/spark-submit --class + * 2. build/sbt "mllib/test:runMain " + * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "mllib/test:runMain " + * Results will be written to "benchmarks/UDTSerializationBenchmark-results.txt". + * }}} */ -object UDTSerializationBenchmark { +object UDTSerializationBenchmark extends BenchmarkBase { - def main(args: Array[String]): Unit = { - val iters = 1e2.toInt - val numRows = 1e3.toInt + override def runBenchmarkSuite(): Unit = { - val encoder = ExpressionEncoder[Vector].resolveAndBind() + runBenchmark("VectorUDT de/serialization") { + val iters = 1e2.toInt + val numRows = 1e3.toInt - val vectors = (1 to numRows).map { i => - Vectors.dense(Array.fill(1e5.toInt)(1.0 * i)) - }.toArray - val rows = vectors.map(encoder.toRow) + val encoder = ExpressionEncoder[Vector].resolveAndBind() - val benchmark = new Benchmark("VectorUDT de/serialization", numRows, iters) + val vectors = (1 to numRows).map { i => + Vectors.dense(Array.fill(1e5.toInt)(1.0 * i)) + }.toArray + val rows = vectors.map(encoder.toRow) - benchmark.addCase("serialize") { _ => - var sum = 0 - var i = 0 - while (i < numRows) { - sum += encoder.toRow(vectors(i)).numFields - i += 1 + val benchmark = new Benchmark("VectorUDT de/serialization", numRows, iters, output = output) + + benchmark.addCase("serialize") { _ => + var sum = 0 + var i = 0 + while (i < numRows) { + sum += encoder.toRow(vectors(i)).numFields + i += 1 + } } - } - benchmark.addCase("deserialize") { _ => - var sum = 0 - var i = 0 - while (i < numRows) { - sum += encoder.fromRow(rows(i)).numActives - i += 1 + benchmark.addCase("deserialize") { _ => + var sum = 0 + var i = 0 + while (i < numRows) { + sum += encoder.fromRow(rows(i)).numActives + i += 1 + } } - } - /* - OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.4.11-200.fc22.x86_64 - Intel Xeon E3-12xx v2 (Ivy Bridge) - VectorUDT de/serialization: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - serialize 265 / 318 0.0 265138.5 1.0X - deserialize 155 / 197 0.0 154611.4 1.7X - */ - benchmark.run() + benchmark.run() + } } } diff --git a/pom.xml b/pom.xml index da526a1709e6..98da38f04553 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT pom Spark Project Parent POM http://spark.apache.org/ @@ -131,7 +131,7 @@ 1.2.1 10.12.1.1 1.10.0 - 1.5.2 + 1.5.3 nohive 1.6.0 9.3.24.v20180605 @@ -158,8 +158,7 @@ 2.11.12 2.11 1.9.13 - 2.6.7 - 2.6.7.1 + 2.9.6 1.1.7.1 1.1.2 1.2.0-incubating @@ -170,7 +169,7 @@ 3.5 3.2.10 - 3.0.9 + 3.0.10 2.22.2 2.9.3 3.5.2 @@ -541,7 +540,7 @@ org.lz4 lz4-java - 1.4.0 + 1.5.0 com.github.luben @@ -629,7 +628,7 @@ com.fasterxml.jackson.core jackson-databind - ${fasterxml.jackson.databind.version} 
+ ${fasterxml.jackson.version} com.fasterxml.jackson.core @@ -641,7 +640,7 @@ com.fasterxml.jackson.module jackson-module-scala_${scala.binary.version} - ${fasterxml.jackson.databind.version} + ${fasterxml.jackson.version} com.google.guava @@ -2683,7 +2682,7 @@ hadoop-2.7 - 2.7.7 + 2.7.3 2.7.1 @@ -2760,7 +2759,7 @@ scala-2.12 - 2.12.6 + 2.12.7 2.12 diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 7ff783da130a..0b074fbf64ed 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -34,8 +34,18 @@ import com.typesafe.tools.mima.core.ProblemFilters._ */ object MimaExcludes { + // Exclude rules for 3.0.x + lazy val v30excludes = v24excludes ++ Seq( + ) + // Exclude rules for 2.4.x lazy val v24excludes = v23excludes ++ Seq( + // [SPARK-23429][CORE] Add executor memory metrics to heartbeat and expose in executors REST API + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.scheduler.SparkListenerExecutorMetricsUpdate.apply"), + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.scheduler.SparkListenerExecutorMetricsUpdate.copy"), + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.scheduler.SparkListenerExecutorMetricsUpdate.this"), + ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.scheduler.SparkListenerExecutorMetricsUpdate$"), + // [SPARK-25248] add package private methods to TaskContext ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.TaskContext.markTaskFailed"), ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.TaskContext.markInterrupted"), @@ -93,13 +103,6 @@ object MimaExcludes { ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.param.Params.defaultParamMap"), ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.param.Params.org$apache$spark$ml$param$Params$_setter_$defaultParamMap_="), - // [SPARK-14681][ML] Provide label/impurity stats for spark.ml decision tree nodes - ProblemFilters.exclude[IncompatibleTemplateDefProblem]("org.apache.spark.ml.tree.LeafNode"), - ProblemFilters.exclude[IncompatibleTemplateDefProblem]("org.apache.spark.ml.tree.InternalNode"), - ProblemFilters.exclude[IncompatibleTemplateDefProblem]("org.apache.spark.ml.tree.Node"), - ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.classification.DecisionTreeClassificationModel.this"), - ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.regression.DecisionTreeRegressionModel.this"), - // [SPARK-7132][ML] Add fit with validation set to spark.ml GBT ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.param.shared.HasValidationIndicatorCol.getValidationIndicatorCol"), ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.param.shared.HasValidationIndicatorCol.org$apache$spark$ml$param$shared$HasValidationIndicatorCol$_setter_$validationIndicatorCol_="), @@ -1196,6 +1199,7 @@ object MimaExcludes { } def excludes(version: String) = version match { + case v if v.startsWith("3.0") => v30excludes case v if v.startsWith("2.4") => v24excludes case v if v.startsWith("2.3") => v23excludes case v if v.startsWith("2.2") => v22excludes diff --git a/python/pyspark/accumulators.py b/python/pyspark/accumulators.py index 30ad04297c68..00ec094e7e3b 100644 --- a/python/pyspark/accumulators.py +++ b/python/pyspark/accumulators.py @@ -109,10 +109,14 @@ def _deserialize_accumulator(aid, zero_value, accum_param): from pyspark.accumulators import 
_accumulatorRegistry - accum = Accumulator(aid, zero_value, accum_param) - accum._deserialized = True - _accumulatorRegistry[aid] = accum - return accum + # If this certain accumulator was deserialized, don't overwrite it. + if aid in _accumulatorRegistry: + return _accumulatorRegistry[aid] + else: + accum = Accumulator(aid, zero_value, accum_param) + accum._deserialized = True + _accumulatorRegistry[aid] = accum + return accum class Accumulator(object): diff --git a/python/pyspark/broadcast.py b/python/pyspark/broadcast.py index b3dfc99962a3..1c7f2a7418df 100644 --- a/python/pyspark/broadcast.py +++ b/python/pyspark/broadcast.py @@ -15,13 +15,16 @@ # limitations under the License. # +import gc import os +import socket import sys -import gc from tempfile import NamedTemporaryFile import threading from pyspark.cloudpickle import print_exec +from pyspark.java_gateway import local_connect_and_auth +from pyspark.serializers import ChunkedStream from pyspark.util import _exception_message if sys.version < '3': @@ -64,19 +67,43 @@ class Broadcast(object): >>> large_broadcast = sc.broadcast(range(10000)) """ - def __init__(self, sc=None, value=None, pickle_registry=None, path=None): + def __init__(self, sc=None, value=None, pickle_registry=None, path=None, + sock_file=None): """ Should not be called directly by users -- use L{SparkContext.broadcast()} instead. """ if sc is not None: + # we're on the driver. We want the pickled data to end up in a file (maybe encrypted) f = NamedTemporaryFile(delete=False, dir=sc._temp_dir) - self._path = self.dump(value, f) - self._jbroadcast = sc._jvm.PythonRDD.readBroadcastFromFile(sc._jsc, self._path) + self._path = f.name + python_broadcast = sc._jvm.PythonRDD.setupBroadcast(self._path) + if sc._encryption_enabled: + # with encryption, we ask the jvm to do the encryption for us, we send it data + # over a socket + port, auth_secret = python_broadcast.setupEncryptionServer() + (encryption_sock_file, _) = local_connect_and_auth(port, auth_secret) + broadcast_out = ChunkedStream(encryption_sock_file, 8192) + else: + # no encryption, we can just write pickled data directly to the file from python + broadcast_out = f + self.dump(value, broadcast_out) + if sc._encryption_enabled: + python_broadcast.waitTillDataReceived() + self._jbroadcast = sc._jsc.broadcast(python_broadcast) self._pickle_registry = pickle_registry else: + # we're on an executor self._jbroadcast = None - self._path = path + if sock_file is not None: + # the jvm is doing decryption for us. 
Read the value + # immediately from the sock_file + self._value = self.load(sock_file) + else: + # the jvm just dumps the pickled data in path -- we'll unpickle lazily when + # the value is requested + assert(path is not None) + self._path = path def dump(self, value, f): try: @@ -89,24 +116,25 @@ def dump(self, value, f): print_exec(sys.stderr) raise pickle.PicklingError(msg) f.close() - return f.name - def load(self, path): + def load_from_path(self, path): with open(path, 'rb', 1 << 20) as f: - # pickle.load() may create lots of objects, disable GC - # temporary for better performance - gc.disable() - try: - return pickle.load(f) - finally: - gc.enable() + return self.load(f) + + def load(self, file): + # "file" could also be a socket + gc.disable() + try: + return pickle.load(file) + finally: + gc.enable() @property def value(self): """ Return the broadcasted value """ if not hasattr(self, "_value") and self._path is not None: - self._value = self.load(self._path) + self._value = self.load_from_path(self._path) return self._value def unpersist(self, blocking=False): diff --git a/python/pyspark/context.py b/python/pyspark/context.py index 4cabae4b2f50..0924d3d95f04 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -33,9 +33,9 @@ from pyspark.broadcast import Broadcast, BroadcastPickleRegistry from pyspark.conf import SparkConf from pyspark.files import SparkFiles -from pyspark.java_gateway import launch_gateway +from pyspark.java_gateway import launch_gateway, local_connect_and_auth from pyspark.serializers import PickleSerializer, BatchedSerializer, UTF8Deserializer, \ - PairDeserializer, AutoBatchedSerializer, NoOpSerializer + PairDeserializer, AutoBatchedSerializer, NoOpSerializer, ChunkedStream from pyspark.storagelevel import StorageLevel from pyspark.rdd import RDD, _load_from_socket, ignore_unicode_prefix from pyspark.traceback_utils import CallSite, first_spark_call @@ -189,6 +189,11 @@ def _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, self._javaAccumulator = self._jvm.PythonAccumulatorV2(host, port, auth_token) self._jsc.sc().register(self._javaAccumulator) + # If encryption is enabled, we need to setup a server in the jvm to read broadcast + # data via a socket. + # scala's mangled names w/ $ in them require special treatment. + self._encryption_enabled = self._jvm.PythonUtils.getEncryptionEnabled(self._jsc) + self.pythonExec = os.environ.get("PYSPARK_PYTHON", 'python') self.pythonVer = "%d.%d" % sys.version_info[:2] @@ -498,23 +503,48 @@ def f(split, iterator): def reader_func(temp_filename): return self._jvm.PythonRDD.readRDDFromFile(self._jsc, temp_filename, numSlices) - jrdd = self._serialize_to_jvm(c, serializer, reader_func) + def createRDDServer(): + return self._jvm.PythonParallelizeServer(self._jsc.sc(), numSlices) + + jrdd = self._serialize_to_jvm(c, serializer, reader_func, createRDDServer) return RDD(jrdd, self, serializer) - def _serialize_to_jvm(self, data, serializer, reader_func): - """ - Calling the Java parallelize() method with an ArrayList is too slow, - because it sends O(n) Py4J commands. As an alternative, serialized - objects are written to a file and loaded through textFile(). - """ - tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir) - try: - serializer.dump_stream(data, tempFile) - tempFile.close() - return reader_func(tempFile.name) - finally: - # readRDDFromFile eagerily reads the file so we can delete right after. 
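The encryption-aware paths added in broadcast.py above and in context.py just below change only how PySpark ships serialized data to the JVM; the user-facing API is unchanged. A minimal sketch of the behaviour (not part of the patch), assuming a local master and the same `spark.io.encryption.enabled` conf that the new EncryptionArrowTests at the end of this diff toggle:

    from pyspark import SparkConf, SparkContext

    conf = SparkConf().set("spark.io.encryption.enabled", "true")
    sc = SparkContext("local[2]", "encrypted_broadcast_sketch", conf=conf)

    # With encryption on, the pickled value is streamed to the JVM over an
    # authenticated local socket (wrapped in ChunkedStream) rather than being
    # written unencrypted to a temp file on disk.
    b = sc.broadcast(list(range(1000)))
    assert sc.parallelize([1, 2, 3]).map(lambda x: x + b.value[0]).collect() == [1, 2, 3]
    sc.stop()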
- os.unlink(tempFile.name) + def _serialize_to_jvm(self, data, serializer, reader_func, createRDDServer): + """ + Using py4j to send a large dataset to the jvm is really slow, so we use either a file + or a socket if we have encryption enabled. + :param data: + :param serializer: + :param reader_func: A function which takes a filename and reads in the data in the jvm and + returns a JavaRDD. Only used when encryption is disabled. + :param createRDDServer: A function which creates a PythonRDDServer in the jvm to + accept the serialized data, for use when encryption is enabled. + :return: + """ + if self._encryption_enabled: + # with encryption, we open a server in java and send the data directly + server = createRDDServer() + (sock_file, _) = local_connect_and_auth(server.port(), server.secret()) + chunked_out = ChunkedStream(sock_file, 8192) + serializer.dump_stream(data, chunked_out) + chunked_out.close() + # this call will block until the server has read all the data and processed it (or + # throws an exception) + r = server.getResult() + return r + else: + # without encryption, we serialize to a file, and we read the file in java and + # parallelize from there. + tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir) + try: + try: + serializer.dump_stream(data, tempFile) + finally: + tempFile.close() + return reader_func(tempFile.name) + finally: + # we eagerily reads the file so we can delete right after. + os.unlink(tempFile.name) def pickleFile(self, name, minPartitions=None): """ diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index d5963f4f7042..ce028512357f 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -773,8 +773,8 @@ def roc(self): which is a Dataframe having two fields (FPR, TPR) with (0.0, 0.0) prepended and (1.0, 1.0) appended to it. - .. seealso:: `Wikipedia reference \ - `_ + .. seealso:: `Wikipedia reference + `_ .. note:: This ignores instance weights (setting all to 1.0) from `LogisticRegression.weightCol`. This will change in later Spark diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index ab449bc3f8f5..5ef4e765ea4e 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -1202,21 +1202,21 @@ class PowerIterationClustering(HasMaxIter, HasWeightCol, JavaParams, JavaMLReada .. note:: Experimental Power Iteration Clustering (PIC), a scalable graph clustering algorithm developed by - Lin and Cohen. From the abstract: + `Lin and Cohen `_. From the abstract: PIC finds a very low-dimensional embedding of a dataset using truncated power iteration on a normalized pair-wise similarity matrix of the data. This class is not yet an Estimator/Transformer, use :py:func:`assignClusters` method to run the PowerIterationClustering algorithm. - .. seealso:: `Wikipedia on Spectral clustering \ - `_ + .. seealso:: `Wikipedia on Spectral clustering + `_ - >>> data = [(1, 0, 0.5), \ - (2, 0, 0.5), (2, 1, 0.7), \ - (3, 0, 0.5), (3, 1, 0.7), (3, 2, 0.9), \ - (4, 0, 0.5), (4, 1, 0.7), (4, 2, 0.9), (4, 3, 1.1), \ - (5, 0, 0.5), (5, 1, 0.7), (5, 2, 0.9), (5, 3, 1.1), (5, 4, 1.3)] + >>> data = [(1, 0, 0.5), + ... (2, 0, 0.5), (2, 1, 0.7), + ... (3, 0, 0.5), (3, 1, 0.7), (3, 2, 0.9), + ... (4, 0, 0.5), (4, 1, 0.7), (4, 2, 0.9), (4, 3, 1.1), + ... 
(5, 0, 0.5), (5, 1, 0.7), (5, 2, 0.9), (5, 3, 1.1), (5, 4, 1.3)] >>> df = spark.createDataFrame(data).toDF("src", "dst", "weight") >>> pic = PowerIterationClustering(k=2, maxIter=40, weightCol="weight") >>> assignments = pic.assignClusters(df) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 760aa82168f5..eccb7acae5b9 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -207,8 +207,8 @@ class BucketedRandomProjectionLSH(JavaEstimator, LSHParams, HasInputCol, HasOutp distance space. The output will be vectors of configurable dimension. Hash values in the same dimension are calculated by the same hash function. - .. seealso:: `Stable Distributions \ - `_ + .. seealso:: `Stable Distributions + `_ .. seealso:: `Hashing for Similarity Search: A Survey `_ >>> from pyspark.ml.linalg import Vectors @@ -303,7 +303,7 @@ def _create_model(self, java_model): class BucketedRandomProjectionLSHModel(LSHModel, JavaMLReadable, JavaMLWritable): - """ + r""" .. note:: Experimental Model fitted by :py:class:`BucketedRandomProjectionLSH`, where multiple random vectors are @@ -653,8 +653,8 @@ class DCT(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWrit The return vector is scaled such that the transform matrix is unitary (aka scaled DCT-II). - .. seealso:: `More information on Wikipedia \ - `_. + .. seealso:: `More information on Wikipedia + `_. >>> from pyspark.ml.linalg import Vectors >>> df1 = spark.createDataFrame([(Vectors.dense([5.0, 8.0, 6.0]),)], ["vec"]) @@ -1353,7 +1353,7 @@ def _create_model(self, java_model): class MinHashLSHModel(LSHModel, JavaMLReadable, JavaMLWritable): - """ + r""" .. note:: Experimental Model produced by :py:class:`MinHashLSH`, where where multiple hash functions are stored. Each @@ -1362,8 +1362,8 @@ class MinHashLSHModel(LSHModel, JavaMLReadable, JavaMLWritable): :math:`h_i(x) = ((x \cdot a_i + b_i) \mod prime)` This hash family is approximately min-wise independent according to the reference. - .. seealso:: Tom Bohman, Colin Cooper, and Alan Frieze. "Min-wise independent linear \ - permutations." Electronic Journal of Combinatorics 7 (2000): R26. + .. seealso:: Tom Bohman, Colin Cooper, and Alan Frieze. "Min-wise independent linear + permutations." Electronic Journal of Combinatorics 7 (2000): R26. .. versionadded:: 2.2.0 """ diff --git a/python/pyspark/ml/fpm.py b/python/pyspark/ml/fpm.py index c2b29b73460f..886ad8409ca6 100644 --- a/python/pyspark/ml/fpm.py +++ b/python/pyspark/ml/fpm.py @@ -158,7 +158,7 @@ class FPGrowth(JavaEstimator, HasItemsCol, HasPredictionCol, HasMinSupport, HasNumPartitions, HasMinConfidence, JavaMLWritable, JavaMLReadable): - """ + r""" .. note:: Experimental A parallel FP-growth algorithm to mine frequent itemsets. The algorithm is described in diff --git a/python/pyspark/ml/image.py b/python/pyspark/ml/image.py index ef6785b4a8ed..edb90a357854 100644 --- a/python/pyspark/ml/image.py +++ b/python/pyspark/ml/image.py @@ -25,8 +25,10 @@ """ import sys +import warnings import numpy as np + from pyspark import SparkContext from pyspark.sql.types import Row, _create_row, _parse_datatype_json_string from pyspark.sql import DataFrame, SparkSession @@ -207,6 +209,9 @@ def readImages(self, path, recursive=False, numPartitions=-1, .. note:: If sample ratio is less than 1, sampling uses a PathFilter that is efficient but potentially non-deterministic. + .. note:: Deprecated in 2.4.0. 
Use `spark.read.format("image").load(path)` instead and + this `readImages` will be removed in 3.0.0. + :param str path: Path to the image directory. :param bool recursive: Recursive search flag. :param int numPartitions: Number of DataFrame partitions. @@ -222,7 +227,8 @@ def readImages(self, path, recursive=False, numPartitions=-1, .. versionadded:: 2.3.0 """ - + warnings.warn("`ImageSchema.readImage` is deprecated. " + + "Use `spark.read.format(\"image\").load(path)` instead.", DeprecationWarning) spark = SparkSession.builder.getOrCreate() image_schema = spark._jvm.org.apache.spark.ml.image.ImageSchema jsession = spark._jsparkSession diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 513ca5a9df85..98f436135184 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -188,8 +188,8 @@ def intercept(self): @property @since("2.3.0") def scale(self): - """ - The value by which \|y - X'w\| is scaled down when loss is "huber", otherwise 1.0. + r""" + The value by which :math:`\|y - X'w\|` is scaled down when loss is "huber", otherwise 1.0. """ return self._call_java("scale") @@ -279,12 +279,12 @@ def featuresCol(self): @property @since("2.0.0") def explainedVariance(self): - """ + r""" Returns the explained variance regression score. - explainedVariance = 1 - variance(y - \hat{y}) / variance(y) + explainedVariance = :math:`1 - \frac{variance(y - \hat{y})}{variance(y)}` - .. seealso:: `Wikipedia explain variation \ - `_ + .. seealso:: `Wikipedia explain variation + `_ .. note:: This ignores instance weights (setting all to 1.0) from `LinearRegression.weightCol`. This will change in later Spark @@ -339,8 +339,8 @@ def r2(self): """ Returns R^2, the coefficient of determination. - .. seealso:: `Wikipedia coefficient of determination \ - `_ + .. seealso:: `Wikipedia coefficient of determination + `_ .. note:: This ignores instance weights (setting all to 1.0) from `LinearRegression.weightCol`. This will change in later Spark @@ -354,8 +354,8 @@ def r2adj(self): """ Returns Adjusted R^2, the adjusted coefficient of determination. - .. seealso:: `Wikipedia coefficient of determination, Adjusted R^2 \ - `_ + .. seealso:: `Wikipedia coefficient of determination, Adjusted R^2 + `_ .. note:: This ignores instance weights (setting all to 1.0) from `LinearRegression.weightCol`. This will change in later Spark versions. diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py index b09469b9f5c2..b1a8af6bcc09 100644 --- a/python/pyspark/mllib/clustering.py +++ b/python/pyspark/mllib/clustering.py @@ -647,7 +647,7 @@ class PowerIterationClustering(object): @classmethod @since('1.5.0') def train(cls, rdd, k, maxIterations=100, initMode="random"): - """ + r""" :param rdd: An RDD of (i, j, s\ :sub:`ij`\) tuples representing the affinity matrix, which is the matrix A in the PIC paper. The diff --git a/python/pyspark/mllib/evaluation.py b/python/pyspark/mllib/evaluation.py index 6c65da58e4e2..0bb0ca37c1ab 100644 --- a/python/pyspark/mllib/evaluation.py +++ b/python/pyspark/mllib/evaluation.py @@ -117,9 +117,9 @@ def __init__(self, predictionAndObservations): @property @since('1.4.0') def explainedVariance(self): - """ + r""" Returns the explained variance regression score. 
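The reworked formula below is easy to sanity-check numerically. A short NumPy sketch with illustrative values (not part of the patch):

    import numpy as np

    y = np.array([3.0, -0.5, 2.0, 7.0])       # observed labels
    y_hat = np.array([2.5, 0.0, 2.0, 8.0])    # predictions
    # explainedVariance = 1 - variance(y - y_hat) / variance(y)
    ev = 1.0 - np.var(y - y_hat) / np.var(y)
    print(round(ev, 4))                        # ~0.9572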
- explainedVariance = 1 - variance(y - \hat{y}) / variance(y) + explainedVariance = :math:`1 - \frac{variance(y - \hat{y})}{variance(y)}` """ return self.call("explainedVariance") diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index 40ecd2e0ff4b..6d7d4d61db04 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -59,7 +59,7 @@ def transform(self, vector): class Normalizer(VectorTransformer): - """ + r""" Normalizes samples individually to unit L\ :sup:`p`\ norm For any 1 <= `p` < float('inf'), normalizes samples using diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index b317156885e5..ccf39e1ffbe9 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -2399,7 +2399,7 @@ def barrier(self): :return: an :class:`RDDBarrier` instance that provides actions within a barrier stage. .. seealso:: :class:`BarrierTaskContext` - .. seealso:: `SPIP: Barrier Execution Mode \ + .. seealso:: `SPIP: Barrier Execution Mode `_ .. seealso:: `Design Doc `_ diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py index 48006778e86f..ff9a612b77f6 100644 --- a/python/pyspark/serializers.py +++ b/python/pyspark/serializers.py @@ -731,6 +731,57 @@ def write_with_length(obj, stream): stream.write(obj) +class ChunkedStream(object): + + """ + This is a file-like object takes a stream of data, of unknown length, and breaks it into fixed + length frames. The intended use case is serializing large data and sending it immediately over + a socket -- we do not want to buffer the entire data before sending it, but the receiving end + needs to know whether or not there is more data coming. + + It works by buffering the incoming data in some fixed-size chunks. If the buffer is full, it + first sends the buffer size, then the data. This repeats as long as there is more data to send. + When this is closed, it sends the length of whatever data is in the buffer, then that data, and + finally a "length" of -1 to indicate the stream has completed. + """ + + def __init__(self, wrapped, buffer_size): + self.buffer_size = buffer_size + self.buffer = bytearray(buffer_size) + self.current_pos = 0 + self.wrapped = wrapped + + def write(self, bytes): + byte_pos = 0 + byte_remaining = len(bytes) + while byte_remaining > 0: + new_pos = byte_remaining + self.current_pos + if new_pos < self.buffer_size: + # just put it in our buffer + self.buffer[self.current_pos:new_pos] = bytes[byte_pos:] + self.current_pos = new_pos + byte_remaining = 0 + else: + # fill the buffer, send the length then the contents, and start filling again + space_left = self.buffer_size - self.current_pos + new_byte_pos = byte_pos + space_left + self.buffer[self.current_pos:self.buffer_size] = bytes[byte_pos:new_byte_pos] + write_int(self.buffer_size, self.wrapped) + self.wrapped.write(self.buffer) + byte_remaining -= space_left + byte_pos = new_byte_pos + self.current_pos = 0 + + def close(self): + # if there is anything left in the buffer, write it out first + if self.current_pos > 0: + write_int(self.current_pos, self.wrapped) + self.wrapped.write(self.buffer[:self.current_pos]) + # -1 length indicates to the receiving end that we're done. 
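ChunkedStream above writes a simple length-prefixed framing: each chunk is preceded by its length, and a length of -1 terminates the stream. A receiving-side sketch (illustrative only; it assumes the 4-byte big-endian signed encoding used by write_int/read_int in this module):

    import io
    import struct

    def read_chunked(stream):
        """Yield the payload chunks written by ChunkedStream until the -1 terminator."""
        while True:
            length = struct.unpack("!i", stream.read(4))[0]
            if length == -1:
                return
            yield stream.read(length)

    # e.g. two frames followed by the end-of-stream marker:
    buf = io.BytesIO(struct.pack("!i", 3) + b"abc" +
                     struct.pack("!i", 2) + b"de" +
                     struct.pack("!i", -1))
    assert list(read_chunked(buf)) == [b"abc", b"de"]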
+ write_int(-1, self.wrapped) + self.wrapped.close() + + if __name__ == '__main__': import doctest (failure_count, test_count) = doctest.testmod() diff --git a/python/pyspark/shell.py b/python/pyspark/shell.py index 472c3cd4452f..65e3bdbc05ce 100644 --- a/python/pyspark/shell.py +++ b/python/pyspark/shell.py @@ -54,7 +54,7 @@ sqlContext = spark._wrapped sqlCtx = sqlContext -print("""Welcome to +print(r"""Welcome to ____ __ / __/__ ___ _____/ /__ _\ \/ _ \/ _ `/ __/ '_/ diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py index 9c094dd9a903..1938965a7e17 100644 --- a/python/pyspark/sql/context.py +++ b/python/pyspark/sql/context.py @@ -485,7 +485,8 @@ def __init__(self, sparkContext, jhiveContext=None): "SparkSession.builder.enableHiveSupport().getOrCreate() instead.", DeprecationWarning) if jhiveContext is None: - sparkSession = SparkSession.builder.enableHiveSupport().getOrCreate() + sparkContext._conf.set("spark.sql.catalogImplementation", "hive") + sparkSession = SparkSession.builder._sparkContext(sparkContext).getOrCreate() else: sparkSession = SparkSession(sparkContext, jhiveContext.sparkSession()) SQLContext.__init__(self, sparkContext, sparkSession, jhiveContext) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 1affc9b4fcf6..bf6b99048761 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -880,16 +880,23 @@ def sampleBy(self, col, fractions, seed=None): | 0| 5| | 1| 9| +---+-----+ + >>> dataset.sampleBy(col("key"), fractions={2: 1.0}, seed=0).count() + 33 + .. versionchanged:: 3.0 + Added sampling by a column of :class:`Column` """ - if not isinstance(col, basestring): - raise ValueError("col must be a string, but got %r" % type(col)) + if isinstance(col, basestring): + col = Column(col) + elif not isinstance(col, Column): + raise ValueError("col must be a string or a column, but got %r" % type(col)) if not isinstance(fractions, dict): raise ValueError("fractions must be a dict but got %r" % type(fractions)) for k, v in fractions.items(): if not isinstance(k, (float, int, long, basestring)): raise ValueError("key must be float, int, long, or string, but got %r" % type(k)) fractions[k] = float(v) + col = col._jc seed = seed if seed is not None else random.randint(0, sys.maxsize) return DataFrame(self._jdf.stat().sampleBy(col, self._jmap(fractions), seed), self.sql_ctx) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 864780e0be9b..5425d311f8c7 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -283,7 +283,8 @@ def approxCountDistinct(col, rsd=None): @since(2.1) def approx_count_distinct(col, rsd=None): - """Aggregate function: returns a new :class:`Column` for approximate distinct count of column `col`. + """Aggregate function: returns a new :class:`Column` for approximate distinct count of + column `col`. :param rsd: maximum estimation error allowed (default = 0.05). For rsd < 0.01, it is more efficient to use :func:`countDistinct` @@ -346,7 +347,8 @@ def coalesce(*cols): @since(1.6) def corr(col1, col2): - """Returns a new :class:`Column` for the Pearson Correlation Coefficient for ``col1`` and ``col2``. + """Returns a new :class:`Column` for the Pearson Correlation Coefficient for ``col1`` + and ``col2``. 
>>> a = range(20) >>> b = [2 * x for x in range(20)] @@ -1281,9 +1283,18 @@ def unix_timestamp(timestamp=None, format='yyyy-MM-dd HH:mm:ss'): @since(1.5) def from_utc_timestamp(timestamp, tz): """ - Given a timestamp like '2017-07-14 02:40:00.0', interprets it as a time in UTC, and renders - that time as a timestamp in the given time zone. For example, 'GMT+1' would yield - '2017-07-14 03:40:00.0'. + This is a common function for databases supporting TIMESTAMP WITHOUT TIMEZONE. This function + takes a timestamp which is timezone-agnostic, and interprets it as a timestamp in UTC, and + renders that timestamp as a timestamp in the given time zone. + + However, timestamp in Spark represents number of microseconds from the Unix epoch, which is not + timezone-agnostic. So in Spark this function just shift the timestamp value from UTC timezone to + the given timezone. + + This function may return confusing result if the input is a string with timezone, e.g. + '2018-03-13T06:18:23+00:00'. The reason is that, Spark firstly cast the string to timestamp + according to the timezone in the string, and finally display the result by converting the + timestamp to string according to the session local timezone. :param timestamp: the column that contains timestamps :param tz: a string that has the ID of timezone, e.g. "GMT", "America/Los_Angeles", etc @@ -1306,9 +1317,18 @@ def from_utc_timestamp(timestamp, tz): @since(1.5) def to_utc_timestamp(timestamp, tz): """ - Given a timestamp like '2017-07-14 02:40:00.0', interprets it as a time in the given time - zone, and renders that time as a timestamp in UTC. For example, 'GMT+1' would yield - '2017-07-14 01:40:00.0'. + This is a common function for databases supporting TIMESTAMP WITHOUT TIMEZONE. This function + takes a timestamp which is timezone-agnostic, and interprets it as a timestamp in the given + timezone, and renders that timestamp as a timestamp in UTC. + + However, timestamp in Spark represents number of microseconds from the Unix epoch, which is not + timezone-agnostic. So in Spark this function just shift the timestamp value from the given + timezone to UTC timezone. + + This function may return confusing result if the input is a string with timezone, e.g. + '2018-03-13T06:18:23+00:00'. The reason is that, Spark firstly cast the string to timestamp + according to the timezone in the string, and finally display the result by converting the + timestamp to string according to the session local timezone. :param timestamp: the column that contains timestamps :param tz: a string that has the ID of timezone, e.g. "GMT", "America/Los_Angeles", etc @@ -1671,31 +1691,45 @@ def repeat(col, n): @since(1.5) @ignore_unicode_prefix -def split(str, pattern): +def split(str, pattern, limit=-1): """ - Splits str around pattern (pattern is a regular expression). + Splits str around matches of the given pattern. - .. note:: pattern is a string represent the regular expression. + :param str: a string expression to split + :param pattern: a string representing a regular expression. The regex string should be + a Java regular expression. + :param limit: an integer which controls the number of times `pattern` is applied. - >>> df = spark.createDataFrame([('ab12cd',)], ['s',]) - >>> df.select(split(df.s, '[0-9]+').alias('s')).collect() - [Row(s=[u'ab', u'cd'])] + * ``limit > 0``: The resulting array's length will not be more than `limit`, and the + resulting array's last entry will contain all input beyond the last + matched pattern. 
+ * ``limit <= 0``: `pattern` will be applied as many times as possible, and the resulting + array can be of any size. + + .. versionchanged:: 3.0 + `split` now takes an optional `limit` field. If not provided, default limit value is -1. + + >>> df = spark.createDataFrame([('oneAtwoBthreeC',)], ['s',]) + >>> df.select(split(df.s, '[ABC]', 2).alias('s')).collect() + [Row(s=[u'one', u'twoBthreeC'])] + >>> df.select(split(df.s, '[ABC]', -1).alias('s')).collect() + [Row(s=[u'one', u'two', u'three', u''])] """ sc = SparkContext._active_spark_context - return Column(sc._jvm.functions.split(_to_java_column(str), pattern)) + return Column(sc._jvm.functions.split(_to_java_column(str), pattern, limit)) @ignore_unicode_prefix @since(1.5) def regexp_extract(str, pattern, idx): - """Extract a specific group matched by a Java regex, from the specified string column. + r"""Extract a specific group matched by a Java regex, from the specified string column. If the regex did not match, or the specified group did not match, an empty string is returned. >>> df = spark.createDataFrame([('100-200',)], ['str']) - >>> df.select(regexp_extract('str', '(\d+)-(\d+)', 1).alias('d')).collect() + >>> df.select(regexp_extract('str', r'(\d+)-(\d+)', 1).alias('d')).collect() [Row(d=u'100')] >>> df = spark.createDataFrame([('foo',)], ['str']) - >>> df.select(regexp_extract('str', '(\d+)', 1).alias('d')).collect() + >>> df.select(regexp_extract('str', r'(\d+)', 1).alias('d')).collect() [Row(d=u'')] >>> df = spark.createDataFrame([('aaaac',)], ['str']) >>> df.select(regexp_extract('str', '(a+)(b)?(c)', 2).alias('d')).collect() @@ -1709,10 +1743,10 @@ def regexp_extract(str, pattern, idx): @ignore_unicode_prefix @since(1.5) def regexp_replace(str, pattern, replacement): - """Replace all substrings of the specified string value that match regexp with rep. + r"""Replace all substrings of the specified string value that match regexp with rep. >>> df = spark.createDataFrame([('100-200',)], ['str']) - >>> df.select(regexp_replace('str', '(\\d+)', '--').alias('d')).collect() + >>> df.select(regexp_replace('str', r'(\d+)', '--').alias('d')).collect() [Row(d=u'-----')] """ sc = SparkContext._active_spark_context @@ -2293,7 +2327,9 @@ def to_json(col, options={}): into a JSON string. Throws an exception, in the case of an unsupported type. :param col: name of column containing a struct, an array or a map. - :param options: options to control converting. accepts the same options as the JSON datasource + :param options: options to control converting. accepts the same options as the JSON datasource. + Additionally the function supports the `pretty` option which enables + pretty JSON generation. >>> from pyspark.sql import Row >>> from pyspark.sql.types import * @@ -2326,11 +2362,15 @@ def to_json(col, options={}): @ignore_unicode_prefix @since(2.4) -def schema_of_json(col): +def schema_of_json(col, options={}): """ Parses a column containing a JSON string and infers its schema in DDL format. :param col: string column in json format + :param options: options to control parsing. accepts the same options as the JSON datasource + + .. versionchanged:: 3.0 + It accepts `options` parameter to control schema inferring. 
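The `pretty` option documented for `to_json` above has no doctest in this patch; a hedged usage sketch (assuming the option is passed as a string-valued entry of the options dict, like other datasource options):

    from pyspark.sql import Row, SparkSession
    from pyspark.sql.functions import struct, to_json

    spark = SparkSession.builder.master("local[1]").appName("to_json_pretty_sketch").getOrCreate()
    df = spark.createDataFrame([Row(a=1, b="x")])
    # Emits indented, multi-line JSON instead of the default compact form.
    df.select(to_json(struct("a", "b"), {"pretty": "true"}).alias("json")).show(truncate=False)
    spark.stop()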
>>> from pyspark.sql.types import * >>> data = [(1, '{"a": 1}')] @@ -2339,10 +2379,13 @@ def schema_of_json(col): [Row(json=u'struct')] >>> df.select(schema_of_json(lit('{"a": 0}')).alias("json")).collect() [Row(json=u'struct')] + >>> schema = schema_of_json(lit('{a: 1}'), {'allowUnquotedFieldNames':'true'}) + >>> df.select(schema.alias("json")).collect() + [Row(json=u'struct')] """ sc = SparkContext._active_spark_context - jc = sc._jvm.functions.schema_of_json(_to_java_column(col)) + jc = sc._jvm.functions.schema_of_json(_to_java_column(col), options) return Column(jc) @@ -2690,6 +2733,39 @@ def udf(f=None, returnType=StringType()): | 8| JOHN DOE| 22| +----------+--------------+------------+ """ + + # The following table shows most of Python data and SQL type conversions in normal UDFs that + # are not yet visible to the user. Some of behaviors are buggy and might be changed in the near + # future. The table might have to be eventually documented externally. + # Please see SPARK-25666's PR to see the codes in order to generate the table below. + # + # +-----------------------------+--------------+----------+------+-------+---------------+---------------+--------------------+-----------------------------+----------+----------------------+---------+--------------------+-----------------+------------+--------------+------------------+----------------------+ # noqa + # |SQL Type \ Python Value(Type)|None(NoneType)|True(bool)|1(int)|1(long)| a(str)| a(unicode)| 1970-01-01(date)|1970-01-01 00:00:00(datetime)|1.0(float)|array('i', [1])(array)|[1](list)| (1,)(tuple)| ABC(bytearray)| 1(Decimal)|{'a': 1}(dict)|Row(kwargs=1)(Row)|Row(namedtuple=1)(Row)| # noqa + # +-----------------------------+--------------+----------+------+-------+---------------+---------------+--------------------+-----------------------------+----------+----------------------+---------+--------------------+-----------------+------------+--------------+------------------+----------------------+ # noqa + # | boolean| None| True| None| None| None| None| None| None| None| None| None| None| None| None| None| X| X| # noqa + # | tinyint| None| None| 1| 1| None| None| None| None| None| None| None| None| None| None| None| X| X| # noqa + # | smallint| None| None| 1| 1| None| None| None| None| None| None| None| None| None| None| None| X| X| # noqa + # | int| None| None| 1| 1| None| None| None| None| None| None| None| None| None| None| None| X| X| # noqa + # | bigint| None| None| 1| 1| None| None| None| None| None| None| None| None| None| None| None| X| X| # noqa + # | string| None| u'true'| u'1'| u'1'| u'a'| u'a'|u'java.util.Grego...| u'java.util.Grego...| u'1.0'| u'[I@24a83055'| u'[1]'|u'[Ljava.lang.Obj...| u'[B@49093632'| u'1'| u'{a=1}'| X| X| # noqa + # | date| None| X| X| X| X| X|datetime.date(197...| datetime.date(197...| X| X| X| X| X| X| X| X| X| # noqa + # | timestamp| None| X| X| X| X| X| X| datetime.datetime...| X| X| X| X| X| X| X| X| X| # noqa + # | float| None| None| None| None| None| None| None| None| 1.0| None| None| None| None| None| None| X| X| # noqa + # | double| None| None| None| None| None| None| None| None| 1.0| None| None| None| None| None| None| X| X| # noqa + # | array| None| None| None| None| None| None| None| None| None| [1]| [1]| [1]| [65, 66, 67]| None| None| X| X| # noqa + # | binary| None| None| None| None|bytearray(b'a')|bytearray(b'a')| None| None| None| None| None| None|bytearray(b'ABC')| None| None| X| X| # noqa + # | decimal(10,0)| None| None| None| None| None| None| None| None| None| None| None| 
None| None|Decimal('1')| None| X| X| # noqa + # | map| None| None| None| None| None| None| None| None| None| None| None| None| None| None| {u'a': 1}| X| X| # noqa + # | struct<_1:int>| None| X| X| X| X| X| X| X| X| X|Row(_1=1)| Row(_1=1)| X| X| Row(_1=None)| Row(_1=1)| Row(_1=1)| # noqa + # +-----------------------------+--------------+----------+------+-------+---------------+---------------+--------------------+-----------------------------+----------+----------------------+---------+--------------------+-----------------+------------+--------------+------------------+----------------------+ # noqa + # + # Note: DDL formatted string is used for 'SQL Type' for simplicity. This string can be + # used in `returnType`. + # Note: The values inside of the table are generated by `repr`. + # Note: Python 2 is used to generate this table since it is used to check the backward + # compatibility often in practice. + # Note: 'X' means it throws an exception during the conversion. + # decorator @udf, @udf(), @udf(dataType()) if f is None or isinstance(f, (str, DataType)): # If DataType has been passed as a positional argument @@ -2720,9 +2796,10 @@ def pandas_udf(f=None, returnType=None, functionType=None): 1. SCALAR A scalar UDF defines a transformation: One or more `pandas.Series` -> A `pandas.Series`. - The returnType should be a primitive data type, e.g., :class:`DoubleType`. The length of the returned `pandas.Series` must be of the same as the input `pandas.Series`. + :class:`MapType`, :class:`StructType` are currently not supported as output types. + Scalar UDFs are used with :meth:`pyspark.sql.DataFrame.withColumn` and :meth:`pyspark.sql.DataFrame.select`. @@ -2783,14 +2860,14 @@ def pandas_udf(f=None, returnType=None, functionType=None): +---+-------------------+ Alternatively, the user can define a function that takes two arguments. - In this case, the grouping key will be passed as the first argument and the data will - be passed as the second argument. The grouping key will be passed as a tuple of numpy + In this case, the grouping key(s) will be passed as the first argument and the data will + be passed as the second argument. The grouping key(s) will be passed as a tuple of numpy data types, e.g., `numpy.int32` and `numpy.float64`. The data will still be passed in as a `pandas.DataFrame` containing all columns from the original Spark DataFrame. - This is useful when the user does not want to hardcode grouping key in the function. + This is useful when the user does not want to hardcode grouping key(s) in the function. - >>> from pyspark.sql.functions import pandas_udf, PandasUDFType >>> import pandas as pd # doctest: +SKIP + >>> from pyspark.sql.functions import pandas_udf, PandasUDFType >>> df = spark.createDataFrame( ... [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], ... ("id", "v")) # doctest: +SKIP @@ -2806,6 +2883,22 @@ def pandas_udf(f=None, returnType=None, functionType=None): | 1|1.5| | 2|6.0| +---+---+ + >>> @pandas_udf( + ... "id long, `ceil(v / 2)` long, v double", + ... PandasUDFType.GROUPED_MAP) # doctest: +SKIP + >>> def sum_udf(key, pdf): + ... # key is a tuple of two numpy.int64s, which is the values + ... # of 'id' and 'ceil(df.v / 2)' for the current group + ... return pd.DataFrame([key + (pdf.v.sum(),)]) + >>> df.groupby(df.id, ceil(df.v / 2)).apply(sum_udf).show() # doctest: +SKIP + +---+-----------+----+ + | id|ceil(v / 2)| v| + +---+-----------+----+ + | 2| 5|10.0| + | 1| 1| 3.0| + | 2| 3| 5.0| + | 2| 2| 3.0| + +---+-----------+----+ .. 
note:: If returning a new `pandas.DataFrame` constructed with a dictionary, it is recommended to explicitly index the columns by name to ensure the positions are correct, @@ -2888,6 +2981,12 @@ def pandas_udf(f=None, returnType=None, functionType=None): can fail on special rows, the workaround is to incorporate the condition into the functions. .. note:: The user-defined functions do not take keyword arguments on the calling side. + + .. note:: The data type of returned `pandas.Series` from the user-defined functions should be + matched with defined returnType (see :meth:`types.to_arrow_type` and + :meth:`types.from_arrow_type`). When there is mismatch between them, Spark might do + conversion on returned data. The conversion is not guaranteed to be correct and results + should be checked for accuracy by users. """ # decorator @pandas_udf(returnType, functionType) is_decorator = f is None or isinstance(f, (str, DataType)) diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 49f4e6b2ede1..690b13072244 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -349,8 +349,8 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non negativeInf=None, dateFormat=None, timestampFormat=None, maxColumns=None, maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None, columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None, - samplingRatio=None, enforceSchema=None): - """Loads a CSV file and returns the result as a :class:`DataFrame`. + samplingRatio=None, enforceSchema=None, emptyValue=None): + r"""Loads a CSV file and returns the result as a :class:`DataFrame`. This function will go through the input once to determine the input schema if ``inferSchema`` is enabled. To avoid going through the entire data once, disable @@ -444,6 +444,8 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non different, ``\0`` otherwise. :param samplingRatio: defines fraction of rows used for schema inferring. If None is set, it uses the default value, ``1.0``. + :param emptyValue: sets the string representation of an empty value. If None is set, it uses + the default value, empty string. >>> df = spark.read.csv('python/test_support/sql/ages.csv') >>> df.dtypes @@ -463,7 +465,7 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non maxMalformedLogPerPartition=maxMalformedLogPerPartition, mode=mode, columnNameOfCorruptRecord=columnNameOfCorruptRecord, multiLine=multiLine, charToEscapeQuoteEscaping=charToEscapeQuoteEscaping, samplingRatio=samplingRatio, - enforceSchema=enforceSchema) + enforceSchema=enforceSchema, emptyValue=emptyValue) if isinstance(path, basestring): path = [path] if type(path) == list: @@ -517,8 +519,8 @@ def jdbc(self, url, table, column=None, lowerBound=None, upperBound=None, numPar If both ``column`` and ``predicates`` are specified, ``column`` will be used. - .. note:: Don't create too many partitions in parallel on a large cluster; \ - otherwise Spark might crash your external database systems. + .. note:: Don't create too many partitions in parallel on a large cluster; + otherwise Spark might crash your external database systems. 
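For context on the note above, a sketch of the partitioned JDBC read it is warning about, with a deliberately modest `numPartitions` (connection details are placeholders, and `spark` is assumed to be an active SparkSession):

    df = spark.read.jdbc(
        url="jdbc:postgresql://db-host:5432/sales",   # hypothetical database
        table="orders",
        column="order_id",        # numeric column used to split the scan into ranges
        lowerBound=1,
        upperBound=1000000,
        numPartitions=8,          # keep this small enough not to overwhelm the database
        properties={"user": "spark", "password": "..."})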
:param url: a JDBC URL of the form ``jdbc:subprotocol:subname`` :param table: the name of the table @@ -859,8 +861,8 @@ def text(self, path, compression=None, lineSep=None): def csv(self, path, mode=None, compression=None, sep=None, quote=None, escape=None, header=None, nullValue=None, escapeQuotes=None, quoteAll=None, dateFormat=None, timestampFormat=None, ignoreLeadingWhiteSpace=None, ignoreTrailingWhiteSpace=None, - charToEscapeQuoteEscaping=None, encoding=None): - """Saves the content of the :class:`DataFrame` in CSV format at the specified path. + charToEscapeQuoteEscaping=None, encoding=None, emptyValue=None): + r"""Saves the content of the :class:`DataFrame` in CSV format at the specified path. :param path: the path in any Hadoop supported file system :param mode: specifies the behavior of the save operation when data already exists. @@ -911,6 +913,8 @@ def csv(self, path, mode=None, compression=None, sep=None, quote=None, escape=No different, ``\0`` otherwise.. :param encoding: sets the encoding (charset) of saved csv files. If None is set, the default UTF-8 charset will be used. + :param emptyValue: sets the string representation of an empty value. If None is set, it uses + the default value, ``""``. >>> df.write.csv(os.path.join(tempfile.mkdtemp(), 'data')) """ @@ -921,7 +925,7 @@ def csv(self, path, mode=None, compression=None, sep=None, quote=None, escape=No ignoreLeadingWhiteSpace=ignoreLeadingWhiteSpace, ignoreTrailingWhiteSpace=ignoreTrailingWhiteSpace, charToEscapeQuoteEscaping=charToEscapeQuoteEscaping, - encoding=encoding) + encoding=encoding, emptyValue=emptyValue) self._jwrite.csv(path) @since(1.5) @@ -958,8 +962,8 @@ def orc(self, path, mode=None, partitionBy=None, compression=None): def jdbc(self, url, table, mode=None, properties=None): """Saves the content of the :class:`DataFrame` to an external database table via JDBC. - .. note:: Don't create too many partitions in parallel on a large cluster; \ - otherwise Spark might crash your external database systems. + .. note:: Don't create too many partitions in parallel on a large cluster; + otherwise Spark might crash your external database systems. :param url: a JDBC URL of the form ``jdbc:subprotocol:subname`` :param table: Name of the table in the external database. diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py index 87d8d6a59a6e..079af8c05705 100644 --- a/python/pyspark/sql/session.py +++ b/python/pyspark/sql/session.py @@ -83,6 +83,7 @@ class Builder(object): _lock = RLock() _options = {} + _sc = None @since(2.0) def config(self, key=None, value=None, conf=None): @@ -139,6 +140,11 @@ def enableHiveSupport(self): """ return self.config("spark.sql.catalogImplementation", "hive") + def _sparkContext(self, sc): + with self._lock: + self._sc = sc + return self + @since(2.0) def getOrCreate(self): """Gets an existing :class:`SparkSession` or, if there is no existing one, creates a @@ -150,7 +156,7 @@ def getOrCreate(self): default. 
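The getOrCreate() change below no longer copies session options back into an existing SparkContext's SparkConf. A sketch of the resulting behaviour, mirroring the new SparkSessionBuilderTests later in this diff (key names are illustrative):

    from pyspark import SparkConf, SparkContext
    from pyspark.sql import SparkSession

    conf = SparkConf().set("key1", "value1")
    sc = SparkContext("local[4]", "builder_sketch", conf=conf)
    spark = SparkSession.builder.config("key2", "value2").getOrCreate()

    assert spark.sparkContext == sc                 # the existing context is reused
    assert spark.conf.get("key2") == "value2"       # applied at the session level only
    assert not sc.getConf().contains("key2")        # the shared SparkConf is left untouched
    spark.stop()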
>>> s1 = SparkSession.builder.config("k1", "v1").getOrCreate() - >>> s1.conf.get("k1") == s1.sparkContext.getConf().get("k1") == "v1" + >>> s1.conf.get("k1") == "v1" True In case an existing SparkSession is returned, the config options specified @@ -167,22 +173,19 @@ def getOrCreate(self): from pyspark.conf import SparkConf session = SparkSession._instantiatedSession if session is None or session._sc._jsc is None: - sparkConf = SparkConf() - for key, value in self._options.items(): - sparkConf.set(key, value) - sc = SparkContext.getOrCreate(sparkConf) - # This SparkContext may be an existing one. - for key, value in self._options.items(): - # we need to propagate the confs - # before we create the SparkSession. Otherwise, confs like - # warehouse path and metastore url will not be set correctly ( - # these confs cannot be changed once the SparkSession is created). - sc._conf.set(key, value) + if self._sc is not None: + sc = self._sc + else: + sparkConf = SparkConf() + for key, value in self._options.items(): + sparkConf.set(key, value) + # This SparkContext may be an existing one. + sc = SparkContext.getOrCreate(sparkConf) + # Do not update `SparkConf` for existing `SparkContext`, as it's shared + # by all sessions. session = SparkSession(sc) for key, value in self._options.items(): session._jsparkSession.sessionState().conf().setConfString(key, value) - for key, value in self._options.items(): - session.sparkContext._conf.set(key, value) return session builder = Builder() @@ -539,12 +542,18 @@ def _create_from_pandas_with_arrow(self, pdf, schema, timezone): struct.names[i] = name schema = struct + jsqlContext = self._wrapped._jsqlContext + def reader_func(temp_filename): - return self._jvm.PythonSQLUtils.arrowReadStreamFromFile( - self._wrapped._jsqlContext, temp_filename, schema.json()) + return self._jvm.PythonSQLUtils.readArrowStreamFromFile(jsqlContext, temp_filename) + + def create_RDD_server(): + return self._jvm.ArrowRDDServer(jsqlContext) # Create Spark DataFrame from Arrow stream file, using one batch per partition - jdf = self._sc._serialize_to_jvm(batches, ArrowStreamSerializer(), reader_func) + jrdd = self._sc._serialize_to_jvm(batches, ArrowStreamSerializer(), reader_func, + create_RDD_server) + jdf = self._jvm.PythonSQLUtils.toDataFrame(jrdd, schema.json(), jsqlContext) df = DataFrame(jdf, self._wrapped) df._schema = schema return df diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py index ee13778a7dcd..b18453b2a4f9 100644 --- a/python/pyspark/sql/streaming.py +++ b/python/pyspark/sql/streaming.py @@ -564,8 +564,8 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non negativeInf=None, dateFormat=None, timestampFormat=None, maxColumns=None, maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None, columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None, - enforceSchema=None): - """Loads a CSV file stream and returns the result as a :class:`DataFrame`. + enforceSchema=None, emptyValue=None): + r"""Loads a CSV file stream and returns the result as a :class:`DataFrame`. This function will go through the input once to determine the input schema if ``inferSchema`` is enabled. To avoid going through the entire data once, disable @@ -658,6 +658,8 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non the quote character. If None is set, the default value is escape character when escape and quote characters are different, ``\0`` otherwise.. 
+ :param emptyValue: sets the string representation of an empty value. If None is set, it uses + the default value, empty string. >>> csv_sdf = spark.readStream.csv(tempfile.mkdtemp(), schema = sdf_schema) >>> csv_sdf.isStreaming @@ -674,7 +676,8 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non maxCharsPerColumn=maxCharsPerColumn, maxMalformedLogPerPartition=maxMalformedLogPerPartition, mode=mode, columnNameOfCorruptRecord=columnNameOfCorruptRecord, multiLine=multiLine, - charToEscapeQuoteEscaping=charToEscapeQuoteEscaping, enforceSchema=enforceSchema) + charToEscapeQuoteEscaping=charToEscapeQuoteEscaping, enforceSchema=enforceSchema, + emptyValue=emptyValue) if isinstance(path, basestring): return self._df(self._jreader.csv(path)) else: diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 81c0af0b3d81..85712df5f2ad 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -26,6 +26,7 @@ import pydoc import shutil import tempfile +import threading import pickle import functools import time @@ -79,7 +80,7 @@ _have_pyarrow = _pyarrow_requirement_message is None _test_compiled = _test_not_compiled_message is None -from pyspark import SparkContext +from pyspark import SparkConf, SparkContext from pyspark.sql import SparkSession, SQLContext, HiveContext, Column, Row from pyspark.sql.types import * from pyspark.sql.types import UserDefinedType, _infer_type, _make_type_verifier @@ -228,12 +229,12 @@ def sql_conf(self, pairs): class ReusedSQLTestCase(ReusedPySparkTestCase, SQLTestUtils): @classmethod def setUpClass(cls): - ReusedPySparkTestCase.setUpClass() + super(ReusedSQLTestCase, cls).setUpClass() cls.spark = SparkSession(cls.sc) @classmethod def tearDownClass(cls): - ReusedPySparkTestCase.tearDownClass() + super(ReusedSQLTestCase, cls).tearDownClass() cls.spark.stop() def assertPandasEqual(self, expected, result): @@ -277,6 +278,54 @@ def test_struct_field_type_name(self): struct_field = StructField("a", IntegerType()) self.assertRaises(TypeError, struct_field.typeName) + def test_invalid_create_row(self): + row_class = Row("c1", "c2") + self.assertRaises(ValueError, lambda: row_class(1, 2, 3)) + + +class SparkSessionBuilderTests(unittest.TestCase): + + def test_create_spark_context_first_then_spark_session(self): + sc = None + session = None + try: + conf = SparkConf().set("key1", "value1") + sc = SparkContext('local[4]', "SessionBuilderTests", conf=conf) + session = SparkSession.builder.config("key2", "value2").getOrCreate() + + self.assertEqual(session.conf.get("key1"), "value1") + self.assertEqual(session.conf.get("key2"), "value2") + self.assertEqual(session.sparkContext, sc) + + self.assertFalse(sc.getConf().contains("key2")) + self.assertEqual(sc.getConf().get("key1"), "value1") + finally: + if session is not None: + session.stop() + if sc is not None: + sc.stop() + + def test_another_spark_session(self): + session1 = None + session2 = None + try: + session1 = SparkSession.builder.config("key1", "value1").getOrCreate() + session2 = SparkSession.builder.config("key2", "value2").getOrCreate() + + self.assertEqual(session1.conf.get("key1"), "value1") + self.assertEqual(session2.conf.get("key1"), "value1") + self.assertEqual(session1.conf.get("key2"), "value2") + self.assertEqual(session2.conf.get("key2"), "value2") + self.assertEqual(session1.sparkContext, session2.sparkContext) + + self.assertEqual(session1.sparkContext.getConf().get("key1"), "value1") + 
self.assertFalse(session1.sparkContext.getConf().contains("key2")) + finally: + if session1 is not None: + session1.stop() + if session2 is not None: + session2.stop() + class SQLTests(ReusedSQLTestCase): @@ -547,6 +596,70 @@ def test_udf_in_filter_on_top_of_join(self): df = left.crossJoin(right).filter(f("a", "b")) self.assertEqual(df.collect(), [Row(a=1, b=1)]) + def test_udf_in_join_condition(self): + # regression test for SPARK-25314 + from pyspark.sql.functions import udf + left = self.spark.createDataFrame([Row(a=1)]) + right = self.spark.createDataFrame([Row(b=1)]) + f = udf(lambda a, b: a == b, BooleanType()) + df = left.join(right, f("a", "b")) + with self.assertRaisesRegexp(AnalysisException, 'Detected implicit cartesian product'): + df.collect() + with self.sql_conf({"spark.sql.crossJoin.enabled": True}): + self.assertEqual(df.collect(), [Row(a=1, b=1)]) + + def test_udf_in_left_semi_join_condition(self): + # regression test for SPARK-25314 + from pyspark.sql.functions import udf + left = self.spark.createDataFrame([Row(a=1, a1=1, a2=1), Row(a=2, a1=2, a2=2)]) + right = self.spark.createDataFrame([Row(b=1, b1=1, b2=1)]) + f = udf(lambda a, b: a == b, BooleanType()) + df = left.join(right, f("a", "b"), "leftsemi") + with self.assertRaisesRegexp(AnalysisException, 'Detected implicit cartesian product'): + df.collect() + with self.sql_conf({"spark.sql.crossJoin.enabled": True}): + self.assertEqual(df.collect(), [Row(a=1, a1=1, a2=1)]) + + def test_udf_and_common_filter_in_join_condition(self): + # regression test for SPARK-25314 + # test the complex scenario with both udf and common filter + from pyspark.sql.functions import udf + left = self.spark.createDataFrame([Row(a=1, a1=1, a2=1), Row(a=2, a1=2, a2=2)]) + right = self.spark.createDataFrame([Row(b=1, b1=1, b2=1), Row(b=1, b1=3, b2=1)]) + f = udf(lambda a, b: a == b, BooleanType()) + df = left.join(right, [f("a", "b"), left.a1 == right.b1]) + # do not need spark.sql.crossJoin.enabled=true for udf is not the only join condition. + self.assertEqual(df.collect(), [Row(a=1, a1=1, a2=1, b=1, b1=1, b2=1)]) + + def test_udf_and_common_filter_in_left_semi_join_condition(self): + # regression test for SPARK-25314 + # test the complex scenario with both udf and common filter + from pyspark.sql.functions import udf + left = self.spark.createDataFrame([Row(a=1, a1=1, a2=1), Row(a=2, a1=2, a2=2)]) + right = self.spark.createDataFrame([Row(b=1, b1=1, b2=1), Row(b=1, b1=3, b2=1)]) + f = udf(lambda a, b: a == b, BooleanType()) + df = left.join(right, [f("a", "b"), left.a1 == right.b1], "left_semi") + # do not need spark.sql.crossJoin.enabled=true for udf is not the only join condition. + self.assertEqual(df.collect(), [Row(a=1, a1=1, a2=1)]) + + def test_udf_not_supported_in_join_condition(self): + # regression test for SPARK-25314 + # test python udf is not supported in join type besides left_semi and inner join. + from pyspark.sql.functions import udf + left = self.spark.createDataFrame([Row(a=1, a1=1, a2=1), Row(a=2, a1=2, a2=2)]) + right = self.spark.createDataFrame([Row(b=1, b1=1, b2=1), Row(b=1, b1=3, b2=1)]) + f = udf(lambda a, b: a == b, BooleanType()) + + def runWithJoinType(join_type, type_string): + with self.assertRaisesRegexp( + AnalysisException, + 'Using PythonUDF.*%s is not supported.' 
% type_string): + left.join(right, [f("a", "b"), left.a1 == right.b1], join_type).collect() + runWithJoinType("full", "FullOuter") + runWithJoinType("left", "LeftOuter") + runWithJoinType("right", "RightOuter") + runWithJoinType("leftanti", "LeftAnti") + def test_udf_without_arguments(self): self.spark.catalog.registerFunction("foo", lambda: "bar") [row] = self.spark.sql("SELECT foo()").collect() @@ -1036,6 +1149,75 @@ def test_infer_schema(self): result = self.spark.sql("SELECT l[0].a from test2 where d['key'].d = '2'") self.assertEqual(1, result.head()[0]) + def test_infer_schema_specification(self): + from decimal import Decimal + + class A(object): + def __init__(self): + self.a = 1 + + data = [ + True, + 1, + "a", + u"a", + datetime.date(1970, 1, 1), + datetime.datetime(1970, 1, 1, 0, 0), + 1.0, + array.array("d", [1]), + [1], + (1, ), + {"a": 1}, + bytearray(1), + Decimal(1), + Row(a=1), + Row("a")(1), + A(), + ] + + df = self.spark.createDataFrame([data]) + actual = list(map(lambda x: x.dataType.simpleString(), df.schema)) + expected = [ + 'boolean', + 'bigint', + 'string', + 'string', + 'date', + 'timestamp', + 'double', + 'array', + 'array', + 'struct<_1:bigint>', + 'map', + 'binary', + 'decimal(38,18)', + 'struct', + 'struct', + 'struct', + ] + self.assertEqual(actual, expected) + + actual = list(df.first()) + expected = [ + True, + 1, + 'a', + u"a", + datetime.date(1970, 1, 1), + datetime.datetime(1970, 1, 1, 0, 0), + 1.0, + [1.0], + [1], + Row(_1=1), + {"a": 1}, + bytearray(b'\x00'), + Decimal('1.000000000000000000'), + Row(a=1), + Row(a=1), + Row(a=1), + ] + self.assertEqual(actual, expected) + def test_infer_schema_not_enough_names(self): df = self.spark.createDataFrame([["a", "b"]], ["col1"]) self.assertEqual(df.columns, ['col1', '_2']) @@ -1163,7 +1345,7 @@ def test_simple_udt_in_df(self): df = self.spark.createDataFrame( [(i % 3, PythonOnlyPoint(float(i), float(i))) for i in range(10)], schema=schema) - df.show() + df.collect() def test_nested_udt_in_df(self): schema = StructType().add("key", LongType()).add("val", ArrayType(PythonOnlyUDT())) @@ -1493,8 +1675,7 @@ def test_array_contains_function(self): from pyspark.sql.functions import array_contains df = self.spark.createDataFrame([(["1", "2", "3"],), ([],)], ['data']) - actual = df.select(array_contains(df.data, 1).alias('b')).collect() - # The value argument can be implicitly castable to the element's type of the array. + actual = df.select(array_contains(df.data, "1").alias('b')).collect() self.assertEqual([Row(b=True), Row(b=False)], actual) def test_between_function(self): @@ -1957,6 +2138,9 @@ def __getstate__(self): def __setstate__(self, state): self.open_events_dir, self.process_events_dir, self.close_events_dir = state + # Those foreach tests are failed in Python 3.6 and macOS High Sierra by defined rules + # at http://sealiesoftware.com/blog/archive/2017/6/5/Objective-C_and_fork_in_macOS_1013.html + # To work around this, OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES. 
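The macOS workaround mentioned in the comment above has to be in place before any worker processes fork; one way to apply it from Python (equivalent to exporting the variable in the shell that launches the tests):

    import os

    # Work around Objective-C fork-safety checks on macOS High Sierra and later.
    os.environ.setdefault("OBJC_DISABLE_INITIALIZE_FORK_SAFETY", "YES")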
def test_streaming_foreach_with_simple_function(self): tester = self.ForeachWriterTester(self.spark) @@ -3261,7 +3445,7 @@ def test_create_dataframe_from_pandas_with_timestamp(self): import pandas as pd from datetime import datetime pdf = pd.DataFrame({"ts": [datetime(2017, 10, 31, 1, 1, 1)], - "d": [pd.Timestamp.now().date()]}) + "d": [pd.Timestamp.now().date()]}, columns=["d", "ts"]) # test types are inferred correctly without specifying schema df = self.spark.createDataFrame(pdf) self.assertTrue(isinstance(df.schema['ts'].dataType, TimestampType)) @@ -3488,6 +3672,31 @@ def test_repr_behaviors(self): self.assertEquals(None, df._repr_html_()) self.assertEquals(expected, df.__repr__()) + # SPARK-25591 + def test_same_accumulator_in_udfs(self): + from pyspark.sql.functions import udf + + data_schema = StructType([StructField("a", IntegerType(), True), + StructField("b", IntegerType(), True)]) + data = self.spark.createDataFrame([[1, 2]], schema=data_schema) + + test_accum = self.sc.accumulator(0) + + def first_udf(x): + test_accum.add(1) + return x + + def second_udf(x): + test_accum.add(100) + return x + + func_udf = udf(first_udf, IntegerType()) + func_udf2 = udf(second_udf, IntegerType()) + data = data.withColumn("out1", func_udf(data["a"])) + data = data.withColumn("out2", func_udf2(data["b"])) + data.collect() + self.assertEqual(test_accum.value, 101) + class HiveSparkSubmitTests(SparkSubmitTests): @@ -4101,7 +4310,8 @@ def setUpClass(cls): from decimal import Decimal from distutils.version import LooseVersion import pyarrow as pa - ReusedSQLTestCase.setUpClass() + super(ArrowTests, cls).setUpClass() + cls.warnings_lock = threading.Lock() # Synchronize default timezone between Python and Java cls.tz_prev = os.environ.get("TZ", None) # save current tz if set @@ -4142,7 +4352,7 @@ def tearDownClass(cls): if cls.tz_prev is not None: os.environ["TZ"] = cls.tz_prev time.tzset() - ReusedSQLTestCase.tearDownClass() + super(ArrowTests, cls).tearDownClass() def create_pandas_data_frame(self): import pandas as pd @@ -4162,15 +4372,18 @@ def test_toPandas_fallback_enabled(self): schema = StructType([StructField("map", MapType(StringType(), IntegerType()), True)]) df = self.spark.createDataFrame([({u'a': 1},)], schema=schema) with QuietTest(self.sc): - with warnings.catch_warnings(record=True) as warns: - pdf = df.toPandas() - # Catch and check the last UserWarning. - user_warns = [ - warn.message for warn in warns if isinstance(warn.message, UserWarning)] - self.assertTrue(len(user_warns) > 0) - self.assertTrue( - "Attempting non-optimization" in _exception_message(user_warns[-1])) - self.assertPandasEqual(pdf, pd.DataFrame({u'map': [{u'a': 1}]})) + with self.warnings_lock: + with warnings.catch_warnings(record=True) as warns: + # we want the warnings to appear even if this test is run from a subclass + warnings.simplefilter("always") + pdf = df.toPandas() + # Catch and check the last UserWarning. 
+ user_warns = [ + warn.message for warn in warns if isinstance(warn.message, UserWarning)] + self.assertTrue(len(user_warns) > 0) + self.assertTrue( + "Attempting non-optimization" in _exception_message(user_warns[-1])) + self.assertPandasEqual(pdf, pd.DataFrame({u'map': [{u'a': 1}]})) def test_toPandas_fallback_disabled(self): from distutils.version import LooseVersion @@ -4179,8 +4392,9 @@ def test_toPandas_fallback_disabled(self): schema = StructType([StructField("map", MapType(StringType(), IntegerType()), True)]) df = self.spark.createDataFrame([(None,)], schema=schema) with QuietTest(self.sc): - with self.assertRaisesRegexp(Exception, 'Unsupported type'): - df.toPandas() + with self.warnings_lock: + with self.assertRaisesRegexp(Exception, 'Unsupported type'): + df.toPandas() # TODO: remove BinaryType check once minimum pyarrow version is 0.10.0 if LooseVersion(pa.__version__) < LooseVersion("0.10.0"): @@ -4392,6 +4606,8 @@ def test_createDataFrame_fallback_enabled(self): with QuietTest(self.sc): with self.sql_conf({"spark.sql.execution.arrow.fallback.enabled": True}): with warnings.catch_warnings(record=True) as warns: + # we want the warnings to appear even if this test is run from a subclass + warnings.simplefilter("always") df = self.spark.createDataFrame( pd.DataFrame([[{u'a': 1}]]), "a: map") # Catch and check the last UserWarning. @@ -4435,10 +4651,18 @@ def test_timestamp_dst(self): self.assertPandasEqual(pdf, df_from_pandas.toPandas()) +class EncryptionArrowTests(ArrowTests): + + @classmethod + def conf(cls): + return super(EncryptionArrowTests, cls).conf().set("spark.io.encryption.enabled", "true") + + @unittest.skipIf( not _have_pandas or not _have_pyarrow, _pandas_requirement_message or _pyarrow_requirement_message) class PandasUDFTests(ReusedSQLTestCase): + def test_pandas_udf_basic(self): from pyspark.rdd import PythonEvalType from pyspark.sql.functions import pandas_udf, PandasUDFType @@ -4654,6 +4878,24 @@ def random_udf(v): random_udf = random_udf.asNondeterministic() return random_udf + def test_pandas_udf_tokenize(self): + from pyspark.sql.functions import pandas_udf + tokenize = pandas_udf(lambda s: s.apply(lambda str: str.split(' ')), + ArrayType(StringType())) + self.assertEqual(tokenize.returnType, ArrayType(StringType())) + df = self.spark.createDataFrame([("hi boo",), ("bye boo",)], ["vals"]) + result = df.select(tokenize("vals").alias("hi")) + self.assertEqual([Row(hi=[u'hi', u'boo']), Row(hi=[u'bye', u'boo'])], result.collect()) + + def test_pandas_udf_nested_arrays(self): + from pyspark.sql.functions import pandas_udf + tokenize = pandas_udf(lambda s: s.apply(lambda str: [str.split(' ')]), + ArrayType(ArrayType(StringType()))) + self.assertEqual(tokenize.returnType, ArrayType(ArrayType(StringType()))) + df = self.spark.createDataFrame([("hi boo",), ("bye boo",)], ["vals"]) + result = df.select(tokenize("vals").alias("hi")) + self.assertEqual([Row(hi=[[u'hi', u'boo']]), Row(hi=[[u'bye', u'boo']])], result.collect()) + def test_vectorized_udf_basic(self): from pyspark.sql.functions import pandas_udf, col, array df = self.spark.range(10).select( @@ -5377,32 +5619,81 @@ def data(self): .withColumn("v", explode(col('vs'))).drop('vs') def test_supported_types(self): - from pyspark.sql.functions import pandas_udf, PandasUDFType, array, col - df = self.data.withColumn("arr", array(col("id"))) + from decimal import Decimal + from distutils.version import LooseVersion + import pyarrow as pa + from pyspark.sql.functions import pandas_udf, PandasUDFType - # Different 
forms of group map pandas UDF, results of these are the same + values = [ + 1, 2, 3, + 4, 5, 1.1, + 2.2, Decimal(1.123), + [1, 2, 2], True, 'hello' + ] + output_fields = [ + ('id', IntegerType()), ('byte', ByteType()), ('short', ShortType()), + ('int', IntegerType()), ('long', LongType()), ('float', FloatType()), + ('double', DoubleType()), ('decim', DecimalType(10, 3)), + ('array', ArrayType(IntegerType())), ('bool', BooleanType()), ('str', StringType()) + ] - output_schema = StructType( - [StructField('id', LongType()), - StructField('v', IntegerType()), - StructField('arr', ArrayType(LongType())), - StructField('v1', DoubleType()), - StructField('v2', LongType())]) + # TODO: Add BinaryType to variables above once minimum pyarrow version is 0.10.0 + if LooseVersion(pa.__version__) >= LooseVersion("0.10.0"): + values.append(bytearray([0x01, 0x02])) + output_fields.append(('bin', BinaryType())) + + output_schema = StructType([StructField(*x) for x in output_fields]) + df = self.spark.createDataFrame([values], schema=output_schema) + # Different forms of group map pandas UDF, results of these are the same udf1 = pandas_udf( - lambda pdf: pdf.assign(v1=pdf.v * pdf.id * 1.0, v2=pdf.v + pdf.id), + lambda pdf: pdf.assign( + byte=pdf.byte * 2, + short=pdf.short * 2, + int=pdf.int * 2, + long=pdf.long * 2, + float=pdf.float * 2, + double=pdf.double * 2, + decim=pdf.decim * 2, + bool=False if pdf.bool else True, + str=pdf.str + 'there', + array=pdf.array, + ), output_schema, PandasUDFType.GROUPED_MAP ) udf2 = pandas_udf( - lambda _, pdf: pdf.assign(v1=pdf.v * pdf.id * 1.0, v2=pdf.v + pdf.id), + lambda _, pdf: pdf.assign( + byte=pdf.byte * 2, + short=pdf.short * 2, + int=pdf.int * 2, + long=pdf.long * 2, + float=pdf.float * 2, + double=pdf.double * 2, + decim=pdf.decim * 2, + bool=False if pdf.bool else True, + str=pdf.str + 'there', + array=pdf.array, + ), output_schema, PandasUDFType.GROUPED_MAP ) udf3 = pandas_udf( - lambda key, pdf: pdf.assign(id=key[0], v1=pdf.v * pdf.id * 1.0, v2=pdf.v + pdf.id), + lambda key, pdf: pdf.assign( + id=key[0], + byte=pdf.byte * 2, + short=pdf.short * 2, + int=pdf.int * 2, + long=pdf.long * 2, + float=pdf.float * 2, + double=pdf.double * 2, + decim=pdf.decim * 2, + bool=False if pdf.bool else True, + str=pdf.str + 'there', + array=pdf.array, + ), output_schema, PandasUDFType.GROUPED_MAP ) @@ -5445,8 +5736,9 @@ def test_register_grouped_map_udf(self): foo_udf = pandas_udf(lambda x: x, "id long", PandasUDFType.GROUPED_MAP) with QuietTest(self.sc): - with self.assertRaisesRegexp(ValueError, 'f must be either SQL_BATCHED_UDF or ' - 'SQL_SCALAR_PANDAS_UDF'): + with self.assertRaisesRegexp( + ValueError, + 'f.*SQL_BATCHED_UDF.*SQL_SCALAR_PANDAS_UDF.*SQL_GROUPED_AGG_PANDAS_UDF.*'): self.spark.catalog.registerFunction("foo_udf", foo_udf) def test_decorator(self): @@ -5566,24 +5858,26 @@ def test_wrong_args(self): pandas_udf(lambda x, y: x, DoubleType(), PandasUDFType.SCALAR)) def test_unsupported_types(self): + from distutils.version import LooseVersion + import pyarrow as pa from pyspark.sql.functions import pandas_udf, PandasUDFType - schema = StructType( - [StructField("id", LongType(), True), - StructField("map", MapType(StringType(), IntegerType()), True)]) - with QuietTest(self.sc): - with self.assertRaisesRegexp( - NotImplementedError, - 'Invalid returnType.*grouped map Pandas UDF.*MapType'): - pandas_udf(lambda x: x, schema, PandasUDFType.GROUPED_MAP) - schema = StructType( - [StructField("id", LongType(), True), - StructField("arr_ts", ArrayType(TimestampType()), 
True)]) - with QuietTest(self.sc): - with self.assertRaisesRegexp( - NotImplementedError, - 'Invalid returnType.*grouped map Pandas UDF.*ArrayType.*TimestampType'): - pandas_udf(lambda x: x, schema, PandasUDFType.GROUPED_MAP) + common_err_msg = 'Invalid returnType.*grouped map Pandas UDF.*' + unsupported_types = [ + StructField('map', MapType(StringType(), IntegerType())), + StructField('arr_ts', ArrayType(TimestampType())), + StructField('null', NullType()), + ] + + # TODO: Remove this if-statement once minimum pyarrow version is 0.10.0 + if LooseVersion(pa.__version__) < LooseVersion("0.10.0"): + unsupported_types.append(StructField('bin', BinaryType())) + + for unsupported_type in unsupported_types: + schema = StructType([StructField('id', LongType(), True), unsupported_type]) + with QuietTest(self.sc): + with self.assertRaisesRegexp(NotImplementedError, common_err_msg): + pandas_udf(lambda x: x, schema, PandasUDFType.GROUPED_MAP) # Regression test for SPARK-23314 def test_timestamp_dst(self): @@ -5762,7 +6056,8 @@ def test_positional_assignment_conf(self): import pandas as pd from pyspark.sql.functions import pandas_udf, PandasUDFType - with self.sql_conf({"spark.sql.execution.pandas.groupedMap.assignColumnsByPosition": True}): + with self.sql_conf({ + "spark.sql.legacy.execution.pandas.groupedMap.assignColumnsByName": False}): @pandas_udf("a string, b float", PandasUDFType.GROUPED_MAP) def foo(_): @@ -6259,6 +6554,21 @@ def test_invalid_args(self): 'mixture.*aggregate function.*group aggregate pandas UDF'): df.groupby(df.id).agg(mean_udf(df.v), mean(df.v)).collect() + def test_register_vectorized_udf_basic(self): + from pyspark.sql.functions import pandas_udf + from pyspark.rdd import PythonEvalType + + sum_pandas_udf = pandas_udf( + lambda v: v.sum(), "integer", PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF) + + self.assertEqual(sum_pandas_udf.evalType, PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF) + group_agg_pandas_udf = self.spark.udf.register("sum_pandas_udf", sum_pandas_udf) + self.assertEqual(group_agg_pandas_udf.evalType, PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF) + q = "SELECT sum_pandas_udf(v1) FROM VALUES (3, 0), (2, 0), (1, 1) tbl(v1, v2) GROUP BY v2" + actual = sorted(map(lambda r: r[0], self.spark.sql(q).collect())) + expected = [1, 5] + self.assertEqual(actual, expected) + @unittest.skipIf( not _have_pandas or not _have_pyarrow, diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index 0b61707c8cc0..1d24c40e5858 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -752,7 +752,7 @@ def __eq__(self, other): for v in [ArrayType, MapType, StructType]) -_FIXED_DECIMAL = re.compile("decimal\\(\\s*(\\d+)\\s*,\\s*(\\d+)\\s*\\)") +_FIXED_DECIMAL = re.compile(r"decimal\(\s*(\d+)\s*,\s*(\d+)\s*\)") def _parse_datatype_string(s): @@ -1500,6 +1500,9 @@ def __contains__(self, item): # let object acts like class def __call__(self, *args): """create new Row object""" + if len(args) > len(self): + raise ValueError("Can not create Row with fields %s, expected %d values " + "but got %s" % (self, len(self), args)) return _create_row(self, args) def __getitem__(self, item): diff --git a/python/pyspark/sql/udf.py b/python/pyspark/sql/udf.py index 9dbe49b831ce..58f4e0dff5ee 100644 --- a/python/pyspark/sql/udf.py +++ b/python/pyspark/sql/udf.py @@ -298,6 +298,15 @@ def register(self, name, f, returnType=None): >>> spark.sql("SELECT add_one(id) FROM range(3)").collect() # doctest: +SKIP [Row(add_one(id)=1), Row(add_one(id)=2), Row(add_one(id)=3)] + >>> 
@pandas_udf("integer", PandasUDFType.GROUPED_AGG) # doctest: +SKIP + ... def sum_udf(v): + ... return v.sum() + ... + >>> _ = spark.udf.register("sum_udf", sum_udf) # doctest: +SKIP + >>> q = "SELECT sum_udf(v1) FROM VALUES (3, 0), (2, 0), (1, 1) tbl(v1, v2) GROUP BY v2" + >>> spark.sql(q).collect() # doctest: +SKIP + [Row(sum_udf(v1)=1), Row(sum_udf(v1)=5)] + .. note:: Registration for a user-defined function (case 2.) was added from Spark 2.3.0. """ @@ -310,9 +319,11 @@ def register(self, name, f, returnType=None): "Invalid returnType: data type can not be specified when f is" "a user-defined function, but got %s." % returnType) if f.evalType not in [PythonEvalType.SQL_BATCHED_UDF, - PythonEvalType.SQL_SCALAR_PANDAS_UDF]: + PythonEvalType.SQL_SCALAR_PANDAS_UDF, + PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF]: raise ValueError( - "Invalid f: f must be either SQL_BATCHED_UDF or SQL_SCALAR_PANDAS_UDF") + "Invalid f: f must be SQL_BATCHED_UDF, SQL_SCALAR_PANDAS_UDF or " + "SQL_GROUPED_AGG_PANDAS_UDF") register_udf = UserDefinedFunction(f.func, returnType=f.returnType, name=name, evalType=f.evalType, deterministic=f.deterministic) diff --git a/python/pyspark/storagelevel.py b/python/pyspark/storagelevel.py index ef012d27cb22..7f29646c0743 100644 --- a/python/pyspark/storagelevel.py +++ b/python/pyspark/storagelevel.py @@ -58,8 +58,8 @@ def __str__(self): StorageLevel.OFF_HEAP = StorageLevel(True, True, True, False, 1) """ -.. note:: The following four storage level constants are deprecated in 2.0, since the records \ -will always be serialized in Python. +.. note:: The following four storage level constants are deprecated in 2.0, since the records + will always be serialized in Python. """ StorageLevel.MEMORY_ONLY_SER = StorageLevel.MEMORY_ONLY """.. note:: Deprecated in 2.0, use ``StorageLevel.MEMORY_ONLY`` instead.""" diff --git a/python/pyspark/test_broadcast.py b/python/pyspark/test_broadcast.py new file mode 100644 index 000000000000..a00329c18ad8 --- /dev/null +++ b/python/pyspark/test_broadcast.py @@ -0,0 +1,126 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import random +import tempfile +import unittest + +try: + import xmlrunner +except ImportError: + xmlrunner = None + +from pyspark.broadcast import Broadcast +from pyspark.conf import SparkConf +from pyspark.context import SparkContext +from pyspark.java_gateway import launch_gateway +from pyspark.serializers import ChunkedStream + + +class BroadcastTest(unittest.TestCase): + + def tearDown(self): + if getattr(self, "sc", None) is not None: + self.sc.stop() + self.sc = None + + def _test_encryption_helper(self, vs): + """ + Creates a broadcast variables for each value in vs, and runs a simple job to make sure the + value is the same when it's read in the executors. 
Also makes sure there are no task + failures. + """ + bs = [self.sc.broadcast(value=v) for v in vs] + exec_values = self.sc.parallelize(range(2)).map(lambda x: [b.value for b in bs]).collect() + for ev in exec_values: + self.assertEqual(ev, vs) + # make sure there are no task failures + status = self.sc.statusTracker() + for jid in status.getJobIdsForGroup(): + for sid in status.getJobInfo(jid).stageIds: + stage_info = status.getStageInfo(sid) + self.assertEqual(0, stage_info.numFailedTasks) + + def _test_multiple_broadcasts(self, *extra_confs): + """ + Test broadcast variables make it OK to the executors. Tests multiple broadcast variables, + and also multiple jobs. + """ + conf = SparkConf() + for key, value in extra_confs: + conf.set(key, value) + conf.setMaster("local-cluster[2,1,1024]") + self.sc = SparkContext(conf=conf) + self._test_encryption_helper([5]) + self._test_encryption_helper([5, 10, 20]) + + def test_broadcast_with_encryption(self): + self._test_multiple_broadcasts(("spark.io.encryption.enabled", "true")) + + def test_broadcast_no_encryption(self): + self._test_multiple_broadcasts() + + +class BroadcastFrameProtocolTest(unittest.TestCase): + + @classmethod + def setUpClass(cls): + gateway = launch_gateway(SparkConf()) + cls._jvm = gateway.jvm + cls.longMessage = True + random.seed(42) + + def _test_chunked_stream(self, data, py_buf_size): + # write data using the chunked protocol from python. + chunked_file = tempfile.NamedTemporaryFile(delete=False) + dechunked_file = tempfile.NamedTemporaryFile(delete=False) + dechunked_file.close() + try: + out = ChunkedStream(chunked_file, py_buf_size) + out.write(data) + out.close() + # now try to read it in java + jin = self._jvm.java.io.FileInputStream(chunked_file.name) + jout = self._jvm.java.io.FileOutputStream(dechunked_file.name) + self._jvm.DechunkedInputStream.dechunkAndCopyToOutput(jin, jout) + # java should have decoded it back to the original data + self.assertEqual(len(data), os.stat(dechunked_file.name).st_size) + with open(dechunked_file.name, "rb") as f: + byte = f.read(1) + idx = 0 + while byte: + self.assertEqual(data[idx], bytearray(byte)[0], msg="idx = " + str(idx)) + byte = f.read(1) + idx += 1 + finally: + os.unlink(chunked_file.name) + os.unlink(dechunked_file.name) + + def test_chunked_stream(self): + def random_bytes(n): + return bytearray(random.getrandbits(8) for _ in range(n)) + for data_length in [1, 10, 100, 10000]: + for buffer_length in [1, 2, 5, 8192]: + self._test_chunked_stream(random_bytes(data_length), buffer_length) + +if __name__ == '__main__': + from pyspark.test_broadcast import * + if xmlrunner: + unittest.main(testRunner=xmlrunner.XMLTestRunner(output='target/test-reports'), verbosity=2) + else: + unittest.main(verbosity=2) diff --git a/python/pyspark/test_serializers.py b/python/pyspark/test_serializers.py new file mode 100644 index 000000000000..5b43729f9ebb --- /dev/null +++ b/python/pyspark/test_serializers.py @@ -0,0 +1,90 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import io +import math +import struct +import sys +import unittest + +try: + import xmlrunner +except ImportError: + xmlrunner = None + +from pyspark import serializers + + +def read_int(b): + return struct.unpack("!i", b)[0] + + +def write_int(i): + return struct.pack("!i", i) + + +class SerializersTest(unittest.TestCase): + + def test_chunked_stream(self): + original_bytes = bytearray(range(100)) + for data_length in [1, 10, 100]: + for buffer_length in [1, 2, 3, 5, 20, 99, 100, 101, 500]: + dest = ByteArrayOutput() + stream_out = serializers.ChunkedStream(dest, buffer_length) + stream_out.write(original_bytes[:data_length]) + stream_out.close() + num_chunks = int(math.ceil(float(data_length) / buffer_length)) + # length for each chunk, and a final -1 at the very end + exp_size = (num_chunks + 1) * 4 + data_length + self.assertEqual(len(dest.buffer), exp_size) + dest_pos = 0 + data_pos = 0 + for chunk_idx in range(num_chunks): + chunk_length = read_int(dest.buffer[dest_pos:(dest_pos + 4)]) + if chunk_idx == num_chunks - 1: + exp_length = data_length % buffer_length + if exp_length == 0: + exp_length = buffer_length + else: + exp_length = buffer_length + self.assertEqual(chunk_length, exp_length) + dest_pos += 4 + dest_chunk = dest.buffer[dest_pos:dest_pos + chunk_length] + orig_chunk = original_bytes[data_pos:data_pos + chunk_length] + self.assertEqual(dest_chunk, orig_chunk) + dest_pos += chunk_length + data_pos += chunk_length + # ends with a -1 + self.assertEqual(dest.buffer[-4:], write_int(-1)) + + +class ByteArrayOutput(object): + def __init__(self): + self.buffer = bytearray() + + def write(self, b): + self.buffer += b + + def close(self): + pass + +if __name__ == '__main__': + from pyspark.test_serializers import * + if xmlrunner: + unittest.main(testRunner=xmlrunner.XMLTestRunner(output='target/test-reports'), verbosity=2) + else: + unittest.main(verbosity=2) diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index 8ac1df52fc59..050c2dd01836 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -372,9 +372,16 @@ def tearDown(self): class ReusedPySparkTestCase(unittest.TestCase): + @classmethod + def conf(cls): + """ + Override this in subclasses to supply a more specific conf + """ + return SparkConf() + @classmethod def setUpClass(cls): - cls.sc = SparkContext('local[4]', cls.__name__) + cls.sc = SparkContext('local[4]', cls.__name__, conf=cls.conf()) @classmethod def tearDownClass(cls): diff --git a/python/pyspark/util.py b/python/pyspark/util.py index f015542c8799..f906f4959543 100644 --- a/python/pyspark/util.py +++ b/python/pyspark/util.py @@ -80,7 +80,7 @@ def majorMinorVersion(sparkVersion): (2, 3) """ - m = re.search('^(\d+)\.(\d+)(\..*)?$', sparkVersion) + m = re.search(r'^(\d+)\.(\d+)(\..*)?$', sparkVersion) if m is not None: return (int(m.group(1)), int(m.group(2))) else: diff --git a/python/pyspark/version.py b/python/pyspark/version.py index b9c2c4ced71d..ba2a40cec01e 100644 --- a/python/pyspark/version.py +++ b/python/pyspark/version.py @@ -16,4 +16,4 @@ # See the License for the specific language governing permissions and # 
limitations under the License. -__version__ = "2.4.0.dev0" +__version__ = "3.0.0.dev0" diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py index e934da4d2eb6..8c59f1f999f1 100644 --- a/python/pyspark/worker.py +++ b/python/pyspark/worker.py @@ -97,8 +97,9 @@ def verify_result_length(*a): def wrap_grouped_map_pandas_udf(f, return_type, argspec, runner_conf): - assign_cols_by_pos = runner_conf.get( - "spark.sql.execution.pandas.groupedMap.assignColumnsByPosition", False) + assign_cols_by_name = runner_conf.get( + "spark.sql.legacy.execution.pandas.groupedMap.assignColumnsByName", "true") + assign_cols_by_name = assign_cols_by_name.lower() == "true" def wrapped(key_series, value_series): import pandas as pd @@ -119,7 +120,7 @@ def wrapped(key_series, value_series): "Expected: {} Actual: {}".format(len(return_type), len(result.columns))) # Assign result columns by schema name if user labeled with strings, else use position - if not assign_cols_by_pos and any(isinstance(name, basestring) for name in result.columns): + if assign_cols_by_name and any(isinstance(name, basestring) for name in result.columns): return [(result[field.name], to_arrow_type(field.dataType)) for field in return_type] else: return [(result[result.columns[i]], to_arrow_type(field.dataType)) @@ -324,16 +325,34 @@ def main(infile, outfile): importlib.invalidate_caches() # fetch names and values of broadcast variables + needs_broadcast_decryption_server = read_bool(infile) num_broadcast_variables = read_int(infile) + if needs_broadcast_decryption_server: + # read the decrypted data from a server in the jvm + port = read_int(infile) + auth_secret = utf8_deserializer.loads(infile) + (broadcast_sock_file, _) = local_connect_and_auth(port, auth_secret) + for _ in range(num_broadcast_variables): bid = read_long(infile) if bid >= 0: - path = utf8_deserializer.loads(infile) - _broadcastRegistry[bid] = Broadcast(path=path) + if needs_broadcast_decryption_server: + read_bid = read_long(broadcast_sock_file) + assert(read_bid == bid) + _broadcastRegistry[bid] = \ + Broadcast(sock_file=broadcast_sock_file) + else: + path = utf8_deserializer.loads(infile) + _broadcastRegistry[bid] = Broadcast(path=path) + else: bid = - bid - 1 _broadcastRegistry.pop(bid) + if needs_broadcast_decryption_server: + broadcast_sock_file.write(b'1') + broadcast_sock_file.close() + _accumulatorRegistry.clear() eval_type = read_int(infile) if eval_type == PythonEvalType.NON_UDF: diff --git a/python/run-tests.py b/python/run-tests.py index 4c90926cfa35..ccbdfac3f385 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -138,7 +138,7 @@ def run_individual_python_test(target_dir, test_name, pyspark_python): # 2 (or --verbose option is enabled). decoded_lines = map(lambda line: line.decode(), iter(per_test_output)) skipped_tests = list(filter( - lambda line: re.search('test_.* \(pyspark\..*\) ... skipped ', line), + lambda line: re.search(r'test_.* \(pyspark\..*\) ... 
skipped ', line), decoded_lines)) skipped_counts = len(skipped_tests) if skipped_counts > 0: diff --git a/repl/pom.xml b/repl/pom.xml index e8464a688336..d2a89b274401 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index 920f0f6ebf2c..90bac19cba01 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../../../pom.xml diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala index 1b582fe53624..71e4d321a0e3 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala @@ -225,6 +225,22 @@ private[spark] object Config extends Logging { "Ensure that major Python version is either Python2 or Python3") .createWithDefault("2") + val APP_RESOURCE_TYPE = + ConfigBuilder("spark.kubernetes.resource.type") + .doc("This sets the resource type internally") + .internal() + .stringConf + .createOptional + + val KUBERNETES_LOCAL_DIRS_TMPFS = + ConfigBuilder("spark.kubernetes.local.dirs.tmpfs") + .doc("If set to true then emptyDir volumes created to back SPARK_LOCAL_DIRS will have " + + "their medium set to Memory so that they will be created as tmpfs (i.e. RAM) backed " + + "volumes. This may improve performance but scratch space usage will count towards " + + "your pods memory limit so you may wish to request more memory.") + .booleanConf + .createWithDefault(false) + val KUBERNETES_AUTH_SUBMISSION_CONF_PREFIX = "spark.kubernetes.authenticate.submission" diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesConf.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesConf.scala index 3aa35d419073..cae6e7d5ad51 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesConf.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesConf.scala @@ -24,6 +24,7 @@ import org.apache.spark.SparkConf import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.deploy.k8s.Constants._ import org.apache.spark.deploy.k8s.submit._ +import org.apache.spark.deploy.k8s.submit.KubernetesClientApplication._ import org.apache.spark.internal.config.ConfigEntry @@ -220,10 +221,20 @@ private[spark] object KubernetesConf { val executorVolumes = KubernetesVolumeUtils.parseVolumesWithPrefix( sparkConf, KUBERNETES_EXECUTOR_VOLUMES_PREFIX).map(_.get) + // If no prefix is defined then we are in pure client mode + // (not the one used by cluster mode inside the container) + val appResourceNamePrefix = { + if (sparkConf.getOption(KUBERNETES_EXECUTOR_POD_NAME_PREFIX.key).isEmpty) { + getResourceNamePrefix(getAppName(sparkConf)) + } else { + sparkConf.get(KUBERNETES_EXECUTOR_POD_NAME_PREFIX) + } + } + KubernetesConf( sparkConf.clone(), KubernetesExecutorSpecificConf(executorId, driverPod), - sparkConf.get(KUBERNETES_EXECUTOR_POD_NAME_PREFIX), + appResourceNamePrefix, appId, executorLabels, executorAnnotations, diff --git 
a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesUtils.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesUtils.scala index 588cd9d40f9a..f5fae7cc8c47 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesUtils.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesUtils.scala @@ -16,7 +16,11 @@ */ package org.apache.spark.deploy.k8s -import org.apache.spark.SparkConf +import scala.collection.JavaConverters._ + +import io.fabric8.kubernetes.api.model.{ContainerStateRunning, ContainerStateTerminated, ContainerStateWaiting, ContainerStatus, Pod, Time} + +import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.util.Utils private[spark] object KubernetesUtils { @@ -60,4 +64,81 @@ private[spark] object KubernetesUtils { } def parseMasterUrl(url: String): String = url.substring("k8s://".length) + + def formatPairsBundle(pairs: Seq[(String, String)], indent: Int = 1) : String = { + // Use more loggable format if value is null or empty + val indentStr = "\t" * indent + pairs.map { + case (k, v) => s"\n$indentStr $k: ${Option(v).filter(_.nonEmpty).getOrElse("N/A")}" + }.mkString("") + } + + /** + * Given a pod, output a human readable representation of its state + * + * @param pod Pod + * @return Human readable pod state + */ + def formatPodState(pod: Pod): String = { + val details = Seq[(String, String)]( + // pod metadata + ("pod name", pod.getMetadata.getName), + ("namespace", pod.getMetadata.getNamespace), + ("labels", pod.getMetadata.getLabels.asScala.mkString(", ")), + ("pod uid", pod.getMetadata.getUid), + ("creation time", formatTime(pod.getMetadata.getCreationTimestamp)), + + // spec details + ("service account name", pod.getSpec.getServiceAccountName), + ("volumes", pod.getSpec.getVolumes.asScala.map(_.getName).mkString(", ")), + ("node name", pod.getSpec.getNodeName), + + // status + ("start time", formatTime(pod.getStatus.getStartTime)), + ("phase", pod.getStatus.getPhase), + ("container status", containersDescription(pod, 2)) + ) + + formatPairsBundle(details) + } + + def containersDescription(p: Pod, indent: Int = 1): String = { + p.getStatus.getContainerStatuses.asScala.map { status => + Seq( + ("container name", status.getName), + ("container image", status.getImage)) ++ + containerStatusDescription(status) + }.map(p => formatPairsBundle(p, indent)).mkString("\n\n") + } + + def containerStatusDescription(containerStatus: ContainerStatus) + : Seq[(String, String)] = { + val state = containerStatus.getState + Option(state.getRunning) + .orElse(Option(state.getTerminated)) + .orElse(Option(state.getWaiting)) + .map { + case running: ContainerStateRunning => + Seq( + ("container state", "running"), + ("container started at", formatTime(running.getStartedAt))) + case waiting: ContainerStateWaiting => + Seq( + ("container state", "waiting"), + ("pending reason", waiting.getReason)) + case terminated: ContainerStateTerminated => + Seq( + ("container state", "terminated"), + ("container started at", formatTime(terminated.getStartedAt)), + ("container finished at", formatTime(terminated.getFinishedAt)), + ("exit code", terminated.getExitCode.toString), + ("termination reason", terminated.getReason)) + case unknown => + throw new SparkException(s"Unexpected container status type ${unknown.getClass}.") + }.getOrElse(Seq(("container state", "N/A"))) + } + + def formatTime(time: Time): String = { + if 
(time != null) time.getTime else "N/A" + } } diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/BasicExecutorFeatureStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/BasicExecutorFeatureStep.scala index c37f713c56de..d89995ba5e4f 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/BasicExecutorFeatureStep.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/BasicExecutorFeatureStep.scala @@ -24,7 +24,7 @@ import org.apache.spark.SparkException import org.apache.spark.deploy.k8s._ import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.deploy.k8s.Constants._ -import org.apache.spark.internal.config.{EXECUTOR_CLASS_PATH, EXECUTOR_JAVA_OPTIONS, EXECUTOR_MEMORY, EXECUTOR_MEMORY_OVERHEAD} +import org.apache.spark.internal.config.{EXECUTOR_CLASS_PATH, EXECUTOR_JAVA_OPTIONS, EXECUTOR_MEMORY, EXECUTOR_MEMORY_OVERHEAD, PYSPARK_EXECUTOR_MEMORY} import org.apache.spark.rpc.RpcEndpointAddress import org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend import org.apache.spark.util.Utils @@ -58,6 +58,16 @@ private[spark] class BasicExecutorFeatureStep( (kubernetesConf.get(MEMORY_OVERHEAD_FACTOR) * executorMemoryMiB).toInt, MEMORY_OVERHEAD_MIN_MIB)) private val executorMemoryWithOverhead = executorMemoryMiB + memoryOverheadMiB + private val executorMemoryTotal = kubernetesConf.sparkConf + .getOption(APP_RESOURCE_TYPE.key).map{ res => + val additionalPySparkMemory = res match { + case "python" => + kubernetesConf.sparkConf + .get(PYSPARK_EXECUTOR_MEMORY).map(_.toInt).getOrElse(0) + case _ => 0 + } + executorMemoryWithOverhead + additionalPySparkMemory + }.getOrElse(executorMemoryWithOverhead) private val executorCores = kubernetesConf.sparkConf.getInt("spark.executor.cores", 1) private val executorCoresRequest = @@ -76,7 +86,7 @@ private[spark] class BasicExecutorFeatureStep( // executorId val hostname = name.substring(Math.max(0, name.length - 63)) val executorMemoryQuantity = new QuantityBuilder(false) - .withAmount(s"${executorMemoryWithOverhead}Mi") + .withAmount(s"${executorMemoryTotal}Mi") .build() val executorCpuQuantity = new QuantityBuilder(false) .withAmount(executorCoresRequest) diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/LocalDirsFeatureStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/LocalDirsFeatureStep.scala index 70b307303d14..be386e119d46 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/LocalDirsFeatureStep.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/LocalDirsFeatureStep.scala @@ -22,6 +22,7 @@ import java.util.UUID import io.fabric8.kubernetes.api.model.{ContainerBuilder, HasMetadata, PodBuilder, VolumeBuilder, VolumeMountBuilder} import org.apache.spark.deploy.k8s.{KubernetesConf, KubernetesDriverSpecificConf, KubernetesRoleSpecificConf, SparkPod} +import org.apache.spark.deploy.k8s.Config._ private[spark] class LocalDirsFeatureStep( conf: KubernetesConf[_ <: KubernetesRoleSpecificConf], @@ -37,6 +38,7 @@ private[spark] class LocalDirsFeatureStep( .orElse(conf.getOption("spark.local.dir")) .getOrElse(defaultLocalDir) .split(",") + private val useLocalDirTmpFs = conf.get(KUBERNETES_LOCAL_DIRS_TMPFS) override def configurePod(pod: SparkPod): SparkPod = { val localDirVolumes 
= resolvedLocalDirs @@ -45,6 +47,7 @@ private[spark] class LocalDirsFeatureStep( new VolumeBuilder() .withName(s"spark-local-dir-${index + 1}") .withNewEmptyDir() + .withMedium(if (useLocalDirTmpFs) "Memory" else null) .endEmptyDir() .build() } diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/bindings/JavaDriverFeatureStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/bindings/JavaDriverFeatureStep.scala index f52ec9fdc677..6f063b253cd7 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/bindings/JavaDriverFeatureStep.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/bindings/JavaDriverFeatureStep.scala @@ -19,6 +19,7 @@ package org.apache.spark.deploy.k8s.features.bindings import io.fabric8.kubernetes.api.model.{ContainerBuilder, HasMetadata} import org.apache.spark.deploy.k8s.{KubernetesConf, KubernetesDriverSpecificConf, SparkPod} +import org.apache.spark.deploy.k8s.Config.APP_RESOURCE_TYPE import org.apache.spark.deploy.k8s.Constants.SPARK_CONF_PATH import org.apache.spark.deploy.k8s.features.KubernetesFeatureConfigStep import org.apache.spark.launcher.SparkLauncher @@ -38,7 +39,8 @@ private[spark] class JavaDriverFeatureStep( .build() SparkPod(pod.pod, withDriverArgs) } - override def getAdditionalPodSystemProperties(): Map[String, String] = Map.empty + override def getAdditionalPodSystemProperties(): Map[String, String] = + Map(APP_RESOURCE_TYPE.key -> "java") override def getAdditionalKubernetesResources(): Seq[HasMetadata] = Seq.empty } diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/bindings/PythonDriverFeatureStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/bindings/PythonDriverFeatureStep.scala index 406944a95338..cf0c03b22bd7 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/bindings/PythonDriverFeatureStep.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/bindings/PythonDriverFeatureStep.scala @@ -21,6 +21,7 @@ import scala.collection.JavaConverters._ import io.fabric8.kubernetes.api.model.{ContainerBuilder, EnvVarBuilder, HasMetadata} import org.apache.spark.deploy.k8s.{KubernetesConf, KubernetesDriverSpecificConf, KubernetesUtils, SparkPod} +import org.apache.spark.deploy.k8s.Config.APP_RESOURCE_TYPE import org.apache.spark.deploy.k8s.Constants._ import org.apache.spark.deploy.k8s.features.KubernetesFeatureConfigStep @@ -68,7 +69,8 @@ private[spark] class PythonDriverFeatureStep( SparkPod(pod.pod, withPythonPrimaryContainer) } - override def getAdditionalPodSystemProperties(): Map[String, String] = Map.empty + override def getAdditionalPodSystemProperties(): Map[String, String] = + Map(APP_RESOURCE_TYPE.key -> "python") override def getAdditionalKubernetesResources(): Seq[HasMetadata] = Seq.empty } diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/bindings/RDriverFeatureStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/bindings/RDriverFeatureStep.scala index 11b09b399618..1a7ef52fefe7 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/bindings/RDriverFeatureStep.scala +++ 
b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/bindings/RDriverFeatureStep.scala @@ -21,6 +21,7 @@ import scala.collection.JavaConverters._ import io.fabric8.kubernetes.api.model.{ContainerBuilder, EnvVarBuilder, HasMetadata} import org.apache.spark.deploy.k8s.{KubernetesConf, KubernetesDriverSpecificConf, KubernetesUtils, SparkPod} +import org.apache.spark.deploy.k8s.Config.APP_RESOURCE_TYPE import org.apache.spark.deploy.k8s.Constants._ import org.apache.spark.deploy.k8s.features.KubernetesFeatureConfigStep @@ -54,7 +55,8 @@ private[spark] class RDriverFeatureStep( SparkPod(pod.pod, withRPrimaryContainer) } - override def getAdditionalPodSystemProperties(): Map[String, String] = Map.empty + override def getAdditionalPodSystemProperties(): Map[String, String] = + Map(APP_RESOURCE_TYPE.key -> "r") override def getAdditionalKubernetesResources(): Seq[HasMetadata] = Seq.empty } diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesClientApplication.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesClientApplication.scala index 986c950ab365..af3903ac5da5 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesClientApplication.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesClientApplication.scala @@ -17,7 +17,7 @@ package org.apache.spark.deploy.k8s.submit import java.io.StringWriter -import java.util.{Collections, UUID} +import java.util.{Collections, Locale, UUID} import java.util.Properties import io.fabric8.kubernetes.api.model._ @@ -211,11 +211,8 @@ private[spark] class KubernetesClientApplication extends SparkApplication { // considerably restrictive, e.g. must be no longer than 63 characters in length. So we generate // a unique app ID (captured by spark.app.id) in the format below. 
val kubernetesAppId = s"spark-${UUID.randomUUID().toString.replaceAll("-", "")}" - val launchTime = System.currentTimeMillis() val waitForAppCompletion = sparkConf.get(WAIT_FOR_APP_COMPLETION) - val kubernetesResourceNamePrefix = { - s"$appName-$launchTime".toLowerCase.replaceAll("\\.", "-") - } + val kubernetesResourceNamePrefix = KubernetesClientApplication.getResourceNamePrefix(appName) sparkConf.set(KUBERNETES_PYSPARK_PY_FILES, clientArguments.maybePyFiles.getOrElse("")) val kubernetesConf = KubernetesConf.createDriverConf( sparkConf, @@ -254,3 +251,19 @@ private[spark] class KubernetesClientApplication extends SparkApplication { } } } + +private[spark] object KubernetesClientApplication { + + def getAppName(conf: SparkConf): String = conf.getOption("spark.app.name").getOrElse("spark") + + def getResourceNamePrefix(appName: String): String = { + val launchTime = System.currentTimeMillis() + s"$appName-$launchTime" + .trim + .toLowerCase(Locale.ROOT) + .replaceAll("\\s+", "-") + .replaceAll("\\.", "-") + .replaceAll("[^a-z0-9\\-]", "") + .replaceAll("-+", "-") + } +} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/LoggingPodStatusWatcher.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/LoggingPodStatusWatcher.scala index 173ac541626a..1889fe5eb3e9 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/LoggingPodStatusWatcher.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/LoggingPodStatusWatcher.scala @@ -25,6 +25,7 @@ import io.fabric8.kubernetes.client.{KubernetesClientException, Watcher} import io.fabric8.kubernetes.client.Watcher.Action import org.apache.spark.SparkException +import org.apache.spark.deploy.k8s.KubernetesUtils._ import org.apache.spark.internal.Logging import org.apache.spark.util.ThreadUtils @@ -99,82 +100,10 @@ private[k8s] class LoggingPodStatusWatcherImpl( scheduler.shutdown() } - private def formatPodState(pod: Pod): String = { - val details = Seq[(String, String)]( - // pod metadata - ("pod name", pod.getMetadata.getName), - ("namespace", pod.getMetadata.getNamespace), - ("labels", pod.getMetadata.getLabels.asScala.mkString(", ")), - ("pod uid", pod.getMetadata.getUid), - ("creation time", formatTime(pod.getMetadata.getCreationTimestamp)), - - // spec details - ("service account name", pod.getSpec.getServiceAccountName), - ("volumes", pod.getSpec.getVolumes.asScala.map(_.getName).mkString(", ")), - ("node name", pod.getSpec.getNodeName), - - // status - ("start time", formatTime(pod.getStatus.getStartTime)), - ("container images", - pod.getStatus.getContainerStatuses - .asScala - .map(_.getImage) - .mkString(", ")), - ("phase", pod.getStatus.getPhase), - ("status", pod.getStatus.getContainerStatuses.toString) - ) - - formatPairsBundle(details) - } - - private def formatPairsBundle(pairs: Seq[(String, String)]) = { - // Use more loggable format if value is null or empty - pairs.map { - case (k, v) => s"\n\t $k: ${Option(v).filter(_.nonEmpty).getOrElse("N/A")}" - }.mkString("") - } - override def awaitCompletion(): Unit = { podCompletedFuture.await() logInfo(pod.map { p => s"Container final statuses:\n\n${containersDescription(p)}" }.getOrElse("No containers were found in the driver pod.")) } - - private def containersDescription(p: Pod): String = { - p.getStatus.getContainerStatuses.asScala.map { status => - Seq( - ("Container name", status.getName), - ("Container image", 
status.getImage)) ++ - containerStatusDescription(status) - }.map(formatPairsBundle).mkString("\n\n") - } - - private def containerStatusDescription( - containerStatus: ContainerStatus): Seq[(String, String)] = { - val state = containerStatus.getState - Option(state.getRunning) - .orElse(Option(state.getTerminated)) - .orElse(Option(state.getWaiting)) - .map { - case running: ContainerStateRunning => - Seq( - ("Container state", "Running"), - ("Container started at", formatTime(running.getStartedAt))) - case waiting: ContainerStateWaiting => - Seq( - ("Container state", "Waiting"), - ("Pending reason", waiting.getReason)) - case terminated: ContainerStateTerminated => - Seq( - ("Container state", "Terminated"), - ("Exit code", terminated.getExitCode.toString)) - case unknown => - throw new SparkException(s"Unexpected container status type ${unknown.getClass}.") - }.getOrElse(Seq(("Container state", "N/A"))) - } - - private def formatTime(time: Time): String = { - if (time != null) time.getTime else "N/A" - } } diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala index b28d93990313..cc254b896249 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala @@ -24,6 +24,7 @@ import scala.collection.mutable import org.apache.spark.SparkConf import org.apache.spark.deploy.k8s.Config._ +import org.apache.spark.deploy.k8s.KubernetesUtils._ import org.apache.spark.internal.Logging import org.apache.spark.scheduler.ExecutorExited import org.apache.spark.util.Utils @@ -99,8 +100,11 @@ private[spark] class ExecutorPodsLifecycleManager( } } } - logDebug(s"Removed executors with ids ${execIdsRemovedInThisRound.mkString(",")}" + - s" from Spark that were either found to be deleted or non-existent in the cluster.") + + if (execIdsRemovedInThisRound.nonEmpty) { + logDebug(s"Removed executors with ids ${execIdsRemovedInThisRound.mkString(",")}" + + s" from Spark that were either found to be deleted or non-existent in the cluster.") + } } private def onFinalNonDeletedState( @@ -151,13 +155,15 @@ private[spark] class ExecutorPodsLifecycleManager( private def exitReasonMessage(podState: FinalPodState, execId: Long, exitCode: Int) = { val pod = podState.pod + val reason = Option(pod.getStatus.getReason) + val message = Option(pod.getStatus.getMessage) s""" |The executor with id $execId exited with exit code $exitCode. 
- |The API gave the following brief reason: ${pod.getStatus.getReason} - |The API gave the following message: ${pod.getStatus.getMessage} + |The API gave the following brief reason: ${reason.getOrElse("N/A")} + |The API gave the following message: ${message.getOrElse("N/A")} |The API gave the following container statuses: | - |${pod.getStatus.getContainerStatuses.asScala.map(_.toString).mkString("\n===\n")} + |${containersDescription(pod)} """.stripMargin } diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsSnapshot.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsSnapshot.scala index 26be91804341..435a5f1461c9 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsSnapshot.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsSnapshot.scala @@ -16,6 +16,8 @@ */ package org.apache.spark.scheduler.cluster.k8s +import java.util.Locale + import io.fabric8.kubernetes.api.model.Pod import org.apache.spark.deploy.k8s.Constants._ @@ -52,7 +54,7 @@ object ExecutorPodsSnapshot extends Logging { if (isDeleted(pod)) { PodDeleted(pod) } else { - val phase = pod.getStatus.getPhase.toLowerCase + val phase = pod.getStatus.getPhase.toLowerCase(Locale.ROOT) phase match { case "pending" => PodPending(pod) diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/BasicDriverFeatureStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/BasicDriverFeatureStepSuite.scala index d98e11355464..0968cce971c3 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/BasicDriverFeatureStepSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/BasicDriverFeatureStepSuite.scala @@ -57,7 +57,6 @@ class BasicDriverFeatureStepSuite extends SparkFunSuite { MAIN_CLASS, APP_ARGS) - test("Check the pod respects all configurations from the user.") { val sparkConf = new SparkConf() .set(KUBERNETES_DRIVER_POD_NAME, "spark-driver-pod") diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/BasicExecutorFeatureStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/BasicExecutorFeatureStepSuite.scala index 95d373f79164..63b237b9dfe4 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/BasicExecutorFeatureStepSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/BasicExecutorFeatureStepSuite.scala @@ -75,6 +75,7 @@ class BasicExecutorFeatureStepSuite .set("spark.driver.host", DRIVER_HOSTNAME) .set("spark.driver.port", DRIVER_PORT.toString) .set(IMAGE_PULL_SECRETS, TEST_IMAGE_PULL_SECRETS.mkString(",")) + .set("spark.kubernetes.resource.type", "java") } test("basic executor pod has reasonable defaults") { @@ -161,6 +162,29 @@ class BasicExecutorFeatureStepSuite checkOwnerReferences(executor.pod, DRIVER_POD_UID) } + test("test executor pyspark memory") { + val conf = baseConf.clone() + conf.set("spark.kubernetes.resource.type", "python") + conf.set(org.apache.spark.internal.config.PYSPARK_EXECUTOR_MEMORY, 42L) + + val step = new BasicExecutorFeatureStep( + KubernetesConf( + conf, + KubernetesExecutorSpecificConf("1", Some(DRIVER_POD)), + 
RESOURCE_NAME_PREFIX, + APP_ID, + LABELS, + ANNOTATIONS, + Map.empty, + Map.empty, + Map.empty, + Nil, + Seq.empty[String])) + val executor = step.configurePod(SparkPod.initialPod()) + // This is checking that basic executor + executorMemory = 1408 + 42 = 1450 + assert(executor.container.getResources.getRequests.get("memory").getAmount === "1450Mi") + } + // There is always exactly one controller reference, and it points to the driver pod. private def checkOwnerReferences(executor: Pod, driverPodUid: String): Unit = { assert(executor.getMetadata.getOwnerReferences.size() === 1) diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/LocalDirsFeatureStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/LocalDirsFeatureStepSuite.scala index a339827b819a..acdd07bc594b 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/LocalDirsFeatureStepSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/LocalDirsFeatureStepSuite.scala @@ -18,10 +18,12 @@ package org.apache.spark.deploy.k8s.features import io.fabric8.kubernetes.api.model.{EnvVarBuilder, VolumeBuilder, VolumeMountBuilder} import org.mockito.Mockito +import org.scalatest._ import org.scalatest.BeforeAndAfter import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.deploy.k8s.{KubernetesConf, KubernetesDriverSpecificConf, KubernetesRoleSpecificConf, SparkPod} +import org.apache.spark.deploy.k8s.Config._ class LocalDirsFeatureStepSuite extends SparkFunSuite with BeforeAndAfter { private val defaultLocalDir = "/var/data/default-local-dir" @@ -111,4 +113,32 @@ class LocalDirsFeatureStepSuite extends SparkFunSuite with BeforeAndAfter { .withValue("/var/data/my-local-dir-1,/var/data/my-local-dir-2") .build()) } + + test("Use tmpfs to back default local dir") { + Mockito.doReturn(null).when(sparkConf).get("spark.local.dir") + Mockito.doReturn(null).when(sparkConf).getenv("SPARK_LOCAL_DIRS") + Mockito.doReturn(true).when(sparkConf).get(KUBERNETES_LOCAL_DIRS_TMPFS) + val stepUnderTest = new LocalDirsFeatureStep(kubernetesConf, defaultLocalDir) + val configuredPod = stepUnderTest.configurePod(SparkPod.initialPod()) + assert(configuredPod.pod.getSpec.getVolumes.size === 1) + assert(configuredPod.pod.getSpec.getVolumes.get(0) === + new VolumeBuilder() + .withName(s"spark-local-dir-1") + .withNewEmptyDir() + .withMedium("Memory") + .endEmptyDir() + .build()) + assert(configuredPod.container.getVolumeMounts.size === 1) + assert(configuredPod.container.getVolumeMounts.get(0) === + new VolumeMountBuilder() + .withName(s"spark-local-dir-1") + .withMountPath(defaultLocalDir) + .build()) + assert(configuredPod.container.getEnv.size === 1) + assert(configuredPod.container.getEnv.get(0) === + new EnvVarBuilder() + .withName("SPARK_LOCAL_DIRS") + .withValue(defaultLocalDir) + .build()) + } } diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/bindings/JavaDriverFeatureStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/bindings/JavaDriverFeatureStepSuite.scala index 18874afe6e53..bf552aeb8b90 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/bindings/JavaDriverFeatureStepSuite.scala +++ 
b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/bindings/JavaDriverFeatureStepSuite.scala @@ -56,6 +56,5 @@ class JavaDriverFeatureStepSuite extends SparkFunSuite { "--properties-file", SPARK_CONF_PATH, "--class", "test-class", "spark-internal", "5 7")) - } } diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocatorSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocatorSuite.scala index e847f8590d35..0e617b002101 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocatorSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocatorSuite.scala @@ -167,13 +167,23 @@ class ExecutorPodsAllocatorSuite extends SparkFunSuite with BeforeAndAfter { executorSpecificConf.executorId, TEST_SPARK_APP_ID, Some(driverPod)) - k8sConf.sparkConf.getAll.toMap == conf.getAll.toMap && + + // Set prefixes to a common string since KUBERNETES_EXECUTOR_POD_NAME_PREFIX + // has not be set for the tests and thus KubernetesConf will use a random + // string for the prefix, based on the app name, and this comparison here will fail. + val k8sConfCopy = k8sConf + .copy(appResourceNamePrefix = "") + .copy(sparkConf = conf) + val expectedK8sConfCopy = expectedK8sConf + .copy(appResourceNamePrefix = "") + .copy(sparkConf = conf) + + k8sConf.sparkConf.getAll.toMap == conf.getAll.toMap && // Since KubernetesConf.createExecutorConf clones the SparkConf object, force // deep equality comparison for the SparkConf object and use object equality // comparison on all other fields. - k8sConf.copy(sparkConf = conf) == expectedK8sConf.copy(sparkConf = conf) + k8sConfCopy == expectedK8sConfCopy } } }) - } diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManagerSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManagerSuite.scala index 562ace9f49d4..d8409383b4a1 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManagerSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManagerSuite.scala @@ -31,6 +31,7 @@ import scala.collection.mutable import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.deploy.k8s.Fabric8Aliases._ +import org.apache.spark.deploy.k8s.KubernetesUtils._ import org.apache.spark.scheduler.ExecutorExited import org.apache.spark.scheduler.cluster.k8s.ExecutorLifecycleTestUtils._ @@ -104,13 +105,15 @@ class ExecutorPodsLifecycleManagerSuite extends SparkFunSuite with BeforeAndAfte } private def exitReasonMessage(failedExecutorId: Int, failedPod: Pod): String = { + val reason = Option(failedPod.getStatus.getReason) + val message = Option(failedPod.getStatus.getMessage) s""" |The executor with id $failedExecutorId exited with exit code 1. 
- |The API gave the following brief reason: ${failedPod.getStatus.getReason} - |The API gave the following message: ${failedPod.getStatus.getMessage} + |The API gave the following brief reason: ${reason.getOrElse("N/A")} + |The API gave the following message: ${message.getOrElse("N/A")} |The API gave the following container statuses: | - |${failedPod.getStatus.getContainerStatuses.asScala.map(_.toString).mkString("\n===\n")} + |${containersDescription(failedPod)} """.stripMargin } diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile index 071aa2020dd8..1c4dcd547687 100644 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile @@ -19,6 +19,7 @@ FROM openjdk:8-alpine ARG spark_jars=jars ARG img_path=kubernetes/dockerfiles +ARG k8s_tests=kubernetes/tests # Before building the docker image, first build and make a Spark distribution following # the instructions in http://spark.apache.org/docs/latest/building-spark.html. @@ -43,6 +44,7 @@ COPY bin /opt/spark/bin COPY sbin /opt/spark/sbin COPY ${img_path}/spark/entrypoint.sh /opt/ COPY examples /opt/spark/examples +COPY ${k8s_tests} /opt/spark/tests COPY data /opt/spark/data ENV SPARK_HOME /opt/spark diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/bindings/R/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/bindings/R/Dockerfile index e627883ba782..9f67422efeb3 100644 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/bindings/R/Dockerfile +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/bindings/R/Dockerfile @@ -19,10 +19,10 @@ ARG base_img FROM $base_img WORKDIR / RUN mkdir ${SPARK_HOME}/R -COPY R ${SPARK_HOME}/R RUN apk add --no-cache R R-dev +COPY R ${SPARK_HOME}/R ENV R_HOME /usr/lib/R WORKDIR /opt/spark/work-dir diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/bindings/python/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/bindings/python/Dockerfile index 72bb9620b45d..69b6efa6149a 100644 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/bindings/python/Dockerfile +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/bindings/python/Dockerfile @@ -19,7 +19,6 @@ ARG base_img FROM $base_img WORKDIR / RUN mkdir ${SPARK_HOME}/python -COPY python/lib ${SPARK_HOME}/python/lib # TODO: Investigate running both pip and pip3 via virtualenvs RUN apk add --no-cache python && \ apk add --no-cache python3 && \ @@ -33,6 +32,7 @@ RUN apk add --no-cache python && \ # Removed the .cache to save space rm -r /root/.cache +COPY python/lib ${SPARK_HOME}/python/lib ENV PYTHONPATH ${SPARK_HOME}/python/lib/pyspark.zip:${SPARK_HOME}/python/lib/py4j-*.zip WORKDIR /opt/spark/work-dir diff --git a/resource-managers/kubernetes/integration-tests/pom.xml b/resource-managers/kubernetes/integration-tests/pom.xml index 614705c1ed66..23453c8957b2 100644 --- a/resource-managers/kubernetes/integration-tests/pom.xml +++ b/resource-managers/kubernetes/integration-tests/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../../../pom.xml diff --git a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesSuite.scala 
b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesSuite.scala index 896a83a5badb..c99a907f98d0 100644 --- a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesSuite.scala +++ b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesSuite.scala @@ -23,7 +23,10 @@ import java.util.regex.Pattern import com.google.common.io.PatternFilenameFilter import io.fabric8.kubernetes.api.model.Pod +import io.fabric8.kubernetes.client.{KubernetesClientException, Watcher} +import io.fabric8.kubernetes.client.Watcher.Action import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, Tag} +import org.scalatest.Matchers import org.scalatest.concurrent.{Eventually, PatienceConfiguration} import org.scalatest.time.{Minutes, Seconds, Span} import scala.collection.JavaConverters._ @@ -31,10 +34,12 @@ import scala.collection.JavaConverters._ import org.apache.spark.SparkFunSuite import org.apache.spark.deploy.k8s.integrationtest.TestConfig._ import org.apache.spark.deploy.k8s.integrationtest.backend.{IntegrationTestBackend, IntegrationTestBackendFactory} +import org.apache.spark.internal.Logging private[spark] class KubernetesSuite extends SparkFunSuite with BeforeAndAfterAll with BeforeAndAfter with BasicTestsSuite with SecretsTestsSuite - with PythonTestsSuite with ClientModeTestsSuite { + with PythonTestsSuite with ClientModeTestsSuite + with Logging with Eventually with Matchers { import KubernetesSuite._ @@ -50,7 +55,19 @@ private[spark] class KubernetesSuite extends SparkFunSuite protected var containerLocalSparkDistroExamplesJar: String = _ protected var appLocator: String = _ + // Default memory limit is 1024M + 384M (minimum overhead constant) + private val baseMemory = s"${1024 + 384}Mi" + protected val memOverheadConstant = 0.8 + private val standardNonJVMMemory = s"${(1024 + 0.4*1024).toInt}Mi" + protected val additionalMemory = 200 + // 209715200 is 200Mi + protected val additionalMemoryInBytes = 209715200 + private val extraDriverTotalMemory = s"${(1024 + memOverheadConstant*1024).toInt}Mi" + private val extraExecTotalMemory = + s"${(1024 + memOverheadConstant*1024 + additionalMemory).toInt}Mi" + override def beforeAll(): Unit = { + super.beforeAll() // The scalatest-maven-plugin gives system properties that are referenced but not set null // values. We need to remove the null-value properties before initializing the test backend. 
val nullValueProperties = System.getProperties.asScala @@ -82,7 +99,11 @@ private[spark] class KubernetesSuite extends SparkFunSuite } override def afterAll(): Unit = { - testBackend.cleanUp() + try { + testBackend.cleanUp() + } finally { + super.afterAll() + } } before { @@ -207,17 +228,28 @@ private[spark] class KubernetesSuite extends SparkFunSuite .getItems .get(0) driverPodChecker(driverPod) - - val executorPods = kubernetesTestComponents.kubernetesClient + val execPods = scala.collection.mutable.Map[String, Pod]() + val execWatcher = kubernetesTestComponents.kubernetesClient .pods() .withLabel("spark-app-locator", appLocator) .withLabel("spark-role", "executor") - .list() - .getItems - executorPods.asScala.foreach { pod => - executorPodChecker(pod) - } - + .watch(new Watcher[Pod] { + logInfo("Beginning watch of executors") + override def onClose(cause: KubernetesClientException): Unit = + logInfo("Ending watch of executors") + override def eventReceived(action: Watcher.Action, resource: Pod): Unit = { + val name = resource.getMetadata.getName + action match { + case Action.ADDED | Action.MODIFIED => + execPods(name) = resource + case Action.DELETED | Action.ERROR => + execPods.remove(name) + } + } + }) + Eventually.eventually(TIMEOUT, INTERVAL) { execPods.values.nonEmpty should be (true) } + execWatcher.close() + execPods.values.foreach(executorPodChecker(_)) Eventually.eventually(TIMEOUT, INTERVAL) { expectedLogOnCompletion.foreach { e => assert(kubernetesTestComponents.kubernetesClient @@ -228,11 +260,12 @@ private[spark] class KubernetesSuite extends SparkFunSuite } } } - protected def doBasicDriverPodCheck(driverPod: Pod): Unit = { assert(driverPod.getMetadata.getName === driverPodName) assert(driverPod.getSpec.getContainers.get(0).getImage === image) assert(driverPod.getSpec.getContainers.get(0).getName === "spark-kubernetes-driver") + assert(driverPod.getSpec.getContainers.get(0).getResources.getRequests.get("memory").getAmount + === baseMemory) } @@ -240,28 +273,48 @@ private[spark] class KubernetesSuite extends SparkFunSuite assert(driverPod.getMetadata.getName === driverPodName) assert(driverPod.getSpec.getContainers.get(0).getImage === pyImage) assert(driverPod.getSpec.getContainers.get(0).getName === "spark-kubernetes-driver") + assert(driverPod.getSpec.getContainers.get(0).getResources.getRequests.get("memory").getAmount + === standardNonJVMMemory) } protected def doBasicDriverRPodCheck(driverPod: Pod): Unit = { assert(driverPod.getMetadata.getName === driverPodName) assert(driverPod.getSpec.getContainers.get(0).getImage === rImage) assert(driverPod.getSpec.getContainers.get(0).getName === "spark-kubernetes-driver") + assert(driverPod.getSpec.getContainers.get(0).getResources.getRequests.get("memory").getAmount + === standardNonJVMMemory) } protected def doBasicExecutorPodCheck(executorPod: Pod): Unit = { assert(executorPod.getSpec.getContainers.get(0).getImage === image) assert(executorPod.getSpec.getContainers.get(0).getName === "executor") + assert(executorPod.getSpec.getContainers.get(0).getResources.getRequests.get("memory").getAmount + === baseMemory) } protected def doBasicExecutorPyPodCheck(executorPod: Pod): Unit = { assert(executorPod.getSpec.getContainers.get(0).getImage === pyImage) assert(executorPod.getSpec.getContainers.get(0).getName === "executor") + assert(executorPod.getSpec.getContainers.get(0).getResources.getRequests.get("memory").getAmount + === standardNonJVMMemory) } protected def doBasicExecutorRPodCheck(executorPod: Pod): Unit = { 
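+ // Like the Python check above, the R image runs a non-JVM driver/executor, so the expected request is standardNonJVMMemory, i.e. (1024 + 0.4 * 1024).toInt = 1433Mi, rather than baseMemory (1024 + 384 = 1408Mi).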
assert(executorPod.getSpec.getContainers.get(0).getImage === rImage) assert(executorPod.getSpec.getContainers.get(0).getName === "executor") + assert(executorPod.getSpec.getContainers.get(0).getResources.getRequests.get("memory").getAmount + === standardNonJVMMemory) + } + + protected def doDriverMemoryCheck(driverPod: Pod): Unit = { + assert(driverPod.getSpec.getContainers.get(0).getResources.getRequests.get("memory").getAmount + === extraDriverTotalMemory) + } + + protected def doExecutorMemoryCheck(executorPod: Pod): Unit = { + assert(executorPod.getSpec.getContainers.get(0).getResources.getRequests.get("memory").getAmount + === extraExecTotalMemory) } protected def checkCustomSettings(pod: Pod): Unit = { diff --git a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesTestComponents.scala b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesTestComponents.scala index b602fdf39731..5615d6173eeb 100644 --- a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesTestComponents.scala +++ b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesTestComponents.scala @@ -62,7 +62,6 @@ private[spark] class KubernetesTestComponents(defaultClient: DefaultKubernetesCl new SparkAppConf() .set("spark.master", s"k8s://${kubernetesClient.getMasterUrl}") .set("spark.kubernetes.namespace", namespace) - .set("spark.executor.memory", "500m") .set("spark.executor.cores", "1") .set("spark.executors.instances", "1") .set("spark.app.name", "spark-test-app") diff --git a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/PythonTestsSuite.scala b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/PythonTestsSuite.scala index 1ebb30094dcd..06b73107ec23 100644 --- a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/PythonTestsSuite.scala +++ b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/PythonTestsSuite.scala @@ -23,9 +23,11 @@ private[spark] trait PythonTestsSuite { k8sSuite: KubernetesSuite => import PythonTestsSuite._ import KubernetesSuite.k8sTestTag + private val pySparkDockerImage = + s"${getTestImageRepo}/spark-py:${getTestImageTag}" test("Run PySpark on simple pi.py example", k8sTestTag) { sparkAppConf - .set("spark.kubernetes.container.image", s"${getTestImageRepo}/spark-py:${getTestImageTag}") + .set("spark.kubernetes.container.image", pySparkDockerImage) runSparkApplicationAndVerifyCompletion( appResource = PYSPARK_PI, mainClass = "", @@ -39,7 +41,7 @@ private[spark] trait PythonTestsSuite { k8sSuite: KubernetesSuite => test("Run PySpark with Python2 to test a pyfiles example", k8sTestTag) { sparkAppConf - .set("spark.kubernetes.container.image", s"${getTestImageRepo}/spark-py:${getTestImageTag}") + .set("spark.kubernetes.container.image", pySparkDockerImage) .set("spark.kubernetes.pyspark.pythonVersion", "2") runSparkApplicationAndVerifyCompletion( appResource = PYSPARK_FILES, @@ -57,7 +59,7 @@ private[spark] trait PythonTestsSuite { k8sSuite: KubernetesSuite => test("Run PySpark with Python3 to test a pyfiles example", k8sTestTag) { sparkAppConf - .set("spark.kubernetes.container.image", 
s"${getTestImageRepo}/spark-py:${getTestImageTag}") + .set("spark.kubernetes.container.image", pySparkDockerImage) .set("spark.kubernetes.pyspark.pythonVersion", "3") runSparkApplicationAndVerifyCompletion( appResource = PYSPARK_FILES, @@ -72,12 +74,32 @@ private[spark] trait PythonTestsSuite { k8sSuite: KubernetesSuite => isJVM = false, pyFiles = Some(PYSPARK_CONTAINER_TESTS)) } + + test("Run PySpark with memory customization", k8sTestTag) { + sparkAppConf + .set("spark.kubernetes.container.image", pySparkDockerImage) + .set("spark.kubernetes.pyspark.pythonVersion", "3") + .set("spark.kubernetes.memoryOverheadFactor", s"$memOverheadConstant") + .set("spark.executor.pyspark.memory", s"${additionalMemory}m") + runSparkApplicationAndVerifyCompletion( + appResource = PYSPARK_MEMORY_CHECK, + mainClass = "", + expectedLogOnCompletion = Seq( + "PySpark Worker Memory Check is: True"), + appArgs = Array(s"$additionalMemoryInBytes"), + driverPodChecker = doDriverMemoryCheck, + executorPodChecker = doExecutorMemoryCheck, + appLocator = appLocator, + isJVM = false, + pyFiles = Some(PYSPARK_CONTAINER_TESTS)) + } } private[spark] object PythonTestsSuite { val CONTAINER_LOCAL_PYSPARK: String = "local:///opt/spark/examples/src/main/python/" val PYSPARK_PI: String = CONTAINER_LOCAL_PYSPARK + "pi.py" - val PYSPARK_FILES: String = CONTAINER_LOCAL_PYSPARK + "pyfiles.py" - val PYSPARK_CONTAINER_TESTS: String = CONTAINER_LOCAL_PYSPARK + "py_container_checks.py" + val TEST_LOCAL_PYSPARK: String = "local:///opt/spark/tests/" + val PYSPARK_FILES: String = TEST_LOCAL_PYSPARK + "pyfiles.py" + val PYSPARK_CONTAINER_TESTS: String = TEST_LOCAL_PYSPARK + "py_container_checks.py" + val PYSPARK_MEMORY_CHECK: String = TEST_LOCAL_PYSPARK + "worker_memory_check.py" } - diff --git a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/SecretsTestsSuite.scala b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/SecretsTestsSuite.scala index 7b05c1355ca2..b18a6aebda49 100644 --- a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/SecretsTestsSuite.scala +++ b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/SecretsTestsSuite.scala @@ -18,7 +18,7 @@ package org.apache.spark.deploy.k8s.integrationtest import scala.collection.JavaConverters._ -import io.fabric8.kubernetes.api.model.{Pod, Secret, SecretBuilder} +import io.fabric8.kubernetes.api.model.{Pod, SecretBuilder} import org.apache.commons.codec.binary.Base64 import org.apache.commons.io.output.ByteArrayOutputStream import org.scalatest.concurrent.Eventually diff --git a/examples/src/main/python/py_container_checks.py b/resource-managers/kubernetes/integration-tests/tests/py_container_checks.py similarity index 100% rename from examples/src/main/python/py_container_checks.py rename to resource-managers/kubernetes/integration-tests/tests/py_container_checks.py diff --git a/examples/src/main/python/pyfiles.py b/resource-managers/kubernetes/integration-tests/tests/pyfiles.py similarity index 100% rename from examples/src/main/python/pyfiles.py rename to resource-managers/kubernetes/integration-tests/tests/pyfiles.py diff --git a/resource-managers/kubernetes/integration-tests/tests/worker_memory_check.py b/resource-managers/kubernetes/integration-tests/tests/worker_memory_check.py new file mode 100644 index 000000000000..d312a29f388e --- /dev/null +++ 
b/resource-managers/kubernetes/integration-tests/tests/worker_memory_check.py @@ -0,0 +1,47 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import print_function + +import resource +import sys + +from pyspark.sql import SparkSession + + +if __name__ == "__main__": + """ + Usage: worker_memory_check [Memory_in_Mi] + """ + spark = SparkSession \ + .builder \ + .appName("PyMemoryTest") \ + .getOrCreate() + sc = spark.sparkContext + if len(sys.argv) < 2: + print("Usage: worker_memory_check [Memory_in_Mi]", file=sys.stderr) + sys.exit(-1) + + def f(x): + rLimit = resource.getrlimit(resource.RLIMIT_AS) + print("RLimit is " + str(rLimit)) + return rLimit + resourceValue = sc.parallelize([1]).map(f).collect()[0][0] + print("Resource Value is " + str(resourceValue)) + truthCheck = (resourceValue == int(sys.argv[1])) + print("PySpark Worker Memory Check is: " + str(truthCheck)) + spark.stop() diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index 3995d0afeb5f..9585bdfafdcf 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../../pom.xml diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcher.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcher.scala index 64698b55c6bb..32ac4f37c5f9 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcher.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcher.scala @@ -17,6 +17,7 @@ package org.apache.spark.deploy.mesos +import java.util.Locale import java.util.concurrent.CountDownLatch import org.apache.spark.{SecurityManager, SparkConf, SparkException} @@ -60,7 +61,7 @@ private[mesos] class MesosClusterDispatcher( } private val publicAddress = Option(conf.getenv("SPARK_PUBLIC_DNS")).getOrElse(args.host) - private val recoveryMode = conf.get(RECOVERY_MODE).toUpperCase() + private val recoveryMode = conf.get(RECOVERY_MODE).toUpperCase(Locale.ROOT) logInfo("Recovery mode in Mesos dispatcher set to: " + recoveryMode) private val engineFactory = recoveryMode match { diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala index 7d80eedcc43c..cb1bcba651be 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala @@ -202,7 +202,7 @@ 
private[spark] class MesosClusterScheduler( } else if (removeFromPendingRetryDrivers(submissionId)) { k.success = true k.message = "Removed driver while it's being retried" - } else if (finishedDrivers.exists(_.driverDescription.submissionId.equals(submissionId))) { + } else if (finishedDrivers.exists(_.driverDescription.submissionId == submissionId)) { k.success = false k.message = "Driver already terminated" } else { @@ -222,21 +222,21 @@ private[spark] class MesosClusterScheduler( } s.submissionId = submissionId stateLock.synchronized { - if (queuedDrivers.exists(_.submissionId.equals(submissionId))) { + if (queuedDrivers.exists(_.submissionId == submissionId)) { s.success = true s.driverState = "QUEUED" } else if (launchedDrivers.contains(submissionId)) { s.success = true s.driverState = "RUNNING" launchedDrivers(submissionId).mesosTaskStatus.foreach(state => s.message = state.toString) - } else if (finishedDrivers.exists(_.driverDescription.submissionId.equals(submissionId))) { + } else if (finishedDrivers.exists(_.driverDescription.submissionId == submissionId)) { s.success = true s.driverState = "FINISHED" finishedDrivers .find(d => d.driverDescription.submissionId.equals(submissionId)).get.mesosTaskStatus .foreach(state => s.message = state.toString) - } else if (pendingRetryDrivers.exists(_.submissionId.equals(submissionId))) { - val status = pendingRetryDrivers.find(_.submissionId.equals(submissionId)) + } else if (pendingRetryDrivers.exists(_.submissionId == submissionId)) { + val status = pendingRetryDrivers.find(_.submissionId == submissionId) .get.retryState.get.lastFailureStatus s.success = true s.driverState = "RETRYING" @@ -254,13 +254,13 @@ private[spark] class MesosClusterScheduler( */ def getDriverState(submissionId: String): Option[MesosDriverState] = { stateLock.synchronized { - queuedDrivers.find(_.submissionId.equals(submissionId)) + queuedDrivers.find(_.submissionId == submissionId) .map(d => new MesosDriverState("QUEUED", d)) .orElse(launchedDrivers.get(submissionId) .map(d => new MesosDriverState("RUNNING", d.driverDescription, Some(d)))) - .orElse(finishedDrivers.find(_.driverDescription.submissionId.equals(submissionId)) + .orElse(finishedDrivers.find(_.driverDescription.submissionId == submissionId) .map(d => new MesosDriverState("FINISHED", d.driverDescription, Some(d)))) - .orElse(pendingRetryDrivers.find(_.submissionId.equals(submissionId)) + .orElse(pendingRetryDrivers.find(_.submissionId == submissionId) .map(d => new MesosDriverState("RETRYING", d))) } } @@ -814,7 +814,7 @@ private[spark] class MesosClusterScheduler( status: Int): Unit = {} private def removeFromQueuedDrivers(subId: String): Boolean = { - val index = queuedDrivers.indexWhere(_.submissionId.equals(subId)) + val index = queuedDrivers.indexWhere(_.submissionId == subId) if (index != -1) { queuedDrivers.remove(index) queuedDriversState.expunge(subId) @@ -834,7 +834,7 @@ private[spark] class MesosClusterScheduler( } private def removeFromPendingRetryDrivers(subId: String): Boolean = { - val index = pendingRetryDrivers.indexWhere(_.submissionId.equals(subId)) + val index = pendingRetryDrivers.indexWhere(_.submissionId == subId) if (index != -1) { pendingRetryDrivers.remove(index) pendingRetryDriversState.expunge(subId) diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala index 
178de30f0f38..bac0246b7ddc 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala @@ -33,6 +33,7 @@ import org.apache.mesos.SchedulerDriver import org.apache.spark.{SecurityManager, SparkConf, SparkContext, SparkException, TaskState} import org.apache.spark.deploy.mesos.config._ import org.apache.spark.internal.config +import org.apache.spark.internal.config.EXECUTOR_HEARTBEAT_INTERVAL import org.apache.spark.launcher.{LauncherBackend, SparkAppHandle} import org.apache.spark.network.netty.SparkTransportConf import org.apache.spark.network.shuffle.mesos.MesosExternalShuffleClient @@ -635,7 +636,7 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( externalShufflePort, sc.conf.getTimeAsMs("spark.storage.blockManagerSlaveTimeoutMs", s"${sc.conf.getTimeAsSeconds("spark.network.timeout", "120s")}s"), - sc.conf.getTimeAsMs("spark.executor.heartbeatInterval", "10s")) + sc.conf.get(EXECUTOR_HEARTBEAT_INTERVAL)) slave.shuffleRegistered = true } diff --git a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterSchedulerSuite.scala b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterSchedulerSuite.scala index e534b9d7e3ed..082d4bcfdf83 100644 --- a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterSchedulerSuite.scala +++ b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterSchedulerSuite.scala @@ -21,7 +21,7 @@ import java.util.{Collection, Collections, Date} import scala.collection.JavaConverters._ -import org.apache.mesos.Protos.{Environment, Secret, TaskState => MesosTaskState, _} +import org.apache.mesos.Protos.{TaskState => MesosTaskState, _} import org.apache.mesos.Protos.Value.{Scalar, Type} import org.apache.mesos.SchedulerDriver import org.mockito.{ArgumentCaptor, Matchers} @@ -146,14 +146,14 @@ class MesosClusterSchedulerSuite extends SparkFunSuite with LocalSparkContext wi assert(scheduler.getResource(resources, "cpus") == 1.5) assert(scheduler.getResource(resources, "mem") == 1200) val resourcesSeq: Seq[Resource] = resources.asScala - val cpus = resourcesSeq.filter(_.getName.equals("cpus")).toList + val cpus = resourcesSeq.filter(_.getName == "cpus").toList assert(cpus.size == 2) - assert(cpus.exists(_.getRole().equals("role2"))) - assert(cpus.exists(_.getRole().equals("*"))) - val mem = resourcesSeq.filter(_.getName.equals("mem")).toList + assert(cpus.exists(_.getRole() == "role2")) + assert(cpus.exists(_.getRole() == "*")) + val mem = resourcesSeq.filter(_.getName == "mem").toList assert(mem.size == 2) - assert(mem.exists(_.getRole().equals("role2"))) - assert(mem.exists(_.getRole().equals("*"))) + assert(mem.exists(_.getRole() == "role2")) + assert(mem.exists(_.getRole() == "*")) verify(driver, times(1)).launchTasks( Matchers.eq(Collections.singleton(offer.getId)), diff --git a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala index 31f84310485a..1ead4b1ed7c7 100644 --- a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala +++ 
b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala @@ -106,7 +106,7 @@ class MesosFineGrainedSchedulerBackendSuite // uri is null. val (executorInfo, _) = mesosSchedulerBackend.createExecutorInfo(resources, "test-id") val executorResources = executorInfo.getResourcesList - val cpus = executorResources.asScala.find(_.getName.equals("cpus")).get.getScalar.getValue + val cpus = executorResources.asScala.find(_.getName == "cpus").get.getScalar.getValue assert(cpus === mesosExecutorCores) } diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index 37e25ceecb88..e55b814be846 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../../pom.xml diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala index 7fa597167f3f..26013a109c42 100644 --- a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala +++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala @@ -191,7 +191,7 @@ class ClientSuite extends SparkFunSuite with Matchers { appContext.getQueue should be ("staging-queue") appContext.getAMContainerSpec should be (containerLaunchContext) appContext.getApplicationType should be ("SPARK") - appContext.getClass.getMethods.filter(_.getName.equals("getApplicationTags")).foreach{ method => + appContext.getClass.getMethods.filter(_.getName == "getApplicationTags").foreach { method => val tags = method.invoke(appContext).asInstanceOf[java.util.Set[String]] tags should contain allOf ("tag1", "dup", "tag2", "multi word") tags.asScala.count(_.nonEmpty) should be (4) diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceMetricsSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceMetricsSuite.scala new file mode 100644 index 000000000000..40b92282a3b8 --- /dev/null +++ b/resource-managers/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceMetricsSuite.scala @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.network.yarn + +import scala.collection.JavaConverters._ + +import org.apache.hadoop.metrics2.MetricsRecordBuilder +import org.mockito.Matchers._ +import org.mockito.Mockito.{mock, times, verify, when} +import org.scalatest.Matchers + +import org.apache.spark.SparkFunSuite +import org.apache.spark.network.server.OneForOneStreamManager +import org.apache.spark.network.shuffle.{ExternalShuffleBlockHandler, ExternalShuffleBlockResolver} + +class YarnShuffleServiceMetricsSuite extends SparkFunSuite with Matchers { + + val streamManager = mock(classOf[OneForOneStreamManager]) + val blockResolver = mock(classOf[ExternalShuffleBlockResolver]) + when(blockResolver.getRegisteredExecutorsSize).thenReturn(42) + + val metrics = new ExternalShuffleBlockHandler(streamManager, blockResolver).getAllMetrics + + test("metrics named as expected") { + val allMetrics = Set( + "openBlockRequestLatencyMillis", "registerExecutorRequestLatencyMillis", + "blockTransferRateBytes", "registeredExecutorsSize") + + metrics.getMetrics.keySet().asScala should be (allMetrics) + } + + // these three metrics have the same effect on the collector + for (testname <- Seq("openBlockRequestLatencyMillis", + "registerExecutorRequestLatencyMillis", + "blockTransferRateBytes")) { + test(s"$testname - collector receives correct types") { + val builder = mock(classOf[MetricsRecordBuilder]) + when(builder.addCounter(any(), anyLong())).thenReturn(builder) + when(builder.addGauge(any(), anyDouble())).thenReturn(builder) + + YarnShuffleServiceMetrics.collectMetric(builder, testname, + metrics.getMetrics.get(testname)) + + verify(builder).addCounter(anyObject(), anyLong()) + verify(builder, times(4)).addGauge(anyObject(), anyDouble()) + } + } + + // this metric writes only one gauge to the collector + test("registeredExecutorsSize - collector receives correct types") { + val builder = mock(classOf[MetricsRecordBuilder]) + + YarnShuffleServiceMetrics.collectMetric(builder, "registeredExecutorsSize", + metrics.getMetrics.get("registeredExecutorsSize")) + + // only one + verify(builder).addGauge(anyObject(), anyInt()) + } +} diff --git a/scalastyle-config.xml b/scalastyle-config.xml index da5c3f29c32d..36a73e336221 100644 --- a/scalastyle-config.xml +++ b/scalastyle-config.xml @@ -227,6 +227,19 @@ This file is divided into 3 sections: ]]> + + (\.toUpperCase|\.toLowerCase)(?!(\(|\(Locale.ROOT\))) + + + JavaConversions diff --git a/sql/catalyst/benchmarks/HashBenchmark-results.txt b/sql/catalyst/benchmarks/HashBenchmark-results.txt new file mode 100644 index 000000000000..2459b35c75bb --- /dev/null +++ b/sql/catalyst/benchmarks/HashBenchmark-results.txt @@ -0,0 +1,70 @@ +================================================================================================ +single ints +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Hash For single ints: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +interpreted version 5615 / 5616 95.6 10.5 1.0X +codegen version 8400 / 8407 63.9 15.6 0.7X +codegen version 64-bit 8139 / 8145 66.0 15.2 0.7X +codegen HiveHash version 7213 / 7348 74.4 13.4 0.8X + + +================================================================================================ +single longs 
+================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Hash For single longs: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +interpreted version 6053 / 6054 88.7 11.3 1.0X +codegen version 9367 / 9369 57.3 17.4 0.6X +codegen version 64-bit 8041 / 8051 66.8 15.0 0.8X +codegen HiveHash version 7546 / 7575 71.1 14.1 0.8X + + +================================================================================================ +normal +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Hash For normal: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +interpreted version 3181 / 3182 0.7 1517.0 1.0X +codegen version 2403 / 2403 0.9 1145.7 1.3X +codegen version 64-bit 915 / 916 2.3 436.2 3.5X +codegen HiveHash version 4505 / 4527 0.5 2148.3 0.7X + + +================================================================================================ +array +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Hash For array: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +interpreted version 1828 / 1844 0.1 13946.1 1.0X +codegen version 3678 / 3804 0.0 28058.2 0.5X +codegen version 64-bit 2925 / 2931 0.0 22317.8 0.6X +codegen HiveHash version 1216 / 1217 0.1 9280.0 1.5X + + +================================================================================================ +map +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Hash For map: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +interpreted version 0 / 0 44.3 22.6 1.0X +codegen version 176 / 176 0.0 42978.8 0.0X +codegen version 64-bit 173 / 175 0.0 42214.3 0.0X +codegen HiveHash version 44 / 44 0.1 10659.9 0.0X + + diff --git a/sql/catalyst/benchmarks/HashByteArrayBenchmark-results.txt b/sql/catalyst/benchmarks/HashByteArrayBenchmark-results.txt new file mode 100644 index 000000000000..a4304ee3b5f6 --- /dev/null +++ b/sql/catalyst/benchmarks/HashByteArrayBenchmark-results.txt @@ -0,0 +1,77 @@ +================================================================================================ +Benchmark for MurMurHash 3 and xxHash64 +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Hash byte arrays with length 8: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Murmur3_x86_32 16 / 16 127.7 7.8 1.0X +xxHash 64-bit 23 / 23 90.7 11.0 0.7X 
+HiveHasher 16 / 16 134.8 7.4 1.1X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Hash byte arrays with length 16: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Murmur3_x86_32 26 / 26 79.5 12.6 1.0X +xxHash 64-bit 26 / 27 79.3 12.6 1.0X +HiveHasher 30 / 30 70.1 14.3 0.9X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Hash byte arrays with length 24: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Murmur3_x86_32 36 / 36 58.1 17.2 1.0X +xxHash 64-bit 30 / 30 70.2 14.2 1.2X +HiveHasher 45 / 45 46.4 21.5 0.8X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Hash byte arrays with length 31: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Murmur3_x86_32 50 / 50 41.8 23.9 1.0X +xxHash 64-bit 43 / 43 49.3 20.3 1.2X +HiveHasher 58 / 58 35.9 27.8 0.9X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Hash byte arrays with length 95: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Murmur3_x86_32 132 / 132 15.9 62.7 1.0X +xxHash 64-bit 79 / 79 26.7 37.5 1.7X +HiveHasher 198 / 199 10.6 94.6 0.7X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Hash byte arrays with length 287: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Murmur3_x86_32 334 / 334 6.3 159.3 1.0X +xxHash 64-bit 126 / 126 16.7 59.9 2.7X +HiveHasher 633 / 634 3.3 302.0 0.5X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Hash byte arrays with length 1055: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Murmur3_x86_32 1149 / 1149 1.8 547.9 1.0X +xxHash 64-bit 327 / 327 6.4 155.9 3.5X +HiveHasher 2338 / 2346 0.9 1114.6 0.5X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Hash byte arrays with length 2079: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Murmur3_x86_32 2215 / 2216 0.9 1056.1 1.0X +xxHash 64-bit 554 / 554 3.8 264.0 4.0X +HiveHasher 4609 / 4609 0.5 2197.5 0.5X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Hash byte arrays with length 8223: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Murmur3_x86_32 8633 / 8643 0.2 4116.3 1.0X +xxHash 64-bit 1891 / 1892 1.1 901.6 4.6X +HiveHasher 18206 / 18206 0.1 8681.3 0.5X + + diff --git a/sql/catalyst/benchmarks/UnsafeProjectionBenchmark-results.txt b/sql/catalyst/benchmarks/UnsafeProjectionBenchmark-results.txt new file mode 
100644 index 000000000000..43156dc6fc67 --- /dev/null +++ b/sql/catalyst/benchmarks/UnsafeProjectionBenchmark-results.txt @@ -0,0 +1,14 @@ +================================================================================================ +unsafe projection +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +unsafe projection: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +single long 2867 / 2868 93.6 10.7 1.0X +single nullable long 3915 / 3949 68.6 14.6 0.7X +7 primitive types 8166 / 8167 32.9 30.4 0.4X +7 nullable primitive types 12767 / 12767 21.0 47.6 0.2X + + diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 7d23637e2834..2e7df4fd1404 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../../pom.xml diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index 94283f59011a..056998630b09 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -99,7 +99,7 @@ statement | CREATE TABLE (IF NOT EXISTS)? target=tableIdentifier LIKE source=tableIdentifier locationSpec? #createTableLike | ANALYZE TABLE tableIdentifier partitionSpec? COMPUTE STATISTICS - (identifier | FOR COLUMNS identifierSeq)? #analyze + (identifier | FOR COLUMNS identifierSeq | FOR ALL COLUMNS)? #analyze | ALTER TABLE tableIdentifier ADD COLUMNS '(' columns=colTypeList ')' #addTableColumns | ALTER (TABLE | VIEW) from=tableIdentifier @@ -468,7 +468,7 @@ joinType joinCriteria : ON booleanExpression - | USING '(' identifier (',' identifier)* ')' + | USING identifierList ; sample diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/RowBasedKeyValueBatch.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/RowBasedKeyValueBatch.java index 551443a11298..460513816dfd 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/RowBasedKeyValueBatch.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/RowBasedKeyValueBatch.java @@ -16,6 +16,7 @@ */ package org.apache.spark.sql.catalyst.expressions; +import java.io.Closeable; import java.io.IOException; import org.apache.spark.memory.MemoryConsumer; @@ -45,7 +46,7 @@ * page requires an average size for key value pairs to be larger than 1024 bytes. 
* */ -public abstract class RowBasedKeyValueBatch extends MemoryConsumer { +public abstract class RowBasedKeyValueBatch extends MemoryConsumer implements Closeable { protected final Logger logger = LoggerFactory.getLogger(RowBasedKeyValueBatch.class); private static final int DEFAULT_CAPACITY = 1 << 16; diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java index 9e7b15d339ee..9002abdcfd47 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java @@ -27,7 +27,6 @@ import org.apache.spark.unsafe.array.ByteArrayMethods; import org.apache.spark.unsafe.bitset.BitSetMethods; import org.apache.spark.unsafe.hash.Murmur3_x86_32; -import org.apache.spark.unsafe.memory.MemoryBlock; import org.apache.spark.unsafe.types.CalendarInterval; import org.apache.spark.unsafe.types.UTF8String; @@ -241,8 +240,7 @@ public UTF8String getUTF8String(int ordinal) { final long offsetAndSize = getLong(ordinal); final int offset = (int) (offsetAndSize >> 32); final int size = (int) offsetAndSize; - MemoryBlock mb = MemoryBlock.allocateFromObject(baseObject, baseOffset + offset, size); - return new UTF8String(mb); + return UTF8String.fromAddress(baseObject, baseOffset + offset, size); } @Override diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java index 469b0e60cc9a..a76e6ef8c91c 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java @@ -37,7 +37,6 @@ import org.apache.spark.unsafe.array.ByteArrayMethods; import org.apache.spark.unsafe.bitset.BitSetMethods; import org.apache.spark.unsafe.hash.Murmur3_x86_32; -import org.apache.spark.unsafe.memory.MemoryBlock; import org.apache.spark.unsafe.types.CalendarInterval; import org.apache.spark.unsafe.types.UTF8String; @@ -417,8 +416,7 @@ public UTF8String getUTF8String(int ordinal) { final long offsetAndSize = getLong(ordinal); final int offset = (int) (offsetAndSize >> 32); final int size = (int) offsetAndSize; - MemoryBlock mb = MemoryBlock.allocateFromObject(baseObject, baseOffset + offset, size); - return new UTF8String(mb); + return UTF8String.fromAddress(baseObject, baseOffset + offset, size); } @Override diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/XXH64.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/XXH64.java index 8e9c0a2e9dc8..eb5051b28407 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/XXH64.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/XXH64.java @@ -16,7 +16,7 @@ */ package org.apache.spark.sql.catalyst.expressions; -import org.apache.spark.unsafe.memory.MemoryBlock; +import org.apache.spark.unsafe.Platform; import org.apache.spark.unsafe.types.UTF8String; // scalastyle: off @@ -72,13 +72,13 @@ public static long hashLong(long input, long seed) { return fmix(hash); } - public long hashUnsafeWordsBlock(MemoryBlock mb) { - return hashUnsafeWordsBlock(mb, seed); + public long hashUnsafeWords(Object base, long offset, int length) { + return hashUnsafeWords(base, offset, length, seed); } - public 
static long hashUnsafeWordsBlock(MemoryBlock mb, long seed) { - assert (mb.size() % 8 == 0) : "lengthInBytes must be a multiple of 8 (word-aligned)"; - long hash = hashBytesByWordsBlock(mb, seed); + public static long hashUnsafeWords(Object base, long offset, int length, long seed) { + assert (length % 8 == 0) : "lengthInBytes must be a multiple of 8 (word-aligned)"; + long hash = hashBytesByWords(base, offset, length, seed); return fmix(hash); } @@ -86,22 +86,20 @@ public long hashUnsafeBytes(Object base, long offset, int length) { return hashUnsafeBytes(base, offset, length, seed); } - public static long hashUnsafeBytesBlock(MemoryBlock mb, long seed) { - long offset = 0; - long length = mb.size(); + public static long hashUnsafeBytes(Object base, long offset, int length, long seed) { assert (length >= 0) : "lengthInBytes cannot be negative"; - long hash = hashBytesByWordsBlock(mb, seed); + long hash = hashBytesByWords(base, offset, length, seed); long end = offset + length; offset += length & -8; if (offset + 4L <= end) { - hash ^= (mb.getInt(offset) & 0xFFFFFFFFL) * PRIME64_1; + hash ^= (Platform.getInt(base, offset) & 0xFFFFFFFFL) * PRIME64_1; hash = Long.rotateLeft(hash, 23) * PRIME64_2 + PRIME64_3; offset += 4L; } while (offset < end) { - hash ^= (mb.getByte(offset) & 0xFFL) * PRIME64_5; + hash ^= (Platform.getByte(base, offset) & 0xFFL) * PRIME64_5; hash = Long.rotateLeft(hash, 11) * PRIME64_1; offset++; } @@ -109,11 +107,7 @@ public static long hashUnsafeBytesBlock(MemoryBlock mb, long seed) { } public static long hashUTF8String(UTF8String str, long seed) { - return hashUnsafeBytesBlock(str.getMemoryBlock(), seed); - } - - public static long hashUnsafeBytes(Object base, long offset, int length, long seed) { - return hashUnsafeBytesBlock(MemoryBlock.allocateFromObject(base, offset, length), seed); + return hashUnsafeBytes(str.getBaseObject(), str.getBaseOffset(), str.numBytes(), seed); } private static long fmix(long hash) { @@ -125,31 +119,30 @@ private static long fmix(long hash) { return hash; } - private static long hashBytesByWordsBlock(MemoryBlock mb, long seed) { - long offset = 0; - long length = mb.size(); + private static long hashBytesByWords(Object base, long offset, int length, long seed) { + long end = offset + length; long hash; if (length >= 32) { - long limit = length - 32; + long limit = end - 32; long v1 = seed + PRIME64_1 + PRIME64_2; long v2 = seed + PRIME64_2; long v3 = seed; long v4 = seed - PRIME64_1; do { - v1 += mb.getLong(offset) * PRIME64_2; + v1 += Platform.getLong(base, offset) * PRIME64_2; v1 = Long.rotateLeft(v1, 31); v1 *= PRIME64_1; - v2 += mb.getLong(offset + 8) * PRIME64_2; + v2 += Platform.getLong(base, offset + 8) * PRIME64_2; v2 = Long.rotateLeft(v2, 31); v2 *= PRIME64_1; - v3 += mb.getLong(offset + 16) * PRIME64_2; + v3 += Platform.getLong(base, offset + 16) * PRIME64_2; v3 = Long.rotateLeft(v3, 31); v3 *= PRIME64_1; - v4 += mb.getLong(offset + 24) * PRIME64_2; + v4 += Platform.getLong(base, offset + 24) * PRIME64_2; v4 = Long.rotateLeft(v4, 31); v4 *= PRIME64_1; @@ -190,9 +183,9 @@ private static long hashBytesByWordsBlock(MemoryBlock mb, long seed) { hash += length; - long limit = length - 8; + long limit = end - 8; while (offset <= limit) { - long k1 = mb.getLong(offset); + long k1 = Platform.getLong(base, offset); hash ^= Long.rotateLeft(k1 * PRIME64_2, 31) * PRIME64_1; hash = Long.rotateLeft(hash, 27) * PRIME64_1 + PRIME64_4; offset += 8L; diff --git 
a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UTF8StringBuilder.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UTF8StringBuilder.java index f8000d78cd1b..f0f66bae245f 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UTF8StringBuilder.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UTF8StringBuilder.java @@ -19,8 +19,6 @@ import org.apache.spark.unsafe.Platform; import org.apache.spark.unsafe.array.ByteArrayMethods; -import org.apache.spark.unsafe.memory.ByteArrayMemoryBlock; -import org.apache.spark.unsafe.memory.MemoryBlock; import org.apache.spark.unsafe.types.UTF8String; /** @@ -31,34 +29,43 @@ public class UTF8StringBuilder { private static final int ARRAY_MAX = ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH; - private ByteArrayMemoryBlock buffer; - private int length = 0; + private byte[] buffer; + private int cursor = Platform.BYTE_ARRAY_OFFSET; public UTF8StringBuilder() { // Since initial buffer size is 16 in `StringBuilder`, we set the same size here - this.buffer = new ByteArrayMemoryBlock(16); + this.buffer = new byte[16]; } // Grows the buffer by at least `neededSize` private void grow(int neededSize) { - if (neededSize > ARRAY_MAX - length) { + if (neededSize > ARRAY_MAX - totalSize()) { throw new UnsupportedOperationException( "Cannot grow internal buffer by size " + neededSize + " because the size after growing " + "exceeds size limitation " + ARRAY_MAX); } - final int requestedSize = length + neededSize; - if (buffer.size() < requestedSize) { - int newLength = requestedSize < ARRAY_MAX / 2 ? requestedSize * 2 : ARRAY_MAX; - final ByteArrayMemoryBlock tmp = new ByteArrayMemoryBlock(newLength); - MemoryBlock.copyMemory(buffer, tmp, length); + final int length = totalSize() + neededSize; + if (buffer.length < length) { + int newLength = length < ARRAY_MAX / 2 ? length * 2 : ARRAY_MAX; + final byte[] tmp = new byte[newLength]; + Platform.copyMemory( + buffer, + Platform.BYTE_ARRAY_OFFSET, + tmp, + Platform.BYTE_ARRAY_OFFSET, + totalSize()); buffer = tmp; } } + private int totalSize() { + return cursor - Platform.BYTE_ARRAY_OFFSET; + } + public void append(UTF8String value) { grow(value.numBytes()); - value.writeToMemory(buffer.getByteArray(), length + Platform.BYTE_ARRAY_OFFSET); - length += value.numBytes(); + value.writeToMemory(buffer, cursor); + cursor += value.numBytes(); } public void append(String value) { @@ -66,6 +73,6 @@ public void append(String value) { } public UTF8String build() { - return UTF8String.fromBytes(buffer.getByteArray(), 0, length); + return UTF8String.fromBytes(buffer, 0, totalSize()); } } diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UnsafeRowWriter.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UnsafeRowWriter.java index 71c49d8ed017..3960d6d52047 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UnsafeRowWriter.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UnsafeRowWriter.java @@ -185,13 +185,13 @@ public void write(int ordinal, Decimal input, int precision, int scale) { // grow the global buffer before writing data. holder.grow(16); + // always zero-out the 16-byte buffer + Platform.putLong(getBuffer(), cursor(), 0L); + Platform.putLong(getBuffer(), cursor() + 8, 0L); + // Make sure Decimal object has the same scale as DecimalType. 
// Note that we may pass in null Decimal object to set null for it. if (input == null || !input.changePrecision(precision, scale)) { - // zero-out the bytes - Platform.putLong(getBuffer(), cursor(), 0L); - Platform.putLong(getBuffer(), cursor() + 8, 0L); - BitSetMethods.set(getBuffer(), startingOffset, ordinal); // keep the offset for future update setOffsetAndSize(ordinal, 0); @@ -200,8 +200,6 @@ public void write(int ordinal, Decimal input, int precision, int scale) { final int numBytes = bytes.length; assert numBytes <= 16; - zeroOutPaddingBytes(numBytes); - // Write the bytes to the variable length portion. Platform.copyMemory( bytes, Platform.BYTE_ARRAY_OFFSET, getBuffer(), cursor(), numBytes); diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 580133dd971b..d72e512e0df5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -554,8 +554,10 @@ class Analyzer( Cast(value, pivotColumn.dataType, Some(conf.sessionLocalTimeZone)).eval(EmptyRow) } // Group-by expressions coming from SQL are implicit and need to be deduced. - val groupByExprs = groupByExprsOpt.getOrElse( - (child.outputSet -- aggregates.flatMap(_.references) -- pivotColumn.references).toSeq) + val groupByExprs = groupByExprsOpt.getOrElse { + val pivotColAndAggRefs = pivotColumn.references ++ AttributeSet(aggregates) + child.output.filterNot(pivotColAndAggRefs.contains) + } val singleAgg = aggregates.size == 1 def outputName(value: Expression, aggregate: Expression): String = { val stringValue = value match { @@ -1045,7 +1047,7 @@ class Analyzer( // support CURRENT_DATE and CURRENT_TIMESTAMP val literalFunctions = Seq(CurrentDate(), CurrentTimestamp()) val name = nameParts.head - val func = literalFunctions.find(e => resolver(e.prettyName, name)) + val func = literalFunctions.find(e => caseInsensitiveResolution(e.prettyName, name)) func.map(wrapper) } @@ -1436,21 +1438,7 @@ class Analyzer( val expr = resolveSubQuery(l, plans)((plan, exprs) => { ListQuery(plan, exprs, exprId, plan.output) }) - val subqueryOutput = expr.plan.output - val resolvedIn = InSubquery(values, expr.asInstanceOf[ListQuery]) - if (values.length != subqueryOutput.length) { - throw new AnalysisException( - s"""Cannot analyze ${resolvedIn.sql}. - |The number of columns in the left hand side of an IN subquery does not match the - |number of columns in the output of subquery. - |#columns in left hand side: ${values.length} - |#columns in right hand side: ${subqueryOutput.length} - |Left side columns: - |[${values.map(_.sql).mkString(", ")}] - |Right side columns: - |[${subqueryOutput.map(_.sql).mkString(", ")}]""".stripMargin) - } - resolvedIn + InSubquery(values, expr.asInstanceOf[ListQuery]) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala index e511f8064e28..82692334544e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala @@ -290,11 +290,13 @@ object DecimalPrecision extends TypeCoercionRule { // potentially loosing 11 digits of the fractional part. 
Using only the precision needed // by the Literal, instead, the result would be DECIMAL(38 + 1 + 1, 18), which would // become DECIMAL(38, 16), safely having a much lower precision loss. - case (l: Literal, r) if r.dataType.isInstanceOf[DecimalType] - && l.dataType.isInstanceOf[IntegralType] => + case (l: Literal, r) if r.dataType.isInstanceOf[DecimalType] && + l.dataType.isInstanceOf[IntegralType] && + SQLConf.get.literalPickMinimumPrecision => b.makeCopy(Array(Cast(l, DecimalType.fromLiteral(l)), r)) - case (l, r: Literal) if l.dataType.isInstanceOf[DecimalType] - && r.dataType.isInstanceOf[IntegralType] => + case (l, r: Literal) if l.dataType.isInstanceOf[DecimalType] && + r.dataType.isInstanceOf[IntegralType] && + SQLConf.get.literalPickMinimumPrecision => b.makeCopy(Array(l, Cast(r, DecimalType.fromLiteral(r)))) // Promote integers inside a binary expression with fixed-precision decimals to decimals, // and fixed-precision decimals in an expression with floats / doubles to doubles diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index 77860e1584f4..7dafebff7987 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -267,6 +267,7 @@ object FunctionRegistry { expression[Subtract]("-"), expression[Multiply]("*"), expression[Divide]("/"), + expression[IntegralDivide]("div"), expression[Remainder]("%"), // aggregate functions @@ -299,15 +300,6 @@ object FunctionRegistry { expression[CollectList]("collect_list"), expression[CollectSet]("collect_set"), expression[CountMinSketchAgg]("count_min_sketch"), - expression[RegrCount]("regr_count"), - expression[RegrSXX]("regr_sxx"), - expression[RegrSYY]("regr_syy"), - expression[RegrAvgX]("regr_avgx"), - expression[RegrAvgY]("regr_avgy"), - expression[RegrSXY]("regr_sxy"), - expression[RegrSlope]("regr_slope"), - expression[RegrR2]("regr_r2"), - expression[RegrIntercept]("regr_intercept"), // string functions expression[Ascii]("ascii"), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala index 288b6358fbff..72ac80e0a0a1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala @@ -60,7 +60,7 @@ object TypeCoercion { IfCoercion :: StackCoercion :: Division :: - new ImplicitTypeCasts(conf) :: + ImplicitTypeCasts :: DateTimeOperations :: WindowFrameCoercion :: Nil @@ -841,33 +841,12 @@ object TypeCoercion { /** * Casts types according to the expected input types for [[Expression]]s. */ - class ImplicitTypeCasts(conf: SQLConf) extends TypeCoercionRule { - - private def rejectTzInString = conf.getConf(SQLConf.REJECT_TIMEZONE_IN_STRING) - + object ImplicitTypeCasts extends TypeCoercionRule { override protected def coerceTypes( plan: LogicalPlan): LogicalPlan = plan resolveExpressions { // Skip nodes who's children have not been resolved yet. case e if !e.childrenResolved => e - // Special rules for `from/to_utc_timestamp`. These 2 functions assume the input timestamp - // string is in a specific timezone, so the string itself should not contain timezone. 
- // TODO: We should move the type coercion logic to expressions instead of a central - // place to put all the rules. - case e: FromUTCTimestamp if e.left.dataType == StringType => - if (rejectTzInString) { - e.copy(left = StringToTimestampWithoutTimezone(e.left)) - } else { - e.copy(left = Cast(e.left, TimestampType)) - } - - case e: ToUTCTimestamp if e.left.dataType == StringType => - if (rejectTzInString) { - e.copy(left = StringToTimestampWithoutTimezone(e.left)) - } else { - e.copy(left = Cast(e.left, TimestampType)) - } - case b @ BinaryOperator(left, right) if left.dataType != right.dataType => findTightestCommonType(left.dataType, right.dataType).map { commonType => if (b.inputType.acceptsType(commonType)) { @@ -884,7 +863,7 @@ object TypeCoercion { case e: ImplicitCastInputTypes if e.inputTypes.nonEmpty => val children: Seq[Expression] = e.children.zip(e.inputTypes).map { case (in, expected) => // If we cannot do the implicit cast, just use the original input. - ImplicitTypeCasts.implicitCast(in, expected).getOrElse(in) + implicitCast(in, expected).getOrElse(in) } e.withNewChildren(children) @@ -900,9 +879,6 @@ object TypeCoercion { } e.withNewChildren(children) } - } - - object ImplicitTypeCasts { /** * Given an expected data type, try to cast the expression and return the cast expression. @@ -974,6 +950,25 @@ object TypeCoercion { if !Cast.forceNullable(fromType, toType) => implicitCast(fromType, toType).map(ArrayType(_, false)).orNull + // Implicit cast between Map types. + // Follows the same semantics of implicit casting between two array types. + // Refer to documentation above. Make sure that both key and values + // can not be null after the implicit cast operation by calling forceNullable + // method. + case (MapType(fromKeyType, fromValueType, fn), MapType(toKeyType, toValueType, tn)) + if !Cast.forceNullable(fromKeyType, toKeyType) && Cast.resolvableNullability(fn, tn) => + if (Cast.forceNullable(fromValueType, toValueType) && !tn) { + null + } else { + val newKeyType = implicitCast(fromKeyType, toKeyType).orNull + val newValueType = implicitCast(fromValueType, toValueType).orNull + if (newKeyType != null && newValueType != null) { + MapType(newKeyType, newValueType, tn) + } else { + null + } + } + case _ => null } Option(ret) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/higherOrderFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/higherOrderFunctions.scala index dd08190e1e8a..a8a7bbd9f9cd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/higherOrderFunctions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/higherOrderFunctions.scala @@ -73,7 +73,9 @@ case class ResolveLambdaVariables(conf: SQLConf) extends Rule[LogicalPlan] { private val canonicalizer = { if (!conf.caseSensitiveAnalysis) { + // scalastyle:off caselocale s: String => s.toLowerCase + // scalastyle:on caselocale } else { s: String => s } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index afb0f009db05..c11b44421294 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -701,6 +701,7 @@ class SessionCatalog( val metadata = externalCatalog.getTable(db, table) if (metadata.tableType == 
CatalogTableType.VIEW) { val viewText = metadata.viewText.getOrElse(sys.error("Invalid view without text.")) + logDebug(s"'$viewText' will be used for the view($table).") // The relation is a view, so we wrap the relation by: // 1. Add a [[View]] operator over the relation to keep track of the view desc; // 2. Wrap the logical plan in a [[SubqueryAlias]] which tracks the name of the view. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index 30ded13410f7..817abebd72ac 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -244,7 +244,8 @@ case class CatalogTable( unsupportedFeatures: Seq[String] = Seq.empty, tracksPartitionsInCatalog: Boolean = false, schemaPreservesCase: Boolean = true, - ignoredProperties: Map[String, String] = Map.empty) { + ignoredProperties: Map[String, String] = Map.empty, + viewOriginalText: Option[String] = None) { import CatalogTable._ @@ -331,6 +332,7 @@ case class CatalogTable( comment.foreach(map.put("Comment", _)) if (tableType == CatalogTableType.VIEW) { viewText.foreach(map.put("View Text", _)) + viewOriginalText.foreach(map.put("View Original Text", _)) viewDefaultDatabase.foreach(map.put("View Default Database", _)) if (viewQueryColumnNames.nonEmpty) { map.put("View Query Output Columns", viewQueryColumnNames.mkString("[", ", ", "]")) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index d3ccd18d0245..176ea823b1fc 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -72,6 +72,7 @@ package object dsl { def - (other: Expression): Expression = Subtract(expr, other) def * (other: Expression): Expression = Multiply(expr, other) def / (other: Expression): Expression = Divide(expr, other) + def div (other: Expression): Expression = IntegralDivide(expr, other) def % (other: Expression): Expression = Remainder(expr, other) def & (other: Expression): Expression = BitwiseAnd(expr, other) def | (other: Expression): Expression = BitwiseOr(expr, other) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AttributeSet.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AttributeSet.scala index 7420b6b57d8e..a7e09eee617e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AttributeSet.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AttributeSet.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.catalyst.expressions +import scala.collection.mutable + protected class AttributeEquals(val a: Attribute) { override def hashCode(): Int = a match { @@ -39,10 +41,13 @@ object AttributeSet { /** Constructs a new [[AttributeSet]] given a sequence of [[Expression Expressions]]. */ def apply(baseSet: Iterable[Expression]): AttributeSet = { - new AttributeSet( - baseSet - .flatMap(_.references) - .map(new AttributeEquals(_)).toSet) + fromAttributeSets(baseSet.map(_.references)) + } + + /** Constructs a new [[AttributeSet]] given a sequence of [[AttributeSet]]s. 
*/ + def fromAttributeSets(sets: Iterable[AttributeSet]): AttributeSet = { + val baseSet = sets.foldLeft(new mutable.LinkedHashSet[AttributeEquals]())( _ ++= _.baseSet) + new AttributeSet(baseSet.toSet) } } @@ -94,8 +99,14 @@ class AttributeSet private (val baseSet: Set[AttributeEquals]) * Returns a new [[AttributeSet]] that does not contain any of the [[Attribute Attributes]] found * in `other`. */ - def --(other: Traversable[NamedExpression]): AttributeSet = - new AttributeSet(baseSet -- other.map(a => new AttributeEquals(a.toAttribute))) + def --(other: Traversable[NamedExpression]): AttributeSet = { + other match { + case otherSet: AttributeSet => + new AttributeSet(baseSet -- otherSet.baseSet) + case _ => + new AttributeSet(baseSet -- other.map(a => new AttributeEquals(a.toAttribute))) + } + } /** * Returns a new [[AttributeSet]] that contains all of the [[Attribute Attributes]] found diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index 005350350104..ee463bf5eb6a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -183,7 +183,7 @@ object Cast { case _ => false } - private def resolvableNullability(from: Boolean, to: Boolean) = !from || to + def resolvableNullability(from: Boolean, to: Boolean): Boolean = !from || to } /** @@ -924,27 +924,36 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String } private[this] def changePrecision(d: ExprValue, decimalType: DecimalType, - evPrim: ExprValue, evNull: ExprValue): Block = - code""" - if ($d.changePrecision(${decimalType.precision}, ${decimalType.scale})) { - $evPrim = $d; - } else { - $evNull = true; - } - """ + evPrim: ExprValue, evNull: ExprValue, canNullSafeCast: Boolean): Block = { + if (canNullSafeCast) { + code""" + |$d.changePrecision(${decimalType.precision}, ${decimalType.scale}); + |$evPrim = $d; + """.stripMargin + } else { + code""" + |if ($d.changePrecision(${decimalType.precision}, ${decimalType.scale})) { + | $evPrim = $d; + |} else { + | $evNull = true; + |} + """.stripMargin + } + } private[this] def castToDecimalCode( from: DataType, target: DecimalType, ctx: CodegenContext): CastFunction = { val tmp = ctx.freshVariable("tmpDecimal", classOf[Decimal]) + val canNullSafeCast = Cast.canNullSafeCastToDecimal(from, target) from match { case StringType => (c, evPrim, evNull) => code""" try { Decimal $tmp = Decimal.apply(new java.math.BigDecimal($c.toString())); - ${changePrecision(tmp, target, evPrim, evNull)} + ${changePrecision(tmp, target, evPrim, evNull, canNullSafeCast)} } catch (java.lang.NumberFormatException e) { $evNull = true; } @@ -953,7 +962,7 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String (c, evPrim, evNull) => code""" Decimal $tmp = $c ? 
Decimal.apply(1) : Decimal.apply(0); - ${changePrecision(tmp, target, evPrim, evNull)} + ${changePrecision(tmp, target, evPrim, evNull, canNullSafeCast)} """ case DateType => // date can't cast to decimal in Hive @@ -964,19 +973,19 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String code""" Decimal $tmp = Decimal.apply( scala.math.BigDecimal.valueOf(${timestampToDoubleCode(c)})); - ${changePrecision(tmp, target, evPrim, evNull)} + ${changePrecision(tmp, target, evPrim, evNull, canNullSafeCast)} """ case DecimalType() => (c, evPrim, evNull) => code""" Decimal $tmp = $c.clone(); - ${changePrecision(tmp, target, evPrim, evNull)} + ${changePrecision(tmp, target, evPrim, evNull, canNullSafeCast)} """ case x: IntegralType => (c, evPrim, evNull) => code""" Decimal $tmp = Decimal.apply((long) $c); - ${changePrecision(tmp, target, evPrim, evNull)} + ${changePrecision(tmp, target, evPrim, evNull, canNullSafeCast)} """ case x: FractionalType => // All other numeric types can be represented precisely as Doubles @@ -984,7 +993,7 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String code""" try { Decimal $tmp = Decimal.apply(scala.math.BigDecimal.valueOf((double) $c)); - ${changePrecision(tmp, target, evPrim, evNull)} + ${changePrecision(tmp, target, evPrim, evNull, canNullSafeCast)} } catch (java.lang.NumberFormatException e) { $evNull = true; } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala index 773aefc0ac1f..c215735ab1c9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala @@ -85,7 +85,7 @@ abstract class Expression extends TreeNode[Expression] { def nullable: Boolean - def references: AttributeSet = AttributeSet(children.flatMap(_.references.iterator)) + def references: AttributeSet = AttributeSet.fromAttributeSets(children.map(_.references)) /** Returns the result of evaluating this expression on a given input Row */ def eval(input: InternalRow = null): Any diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/InterpretedMutableProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/InterpretedMutableProjection.scala new file mode 100644 index 000000000000..0654108cea28 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/InterpretedMutableProjection.scala @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.expressions + +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.aggregate.NoOp + + +/** + * A [[MutableProjection]] that is calculated by calling `eval` on each of the specified + * expressions. + * + * @param expressions a sequence of expressions that determine the value of each column of the + * output row. + */ +class InterpretedMutableProjection(expressions: Seq[Expression]) extends MutableProjection { + def this(expressions: Seq[Expression], inputSchema: Seq[Attribute]) = + this(toBoundExprs(expressions, inputSchema)) + + private[this] val buffer = new Array[Any](expressions.size) + + override def initialize(partitionIndex: Int): Unit = { + expressions.foreach(_.foreach { + case n: Nondeterministic => n.initialize(partitionIndex) + case _ => + }) + } + + private[this] val validExprs = expressions.zipWithIndex.filter { + case (NoOp, _) => false + case _ => true + } + private[this] var mutableRow: InternalRow = new GenericInternalRow(expressions.size) + def currentValue: InternalRow = mutableRow + + override def target(row: InternalRow): MutableProjection = { + mutableRow = row + this + } + + override def apply(input: InternalRow): InternalRow = { + var i = 0 + while (i < validExprs.length) { + val (expr, ordinal) = validExprs(i) + // Store the result into buffer first, to make the projection atomic (needed by aggregation) + buffer(ordinal) = expr.eval(input) + i += 1 + } + i = 0 + while (i < validExprs.length) { + val (_, ordinal) = validExprs(i) + mutableRow(ordinal) = buffer(ordinal) + i += 1 + } + mutableRow + } +} + +/** + * Helper functions for creating an [[InterpretedMutableProjection]]. + */ +object InterpretedMutableProjection { + + /** + * Returns a [[MutableProjection]] for given sequence of bound Expressions. + */ + def createProjection(exprs: Seq[Expression]): MutableProjection = { + // We need to make sure that we do not reuse stateful expressions. + val cleanedExpressions = exprs.map(_.transform { + case s: Stateful => s.freshCopy() + }) + new InterpretedMutableProjection(cleanedExpressions) + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala index 226a4ddcffaa..792646cf9f10 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala @@ -17,10 +17,9 @@ package org.apache.spark.sql.catalyst.expressions -import scala.util.control.NonFatal - import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.codegen.{GenerateSafeProjection, GenerateUnsafeProjection} +import org.apache.spark.sql.catalyst.expressions.codegen.{GenerateMutableProjection, GenerateSafeProjection, GenerateUnsafeProjection} +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{DataType, StructType} /** @@ -57,47 +56,50 @@ class InterpretedProjection(expressions: Seq[Expression]) extends Projection { } /** - * A [[MutableProjection]] that is calculated by calling `eval` on each of the specified - * expressions. + * Converts a [[InternalRow]] to another Row given a sequence of expression that define each + * column of the new row. If the schema of the input row is specified, then the given expression + * will be bound to that schema. 
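Editor's note on the two-pass `apply` in InterpretedMutableProjection above (results are staged in `buffer`, then copied into `mutableRow`): this is what keeps the projection atomic when the target row is also the input row, as happens with aggregation buffers. A small hedged sketch, with invented expressions and values, of the behaviour that staging guarantees:

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.types.IntegerType

// Output column 0 overwrites input column 0, while output column 1 still needs the old value.
val exprs = Seq(
  Add(BoundReference(0, IntegerType, nullable = false), Literal(1)),   // col0 := col0 + 1
  BoundReference(0, IntegerType, nullable = false))                    // col1 := old col0
val row = new GenericInternalRow(Array[Any](10, 0))

val proj = new InterpretedMutableProjection(exprs).target(row)
proj(row)
// Because every expression is evaluated into `buffer` before anything is written back,
// col1 observes the original 10 rather than the freshly written 11: row is now [11, 10].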
* - * @param expressions a sequence of expressions that determine the value of each column of the - * output row. + * In contrast to a normal projection, a MutableProjection reuses the same underlying row object + * each time an input row is added. This significantly reduces the cost of calculating the + * projection, but means that it is not safe to hold on to a reference to a [[InternalRow]] after + * `next()` has been called on the [[Iterator]] that produced it. Instead, the user must call + * `InternalRow.copy()` and hold on to the returned [[InternalRow]] before calling `next()`. */ -case class InterpretedMutableProjection(expressions: Seq[Expression]) extends MutableProjection { - def this(expressions: Seq[Expression], inputSchema: Seq[Attribute]) = - this(expressions.map(BindReferences.bindReference(_, inputSchema))) +abstract class MutableProjection extends Projection { + def currentValue: InternalRow - private[this] val buffer = new Array[Any](expressions.size) + /** Uses the given row to store the output of the projection. */ + def target(row: InternalRow): MutableProjection +} - override def initialize(partitionIndex: Int): Unit = { - expressions.foreach(_.foreach { - case n: Nondeterministic => n.initialize(partitionIndex) - case _ => - }) +/** + * The factory object for `MutableProjection`. + */ +object MutableProjection + extends CodeGeneratorWithInterpretedFallback[Seq[Expression], MutableProjection] { + + override protected def createCodeGeneratedObject(in: Seq[Expression]): MutableProjection = { + GenerateMutableProjection.generate(in, SQLConf.get.subexpressionEliminationEnabled) } - private[this] val exprArray = expressions.toArray - private[this] var mutableRow: InternalRow = new GenericInternalRow(exprArray.length) - def currentValue: InternalRow = mutableRow + override protected def createInterpretedObject(in: Seq[Expression]): MutableProjection = { + InterpretedMutableProjection.createProjection(in) + } - override def target(row: InternalRow): MutableProjection = { - mutableRow = row - this + /** + * Returns an MutableProjection for given sequence of bound Expressions. + */ + def create(exprs: Seq[Expression]): MutableProjection = { + createObject(exprs) } - override def apply(input: InternalRow): InternalRow = { - var i = 0 - while (i < exprArray.length) { - // Store the result into buffer first, to make the projection atomic (needed by aggregation) - buffer(i) = exprArray(i).eval(input) - i += 1 - } - i = 0 - while (i < exprArray.length) { - mutableRow(i) = buffer(i) - i += 1 - } - mutableRow + /** + * Returns an MutableProjection for given sequence of Expressions, which will be bound to + * `inputSchema`. 
+ */ + def create(exprs: Seq[Expression], inputSchema: Seq[Attribute]): MutableProjection = { + create(toBoundExprs(exprs, inputSchema)) } } @@ -117,19 +119,13 @@ object UnsafeProjection extends CodeGeneratorWithInterpretedFallback[Seq[Expression], UnsafeProjection] { override protected def createCodeGeneratedObject(in: Seq[Expression]): UnsafeProjection = { - GenerateUnsafeProjection.generate(in) + GenerateUnsafeProjection.generate(in, SQLConf.get.subexpressionEliminationEnabled) } override protected def createInterpretedObject(in: Seq[Expression]): UnsafeProjection = { InterpretedUnsafeProjection.createProjection(in) } - protected def toBoundExprs( - exprs: Seq[Expression], - inputSchema: Seq[Attribute]): Seq[Expression] = { - exprs.map(BindReferences.bindReference(_, inputSchema)) - } - protected def toUnsafeExprs(exprs: Seq[Expression]): Seq[Expression] = { exprs.map(_ transform { case CreateNamedStruct(children) => CreateNamedStructUnsafe(children) @@ -168,26 +164,6 @@ object UnsafeProjection def create(exprs: Seq[Expression], inputSchema: Seq[Attribute]): UnsafeProjection = { create(toBoundExprs(exprs, inputSchema)) } - - /** - * Same as other create()'s but allowing enabling/disabling subexpression elimination. - * The param `subexpressionEliminationEnabled` doesn't guarantee to work. For example, - * when fallbacking to interpreted execution, it is not supported. - */ - def create( - exprs: Seq[Expression], - inputSchema: Seq[Attribute], - subexpressionEliminationEnabled: Boolean): UnsafeProjection = { - val unsafeExprs = toUnsafeExprs(toBoundExprs(exprs, inputSchema)) - try { - GenerateUnsafeProjection.generate(unsafeExprs, subexpressionEliminationEnabled) - } catch { - case NonFatal(_) => - // We should have already seen the error message in `CodeGenerator` - logWarning("Expr codegen error and falling back to interpreter mode") - InterpretedUnsafeProjection.createProjection(unsafeExprs) - } - } } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/regression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/regression.scala deleted file mode 100644 index d8f4505588ff..000000000000 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/regression.scala +++ /dev/null @@ -1,190 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.catalyst.expressions.aggregate - -import org.apache.spark.sql.catalyst.dsl.expressions._ -import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.types.{AbstractDataType, DoubleType} - -/** - * Base trait for all regression functions. 
- */ -trait RegrLike extends AggregateFunction with ImplicitCastInputTypes { - def y: Expression - def x: Expression - - override def children: Seq[Expression] = Seq(y, x) - override def inputTypes: Seq[AbstractDataType] = Seq(DoubleType, DoubleType) - - protected def updateIfNotNull(exprs: Seq[Expression]): Seq[Expression] = { - assert(aggBufferAttributes.length == exprs.length) - val nullableChildren = children.filter(_.nullable) - if (nullableChildren.isEmpty) { - exprs - } else { - exprs.zip(aggBufferAttributes).map { case (e, a) => - If(nullableChildren.map(IsNull).reduce(Or), a, e) - } - } - } -} - - -@ExpressionDescription( - usage = "_FUNC_(y, x) - Returns the number of non-null pairs.", - since = "2.4.0") -case class RegrCount(y: Expression, x: Expression) - extends CountLike with RegrLike { - - override lazy val updateExpressions: Seq[Expression] = updateIfNotNull(Seq(count + 1L)) - - override def prettyName: String = "regr_count" -} - - -@ExpressionDescription( - usage = "_FUNC_(y, x) - Returns SUM(x*x)-SUM(x)*SUM(x)/N. Any pair with a NULL is ignored.", - since = "2.4.0") -case class RegrSXX(y: Expression, x: Expression) - extends CentralMomentAgg(x) with RegrLike { - - override protected def momentOrder = 2 - - override lazy val updateExpressions: Seq[Expression] = updateIfNotNull(updateExpressionsDef) - - override val evaluateExpression: Expression = { - If(n === Literal(0.0), Literal.create(null, DoubleType), m2) - } - - override def prettyName: String = "regr_sxx" -} - - -@ExpressionDescription( - usage = "_FUNC_(y, x) - Returns SUM(y*y)-SUM(y)*SUM(y)/N. Any pair with a NULL is ignored.", - since = "2.4.0") -case class RegrSYY(y: Expression, x: Expression) - extends CentralMomentAgg(y) with RegrLike { - - override protected def momentOrder = 2 - - override lazy val updateExpressions: Seq[Expression] = updateIfNotNull(updateExpressionsDef) - - override val evaluateExpression: Expression = { - If(n === Literal(0.0), Literal.create(null, DoubleType), m2) - } - - override def prettyName: String = "regr_syy" -} - - -@ExpressionDescription( - usage = "_FUNC_(y, x) - Returns the average of x. Any pair with a NULL is ignored.", - since = "2.4.0") -case class RegrAvgX(y: Expression, x: Expression) - extends AverageLike(x) with RegrLike { - - override lazy val updateExpressions: Seq[Expression] = updateIfNotNull(updateExpressionsDef) - - override def prettyName: String = "regr_avgx" -} - - -@ExpressionDescription( - usage = "_FUNC_(y, x) - Returns the average of y. Any pair with a NULL is ignored.", - since = "2.4.0") -case class RegrAvgY(y: Expression, x: Expression) - extends AverageLike(y) with RegrLike { - - override lazy val updateExpressions: Seq[Expression] = updateIfNotNull(updateExpressionsDef) - - override def prettyName: String = "regr_avgy" -} - -// scalastyle:off line.size.limit -@ExpressionDescription( - usage = "_FUNC_(y, x) - Returns the covariance of y and x multiplied for the number of items in the dataset. 
Any pair with a NULL is ignored.", - since = "2.4.0") -// scalastyle:on line.size.limit -case class RegrSXY(y: Expression, x: Expression) - extends Covariance(y, x) with RegrLike { - - override lazy val updateExpressions: Seq[Expression] = updateIfNotNull(updateExpressionsDef) - - override val evaluateExpression: Expression = { - If(n === Literal(0.0), Literal.create(null, DoubleType), ck) - } - - override def prettyName: String = "regr_sxy" -} - - -// scalastyle:off line.size.limit -@ExpressionDescription( - usage = "_FUNC_(y, x) - Returns the slope of the linear regression line. Any pair with a NULL is ignored.", - since = "2.4.0") -// scalastyle:on line.size.limit -case class RegrSlope(y: Expression, x: Expression) - extends PearsonCorrelation(y, x) with RegrLike { - - override lazy val updateExpressions: Seq[Expression] = updateIfNotNull(updateExpressionsDef) - - override val evaluateExpression: Expression = { - If(n < Literal(2.0) || yMk === Literal(0.0), Literal.create(null, DoubleType), ck / yMk) - } - - override def prettyName: String = "regr_slope" -} - - -// scalastyle:off line.size.limit -@ExpressionDescription( - usage = "_FUNC_(y, x) - Returns the coefficient of determination (also called R-squared or goodness of fit) for the regression line. Any pair with a NULL is ignored.", - since = "2.4.0") -// scalastyle:on line.size.limit -case class RegrR2(y: Expression, x: Expression) - extends PearsonCorrelation(y, x) with RegrLike { - - override lazy val updateExpressions: Seq[Expression] = updateIfNotNull(updateExpressionsDef) - - override val evaluateExpression: Expression = { - If(n < Literal(2.0) || yMk === Literal(0.0), Literal.create(null, DoubleType), - If(xMk === Literal(0.0), Literal(1.0), ck * ck / yMk / xMk)) - } - - override def prettyName: String = "regr_r2" -} - - -// scalastyle:off line.size.limit -@ExpressionDescription( - usage = "_FUNC_(y, x) - Returns the y-intercept of the linear regression line. 
Any pair with a NULL is ignored.", - since = "2.4.0") -// scalastyle:on line.size.limit -case class RegrIntercept(y: Expression, x: Expression) - extends PearsonCorrelation(y, x) with RegrLike { - - override lazy val updateExpressions: Seq[Expression] = updateIfNotNull(updateExpressionsDef) - - override val evaluateExpression: Expression = { - If(n === Literal(0.0) || yMk === Literal(0.0), Literal.create(null, DoubleType), - xAvg - (ck / yMk) * yAvg) - } - - override def prettyName: String = "regr_intercept" -} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala index c827226d5842..22b29c3000c1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala @@ -22,6 +22,7 @@ import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, TypeCoercion} import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.expressions.codegen.Block._ import org.apache.spark.sql.catalyst.util.TypeUtils +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.CalendarInterval @@ -314,6 +315,42 @@ case class Divide(left: Expression, right: Expression) extends DivModLike { override def evalOperation(left: Any, right: Any): Any = div(left, right) } +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "expr1 _FUNC_ expr2 - Divide `expr1` by `expr2` rounded to the long integer. It returns NULL if an operand is NULL or `expr2` is 0.", + examples = """ + Examples: + > SELECT 3 _FUNC_ 2; + 1 + """, + since = "3.0.0") +// scalastyle:on line.size.limit +case class IntegralDivide(left: Expression, right: Expression) extends DivModLike { + + override def inputType: AbstractDataType = IntegralType + override def dataType: DataType = if (SQLConf.get.integralDivideReturnLong) { + LongType + } else { + left.dataType + } + + override def symbol: String = "/" + override def sqlOperator: String = "div" + + private lazy val div: (Any, Any) => Any = left.dataType match { + case i: IntegralType => + val divide = i.integral.asInstanceOf[Integral[Any]].quot _ + if (SQLConf.get.integralDivideReturnLong) { + val toLong = i.integral.asInstanceOf[Integral[Any]].toLong _ + (x, y) => toLong(divide(x, y)) + } else { + divide + } + } + + override def evalOperation(left: Any, right: Any): Any = div(left, right) +} + @ExpressionDescription( usage = "expr1 _FUNC_ expr2 - Returns the remainder after `expr1`/`expr2`.", examples = """ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala index 33d14329ec95..d588e7f08130 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala @@ -44,6 +44,10 @@ object GenerateMutableProjection extends CodeGenerator[Seq[Expression], MutableP create(canonicalize(bind(expressions, inputSchema)), useSubexprElimination) } + def generate(expressions: Seq[Expression], useSubexprElimination: Boolean): MutableProjection = { + create(canonicalize(expressions), useSubexprElimination) + } + 
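Editor's note: the IntegralDivide expression introduced a little further up is what the new `div` keyword registered in FunctionRegistry resolves to. A hedged usage sketch (assumes a running SparkSession named `spark`; whether the result keeps the operands' type or is widened to a long depends on the integralDivideReturnLong flag referenced in the implementation):

// `div` truncates like integer division; `/` on the same operands would yield 3.5
// because Divide always produces a fractional result.
spark.sql("SELECT 7 div 2 AS q").collect()    // Array([3])
spark.sql("SELECT 3 div 2, 3 % 2").collect()  // Array([1,1])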
protected def create(expressions: Seq[Expression]): MutableProjection = { create(expressions, false) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala index ea6fcccddfd4..b24d7486f345 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala @@ -159,9 +159,9 @@ case class MapKeys(child: Expression) examples = """ Examples: > SELECT _FUNC_(array(1, 2, 3), array(2, 3, 4)); - [[1, 2], [2, 3], [3, 4]] + [{"0":1,"1":2},{"0":2,"1":3},{"0":3,"1":4}] > SELECT _FUNC_(array(1, 2), array(2, 3), array(3, 4)); - [[1, 2, 3], [2, 3, 4]] + [{"0":1,"1":2,"2":3},{"0":2,"1":3,"2":4}] """, since = "2.4.0") case class ArraysZip(children: Seq[Expression]) extends Expression with ExpectsInputTypes { @@ -348,7 +348,7 @@ case class MapValues(child: Expression) examples = """ Examples: > SELECT _FUNC_(map(1, 'a', 2, 'b')); - [(1,"a"),(2,"b")] + [{"key":1,"value":"a"},{"key":2,"value":"b"}] """, since = "2.4.0") case class MapEntries(child: Expression) extends UnaryExpression with ExpectsInputTypes { @@ -516,7 +516,7 @@ case class MapEntries(child: Expression) extends UnaryExpression with ExpectsInp examples = """ Examples: > SELECT _FUNC_(map(1, 'a', 2, 'b'), map(2, 'c', 3, 'd')); - [[1 -> "a"], [2 -> "b"], [2 -> "c"], [3 -> "d"]] + {1:"a",2:"c",3:"d"} """, since = "2.4.0") case class MapConcat(children: Seq[Expression]) extends ComplexTypeMergingExpression { @@ -1171,9 +1171,9 @@ case class ArraySort(child: Expression) extends UnaryExpression with ArraySortLi examples = """ Examples: > SELECT _FUNC_(array(1, 20, 3, 5)); - [3, 1, 5, 20] + [3,1,5,20] > SELECT _FUNC_(array(1, 20, null, 3)); - [20, null, 3, 1] + [20,null,3,1] """, note = "The function is non-deterministic.", since = "2.4.0") @@ -1256,7 +1256,7 @@ case class Shuffle(child: Expression, randomSeed: Option[Long] = None) > SELECT _FUNC_('Spark SQL'); LQS krapS > SELECT _FUNC_(array(2, 1, 4, 3)); - [3, 4, 1, 2] + [3,4,1,2] """, since = "1.5.0", note = "Reverse logic for arrays is available since 2.4.0." 
@@ -1268,11 +1268,15 @@ case class Reverse(child: Expression) extends UnaryExpression with ImplicitCastI override def dataType: DataType = child.dataType - @transient private lazy val elementType: DataType = dataType.asInstanceOf[ArrayType].elementType + override def nullSafeEval(input: Any): Any = doReverse(input) - override def nullSafeEval(input: Any): Any = input match { - case a: ArrayData => new GenericArrayData(a.toObjectArray(elementType).reverse) - case s: UTF8String => s.reverse() + @transient private lazy val doReverse: Any => Any = dataType match { + case ArrayType(elementType, _) => + input => { + val arrayData = input.asInstanceOf[ArrayData] + new GenericArrayData(arrayData.toObjectArray(elementType).reverse) + } + case StringType => _.asInstanceOf[UTF8String].reverse() } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { @@ -1294,6 +1298,7 @@ case class Reverse(child: Expression) extends UnaryExpression with ImplicitCastI val i = ctx.freshName("i") val j = ctx.freshName("j") + val elementType = dataType.asInstanceOf[ArrayType].elementType val initialization = CodeGenerator.createArrayData( arrayData, elementType, numElements, s" $prettyName failed.") val assignment = CodeGenerator.createArrayAssignment( @@ -1331,23 +1336,27 @@ case class ArrayContains(left: Expression, right: Expression) @transient private lazy val ordering: Ordering[Any] = TypeUtils.getInterpretedOrdering(right.dataType) - override def inputTypes: Seq[AbstractDataType] = right.dataType match { - case NullType => Seq.empty - case _ => left.dataType match { - case n @ ArrayType(element, _) => Seq(n, element) + override def inputTypes: Seq[AbstractDataType] = { + (left.dataType, right.dataType) match { + case (_, NullType) => Seq.empty + case (ArrayType(e1, hasNull), e2) => + TypeCoercion.findTightestCommonType(e1, e2) match { + case Some(dt) => Seq(ArrayType(dt, hasNull), dt) + case _ => Seq.empty + } case _ => Seq.empty } } override def checkInputDataTypes(): TypeCheckResult = { - if (right.dataType == NullType) { - TypeCheckResult.TypeCheckFailure("Null typed values cannot be used as arguments") - } else if (!left.dataType.isInstanceOf[ArrayType] - || !left.dataType.asInstanceOf[ArrayType].elementType.sameType(right.dataType)) { - TypeCheckResult.TypeCheckFailure( - "Arguments must be an array followed by a value of same type as the array members") - } else { - TypeUtils.checkForOrderingExpr(right.dataType, s"function $prettyName") + (left.dataType, right.dataType) match { + case (_, NullType) => + TypeCheckResult.TypeCheckFailure("Null typed values cannot be used as arguments") + case (ArrayType(e1, _), e2) if e1.sameType(e2) => + TypeUtils.checkForOrderingExpr(e2, s"function $prettyName") + case _ => TypeCheckResult.TypeCheckFailure(s"Input to function $prettyName should have " + + s"been ${ArrayType.simpleString} followed by a value with same element type, but it's " + + s"[${left.dataType.catalogString}, ${right.dataType.catalogString}].") } } @@ -2062,18 +2071,23 @@ case class ArrayPosition(left: Expression, right: Expression) override def dataType: DataType = LongType override def inputTypes: Seq[AbstractDataType] = { - val elementType = left.dataType match { - case t: ArrayType => t.elementType - case _ => AnyDataType + (left.dataType, right.dataType) match { + case (ArrayType(e1, hasNull), e2) => + TypeCoercion.findTightestCommonType(e1, e2) match { + case Some(dt) => Seq(ArrayType(dt, hasNull), dt) + case _ => Seq.empty + } + case _ => Seq.empty } - Seq(ArrayType, elementType) 
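Editor's note: the practical effect of the findTightestCommonType-based inputTypes for array_contains and array_position is that compatible-but-unequal operand types are now widened on either side instead of being rejected during analysis. A hedged sketch (assumes a SparkSession named `spark`):

// The int array is widened to array<bigint> so it can be compared with the BIGINT
// probe value 2L; previously this mix was rejected by the type check.
spark.sql("SELECT array_contains(array(1, 2, 3), 2L)").collect()   // Array([true])
spark.sql("SELECT array_position(array(1, 2, 3), 2L)").collect()   // Array([2])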
} override def checkInputDataTypes(): TypeCheckResult = { - super.checkInputDataTypes() match { - case f: TypeCheckResult.TypeCheckFailure => f - case TypeCheckResult.TypeCheckSuccess => - TypeUtils.checkForOrderingExpr(right.dataType, s"function $prettyName") + (left.dataType, right.dataType) match { + case (ArrayType(e1, _), e2) if e1.sameType(e2) => + TypeUtils.checkForOrderingExpr(e2, s"function $prettyName") + case _ => TypeCheckResult.TypeCheckFailure(s"Input to function $prettyName should have " + + s"been ${ArrayType.simpleString} followed by a value with same element type, but it's " + + s"[${left.dataType.catalogString}, ${right.dataType.catalogString}].") } } @@ -2123,7 +2137,7 @@ case class ArrayPosition(left: Expression, right: Expression) > SELECT _FUNC_(array(1, 2, 3), 2); 2 > SELECT _FUNC_(map(1, 'a', 2, 'b'), 2); - "b" + b """, since = "2.4.0") case class ElementAt(left: Expression, right: Expression) extends GetMapValueUtil { @@ -2140,29 +2154,44 @@ case class ElementAt(left: Expression, right: Expression) extends GetMapValueUti } override def inputTypes: Seq[AbstractDataType] = { - Seq(TypeCollection(ArrayType, MapType), - left.dataType match { - case _: ArrayType => IntegerType - case _: MapType => mapKeyType - case _ => AnyDataType // no match for a wrong 'left' expression type - } - ) + (left.dataType, right.dataType) match { + case (arr: ArrayType, e2: IntegralType) if (e2 != LongType) => + Seq(arr, IntegerType) + case (MapType(keyType, valueType, hasNull), e2) => + TypeCoercion.findTightestCommonType(keyType, e2) match { + case Some(dt) => Seq(MapType(dt, valueType, hasNull), dt) + case _ => Seq.empty + } + case (l, r) => Seq.empty + + } } override def checkInputDataTypes(): TypeCheckResult = { - super.checkInputDataTypes() match { - case f: TypeCheckResult.TypeCheckFailure => f - case TypeCheckResult.TypeCheckSuccess if left.dataType.isInstanceOf[MapType] => - TypeUtils.checkForOrderingExpr(mapKeyType, s"function $prettyName") - case TypeCheckResult.TypeCheckSuccess => TypeCheckResult.TypeCheckSuccess + (left.dataType, right.dataType) match { + case (_: ArrayType, e2) if e2 != IntegerType => + TypeCheckResult.TypeCheckFailure(s"Input to function $prettyName should have " + + s"been ${ArrayType.simpleString} followed by a ${IntegerType.simpleString}, but it's " + + s"[${left.dataType.catalogString}, ${right.dataType.catalogString}].") + case (MapType(e1, _, _), e2) if (!e2.sameType(e1)) => + TypeCheckResult.TypeCheckFailure(s"Input to function $prettyName should have " + + s"been ${MapType.simpleString} followed by a value of same key type, but it's " + + s"[${left.dataType.catalogString}, ${right.dataType.catalogString}].") + case (e1, _) if (!e1.isInstanceOf[MapType] && !e1.isInstanceOf[ArrayType]) => + TypeCheckResult.TypeCheckFailure(s"The first argument to function $prettyName should " + + s"have been ${ArrayType.simpleString} or ${MapType.simpleString} type, but its " + + s"${left.dataType.catalogString} type.") + case _ => TypeCheckResult.TypeCheckSuccess } } override def nullable: Boolean = true - override def nullSafeEval(value: Any, ordinal: Any): Any = { - left.dataType match { - case _: ArrayType => + override def nullSafeEval(value: Any, ordinal: Any): Any = doElementAt(value, ordinal) + + @transient private lazy val doElementAt: (Any, Any) => Any = left.dataType match { + case _: ArrayType => + (value, ordinal) => { val array = value.asInstanceOf[ArrayData] val index = ordinal.asInstanceOf[Int] if (array.numElements() < math.abs(index)) { @@ -2181,9 
+2210,9 @@ case class ElementAt(left: Expression, right: Expression) extends GetMapValueUti array.get(idx, dataType) } } - case _: MapType => - getValueEval(value, ordinal, mapKeyType, ordering) - } + } + case _: MapType => + (value, ordinal) => getValueEval(value, ordinal, mapKeyType, ordering) } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { @@ -2238,8 +2267,9 @@ case class ElementAt(left: Expression, right: Expression) extends GetMapValueUti > SELECT _FUNC_('Spark', 'SQL'); SparkSQL > SELECT _FUNC_(array(1, 2, 3), array(4, 5), array(6)); - | [1,2,3,4,5,6] - """) + [1,2,3,4,5,6] + """, + note = "Concat logic for arrays is available since 2.4.0.") case class Concat(children: Seq[Expression]) extends ComplexTypeMergingExpression { private def allowedTypes: Seq[AbstractDataType] = Seq(StringType, BinaryType, ArrayType) @@ -2273,33 +2303,41 @@ case class Concat(children: Seq[Expression]) extends ComplexTypeMergingExpressio override def foldable: Boolean = children.forall(_.foldable) - override def eval(input: InternalRow): Any = dataType match { + override def eval(input: InternalRow): Any = doConcat(input) + + @transient private lazy val doConcat: InternalRow => Any = dataType match { case BinaryType => - val inputs = children.map(_.eval(input).asInstanceOf[Array[Byte]]) - ByteArray.concat(inputs: _*) + input => { + val inputs = children.map(_.eval(input).asInstanceOf[Array[Byte]]) + ByteArray.concat(inputs: _*) + } case StringType => - val inputs = children.map(_.eval(input).asInstanceOf[UTF8String]) - UTF8String.concat(inputs : _*) + input => { + val inputs = children.map(_.eval(input).asInstanceOf[UTF8String]) + UTF8String.concat(inputs: _*) + } case ArrayType(elementType, _) => - val inputs = children.toStream.map(_.eval(input)) - if (inputs.contains(null)) { - null - } else { - val arrayData = inputs.map(_.asInstanceOf[ArrayData]) - val numberOfElements = arrayData.foldLeft(0L)((sum, ad) => sum + ad.numElements()) - if (numberOfElements > ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH) { - throw new RuntimeException(s"Unsuccessful try to concat arrays with $numberOfElements" + - " elements due to exceeding the array size limit " + - ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH + ".") - } - val finalData = new Array[AnyRef](numberOfElements.toInt) - var position = 0 - for(ad <- arrayData) { - val arr = ad.toObjectArray(elementType) - Array.copy(arr, 0, finalData, position, arr.length) - position += arr.length + input => { + val inputs = children.toStream.map(_.eval(input)) + if (inputs.contains(null)) { + null + } else { + val arrayData = inputs.map(_.asInstanceOf[ArrayData]) + val numberOfElements = arrayData.foldLeft(0L)((sum, ad) => sum + ad.numElements()) + if (numberOfElements > ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH) { + throw new RuntimeException(s"Unsuccessful try to concat arrays with $numberOfElements" + + " elements due to exceeding the array size limit " + + ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH + ".") + } + val finalData = new Array[AnyRef](numberOfElements.toInt) + var position = 0 + for (ad <- arrayData) { + val arr = ad.toObjectArray(elementType) + Array.copy(arr, 0, finalData, position, arr.length) + position += arr.length + } + new GenericArrayData(finalData) } - new GenericArrayData(finalData) } } @@ -2427,7 +2465,7 @@ case class Concat(children: Seq[Expression]) extends ComplexTypeMergingExpressio usage = "_FUNC_(arrayOfArrays) - Transforms an array of arrays into a single array.", examples = """ Examples: - > SELECT _FUNC_(array(array(1, 2), 
array(3, 4)); + > SELECT _FUNC_(array(array(1, 2), array(3, 4))); [1,2,3,4] """, since = "2.4.0") @@ -2556,11 +2594,11 @@ case class Flatten(child: Expression) extends UnaryExpression { examples = """ Examples: > SELECT _FUNC_(1, 5); - [1, 2, 3, 4, 5] + [1,2,3,4,5] > SELECT _FUNC_(5, 1); - [5, 4, 3, 2, 1] + [5,4,3,2,1] > SELECT _FUNC_(to_date('2018-01-01'), to_date('2018-03-01'), interval 1 month); - [2018-01-01, 2018-02-01, 2018-03-01] + [2018-01-01,2018-02-01,2018-03-01] """, since = "2.4.0" ) @@ -2934,7 +2972,7 @@ object Sequence { examples = """ Examples: > SELECT _FUNC_('123', 2); - ['123', '123'] + ["123","123"] """, since = "2.4.0") case class ArrayRepeat(left: Expression, right: Expression) @@ -3063,11 +3101,24 @@ case class ArrayRemove(left: Expression, right: Expression) override def dataType: DataType = left.dataType override def inputTypes: Seq[AbstractDataType] = { - val elementType = left.dataType match { - case t: ArrayType => t.elementType - case _ => AnyDataType + (left.dataType, right.dataType) match { + case (ArrayType(e1, hasNull), e2) => + TypeCoercion.findTightestCommonType(e1, e2) match { + case Some(dt) => Seq(ArrayType(dt, hasNull), dt) + case _ => Seq.empty + } + case _ => Seq.empty + } + } + + override def checkInputDataTypes(): TypeCheckResult = { + (left.dataType, right.dataType) match { + case (ArrayType(e1, _), e2) if e1.sameType(e2) => + TypeUtils.checkForOrderingExpr(e2, s"function $prettyName") + case _ => TypeCheckResult.TypeCheckFailure(s"Input to function $prettyName should have " + + s"been ${ArrayType.simpleString} followed by a value with same element type, but it's " + + s"[${left.dataType.catalogString}, ${right.dataType.catalogString}].") } - Seq(ArrayType, elementType) } private def elementType: DataType = left.dataType.asInstanceOf[ArrayType].elementType @@ -3075,14 +3126,6 @@ case class ArrayRemove(left: Expression, right: Expression) @transient private lazy val ordering: Ordering[Any] = TypeUtils.getInterpretedOrdering(right.dataType) - override def checkInputDataTypes(): TypeCheckResult = { - super.checkInputDataTypes() match { - case f: TypeCheckResult.TypeCheckFailure => f - case TypeCheckResult.TypeCheckSuccess => - TypeUtils.checkForOrderingExpr(right.dataType, s"function $prettyName") - } - } - override def nullSafeEval(arr: Any, value: Any): Any = { val newArray = new Array[Any](arr.asInstanceOf[ArrayData].numElements()) var pos = 0 @@ -3421,7 +3464,7 @@ object ArrayBinaryLike { examples = """ Examples: > SELECT _FUNC_(array(1, 2, 3), array(1, 3, 5)); - array(1, 2, 3, 5) + [1,2,3,5] """, since = "2.4.0") case class ArrayUnion(left: Expression, right: Expression) extends ArrayBinaryLike @@ -3632,7 +3675,7 @@ object ArrayUnion { examples = """ Examples: > SELECT _FUNC_(array(1, 2, 3), array(1, 3, 5)); - array(1, 3) + [1,3] """, since = "2.4.0") case class ArrayIntersect(left: Expression, right: Expression) extends ArrayBinaryLike @@ -3873,7 +3916,7 @@ case class ArrayIntersect(left: Expression, right: Expression) extends ArrayBina examples = """ Examples: > SELECT _FUNC_(array(1, 2, 3), array(1, 3, 5)); - array(2) + [2] """, since = "2.4.0") case class ArrayExcept(left: Expression, right: Expression) extends ArrayBinaryLike diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala index 077a6dc93bd1..0361372b6b73 100644 --- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala @@ -61,11 +61,10 @@ case class CreateArray(children: Seq[Expression]) extends Expression { override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val et = dataType.elementType - val evals = children.map(e => e.genCode(ctx)) - val (preprocess, assigns, postprocess, arrayData) = - GenArrayData.genCodeToCreateArrayData(ctx, et, evals, false) + val (allocation, assigns, arrayData) = + GenArrayData.genCodeToCreateArrayData(ctx, et, children, false, "createArray") ev.copy( - code = code"${preprocess}${assigns}${postprocess}", + code = code"${allocation}${assigns}", value = JavaCode.variable(arrayData, dataType), isNull = FalseLiteral) } @@ -75,87 +74,60 @@ case class CreateArray(children: Seq[Expression]) extends Expression { private [sql] object GenArrayData { /** - * Return Java code pieces based on DataType and isPrimitive to allocate ArrayData class + * Return Java code pieces based on DataType and array size to allocate ArrayData class * * @param ctx a [[CodegenContext]] * @param elementType data type of underlying array elements - * @param elementsCode concatenated set of [[ExprCode]] for each element of an underlying array + * @param elementsExpr concatenated set of [[Expression]] for each element of an underlying array * @param isMapKey if true, throw an exception when the element is null - * @return (code pre-assignments, concatenated assignments to each array elements, - * code post-assignments, arrayData name) + * @param functionName string to include in the error message + * @return (array allocation, concatenated assignments to each array elements, arrayData name) */ def genCodeToCreateArrayData( ctx: CodegenContext, elementType: DataType, - elementsCode: Seq[ExprCode], - isMapKey: Boolean): (String, String, String, String) = { + elementsExpr: Seq[Expression], + isMapKey: Boolean, + functionName: String): (String, String, String) = { val arrayDataName = ctx.freshName("arrayData") - val numElements = elementsCode.length + val numElements = s"${elementsExpr.length}L" - if (!CodeGenerator.isPrimitiveType(elementType)) { - val arrayName = ctx.freshName("arrayObject") - val genericArrayClass = classOf[GenericArrayData].getName + val initialization = CodeGenerator.createArrayData( + arrayDataName, elementType, numElements, s" $functionName failed.") - val assignments = elementsCode.zipWithIndex.map { case (eval, i) => - val isNullAssignment = if (!isMapKey) { - s"$arrayName[$i] = null;" - } else { - "throw new RuntimeException(\"Cannot use null as map key!\");" - } - eval.code + s""" - if (${eval.isNull}) { - $isNullAssignment - } else { - $arrayName[$i] = ${eval.value}; - } - """ - } - val assignmentString = ctx.splitExpressionsWithCurrentInputs( - expressions = assignments, - funcName = "apply", - extraArguments = ("Object[]", arrayName) :: Nil) - - (s"Object[] $arrayName = new Object[$numElements];", - assignmentString, - s"final ArrayData $arrayDataName = new $genericArrayClass($arrayName);", - arrayDataName) - } else { - val arrayName = ctx.freshName("array") - val unsafeArraySizeInBytes = - UnsafeArrayData.calculateHeaderPortionInBytes(numElements) + - ByteArrayMethods.roundNumberOfBytesToNearestWord(elementType.defaultSize * numElements) - val baseOffset = Platform.BYTE_ARRAY_OFFSET - - val primitiveValueTypeName = CodeGenerator.primitiveTypeName(elementType) - val 
assignments = elementsCode.zipWithIndex.map { case (eval, i) => + val assignments = elementsExpr.zipWithIndex.map { case (expr, i) => + val eval = expr.genCode(ctx) + val setArrayElement = CodeGenerator.setArrayElement( + arrayDataName, elementType, i.toString, eval.value) + + val assignment = if (!expr.nullable) { + setArrayElement + } else { val isNullAssignment = if (!isMapKey) { s"$arrayDataName.setNullAt($i);" } else { "throw new RuntimeException(\"Cannot use null as map key!\");" } - eval.code + s""" - if (${eval.isNull}) { - $isNullAssignment - } else { - $arrayDataName.set$primitiveValueTypeName($i, ${eval.value}); - } - """ + + s""" + |if (${eval.isNull}) { + | $isNullAssignment + |} else { + | $setArrayElement + |} + """.stripMargin } - val assignmentString = ctx.splitExpressionsWithCurrentInputs( - expressions = assignments, - funcName = "apply", - extraArguments = ("UnsafeArrayData", arrayDataName) :: Nil) - - (s""" - byte[] $arrayName = new byte[$unsafeArraySizeInBytes]; - UnsafeArrayData $arrayDataName = new UnsafeArrayData(); - Platform.putLong($arrayName, $baseOffset, $numElements); - $arrayDataName.pointTo($arrayName, $baseOffset, $unsafeArraySizeInBytes); - """, - assignmentString, - "", - arrayDataName) + s""" + |${eval.code} + |$assignment + """.stripMargin } + val assignmentString = ctx.splitExpressionsWithCurrentInputs( + expressions = assignments, + funcName = "apply", + extraArguments = ("ArrayData", arrayDataName) :: Nil) + + (initialization, assignmentString, arrayDataName) } } @@ -216,21 +188,17 @@ case class CreateMap(children: Seq[Expression]) extends Expression { override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val mapClass = classOf[ArrayBasedMapData].getName val MapType(keyDt, valueDt, _) = dataType - val evalKeys = keys.map(e => e.genCode(ctx)) - val evalValues = values.map(e => e.genCode(ctx)) - val (preprocessKeyData, assignKeys, postprocessKeyData, keyArrayData) = - GenArrayData.genCodeToCreateArrayData(ctx, keyDt, evalKeys, true) - val (preprocessValueData, assignValues, postprocessValueData, valueArrayData) = - GenArrayData.genCodeToCreateArrayData(ctx, valueDt, evalValues, false) + val (allocationKeyData, assignKeys, keyArrayData) = + GenArrayData.genCodeToCreateArrayData(ctx, keyDt, keys, true, "createMap") + val (allocationValueData, assignValues, valueArrayData) = + GenArrayData.genCodeToCreateArrayData(ctx, valueDt, values, false, "createMap") val code = code""" final boolean ${ev.isNull} = false; - $preprocessKeyData + $allocationKeyData $assignKeys - $postprocessKeyData - $preprocessValueData + $allocationValueData $assignValues - $postprocessValueData final MapData ${ev.value} = new $mapClass($keyArrayData, $valueArrayData); """ ev.copy(code = code) @@ -248,7 +216,7 @@ case class CreateMap(children: Seq[Expression]) extends Expression { in keys should not be null""", examples = """ Examples: - > SELECT _FUNC_([1.0, 3.0], ['2', '4']); + > SELECT _FUNC_(array(1.0, 3.0), array('2', '4')); {1.0:"2",3.0:"4"} """, since = "2.4.0") case class MapFromArrays(left: Expression, right: Expression) @@ -379,10 +347,7 @@ trait CreateNamedStructLike extends Expression { } override def checkInputDataTypes(): TypeCheckResult = { - if (children.length < 1) { - TypeCheckResult.TypeCheckFailure( - s"input to function $prettyName requires at least one argument") - } else if (children.size % 2 != 0) { + if (children.size % 2 != 0) { TypeCheckResult.TypeCheckFailure(s"$prettyName expects an even number of arguments.") } else { val invalidNames = 
nameExprs.filterNot(e => e.foldable && e.dataType == StringType) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index f95798d64db1..45e17ae235a9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -1018,51 +1018,18 @@ case class TimeAdd(start: Expression, interval: Expression, timeZoneId: Option[S } /** - * A special expression used to convert the string input of `to/from_utc_timestamp` to timestamp, - * which requires the timestamp string to not have timezone information, otherwise null is returned. - */ -case class StringToTimestampWithoutTimezone(child: Expression, timeZoneId: Option[String] = None) - extends UnaryExpression with TimeZoneAwareExpression with ExpectsInputTypes { - - override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression = - copy(timeZoneId = Option(timeZoneId)) - - override def inputTypes: Seq[AbstractDataType] = Seq(StringType) - override def dataType: DataType = TimestampType - override def nullable: Boolean = true - override def toString: String = child.toString - override def sql: String = child.sql - - override def nullSafeEval(input: Any): Any = { - DateTimeUtils.stringToTimestamp( - input.asInstanceOf[UTF8String], timeZone, rejectTzInString = true).orNull - } - - override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") - val tz = ctx.addReferenceObj("timeZone", timeZone) - val longOpt = ctx.freshName("longOpt") - val eval = child.genCode(ctx) - val code = code""" - |${eval.code} - |${CodeGenerator.JAVA_BOOLEAN} ${ev.isNull} = true; - |${CodeGenerator.JAVA_LONG} ${ev.value} = ${CodeGenerator.defaultValue(TimestampType)}; - |if (!${eval.isNull}) { - | scala.Option $longOpt = $dtu.stringToTimestamp(${eval.value}, $tz, true); - | if ($longOpt.isDefined()) { - | ${ev.value} = ((Long) $longOpt.get()).longValue(); - | ${ev.isNull} = false; - | } - |} - """.stripMargin - ev.copy(code = code) - } -} - -/** - * Given a timestamp like '2017-07-14 02:40:00.0', interprets it as a time in UTC, and renders - * that time as a timestamp in the given time zone. For example, 'GMT+1' would yield - * '2017-07-14 03:40:00.0'. + * This is a common function for databases supporting TIMESTAMP WITHOUT TIMEZONE. This function + * takes a timestamp that is timezone-agnostic, interprets it as a timestamp in UTC, and + * renders that timestamp as a timestamp in the given time zone. + * + * However, a timestamp in Spark represents the number of microseconds from the Unix epoch, which is + * not timezone-agnostic. So in Spark this function just shifts the timestamp value from the UTC + * timezone to the given timezone. + * + * This function may return a confusing result if the input is a string with a timezone, e.g. + * '2018-03-13T06:18:23+00:00'. The reason is that Spark first casts the string to a timestamp + * according to the timezone in the string, and finally displays the result by converting the + * timestamp to a string according to the session local timezone.
*/ // scalastyle:off line.size.limit @ExpressionDescription( @@ -1257,9 +1224,18 @@ case class MonthsBetween( } /** - * Given a timestamp like '2017-07-14 02:40:00.0', interprets it as a time in the given time zone, - * and renders that time as a timestamp in UTC. For example, 'GMT+1' would yield - * '2017-07-14 01:40:00.0'. + * This is a common function for databases supporting TIMESTAMP WITHOUT TIMEZONE. This function + * takes a timestamp that is timezone-agnostic, interprets it as a timestamp in the given + * timezone, and renders that timestamp as a timestamp in UTC. + * + * However, a timestamp in Spark represents the number of microseconds from the Unix epoch, which is + * not timezone-agnostic. So in Spark this function just shifts the timestamp value from the given + * timezone to the UTC timezone. + * + * This function may return a confusing result if the input is a string with a timezone, e.g. + * '2018-03-13T06:18:23+00:00'. The reason is that Spark first casts the string to a timestamp + * according to the timezone in the string, and finally displays the result by converting the + * timestamp to a string according to the session local timezone. */ // scalastyle:off line.size.limit @ExpressionDescription( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala index a754e87a1796..742a4f87a9c0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala @@ -33,7 +33,6 @@ import org.apache.spark.sql.catalyst.util.{ArrayData, MapData} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.hash.Murmur3_x86_32 -import org.apache.spark.unsafe.memory.MemoryBlock import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -362,7 +361,10 @@ abstract class HashExpression[E] extends Expression { } protected def genHashString(input: String, result: String): String = { - s"$result = $hasherClassName.hashUTF8String($input, $result);" + val baseObject = s"$input.getBaseObject()" + val baseOffset = s"$input.getBaseOffset()" + val numBytes = s"$input.numBytes()" + s"$result = $hasherClassName.hashUnsafeBytes($baseObject, $baseOffset, $numBytes, $result);" } protected def genHashForMap( @@ -469,8 +471,6 @@ abstract class InterpretedHashFunction { protected def hashUnsafeBytes(base: AnyRef, offset: Long, length: Int, seed: Long): Long - protected def hashUnsafeBytesBlock(base: MemoryBlock, seed: Long): Long - /** * Computes hash of a given `value` of type `dataType`. The caller needs to check the validity * of input `value`.
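For illustration only (not part of this patch): a minimal sketch of what the rewritten string-hashing path does, using only the `Murmur3_x86_32.hashUnsafeBytes` helper and the `UTF8String` accessors that already appear in the hunk above. Both the generated code and the interpreted `Murmur3HashFunction` now hand the string's backing byte region (base object, offset, length) directly to the hasher instead of going through a `MemoryBlock`. The object name and the default seed of 42 are assumptions for this sketch.

import org.apache.spark.unsafe.hash.Murmur3_x86_32
import org.apache.spark.unsafe.types.UTF8String

object Murmur3StringHashSketch {
  // Hash a UTF8String the way the interpreted Murmur3 path now does: pass the backing
  // byte region (base object, offset, length in bytes) straight to the hasher.
  def hash(value: String, seed: Int = 42): Int = {
    val s = UTF8String.fromString(value)
    Murmur3_x86_32.hashUnsafeBytes(s.getBaseObject, s.getBaseOffset, s.numBytes(), seed)
  }

  def main(args: Array[String]): Unit = {
    // The same input and seed always produce the same hash, matching the codegen path.
    println(hash("Spark"))
  }
}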
@@ -496,7 +496,8 @@ abstract class InterpretedHashFunction { case c: CalendarInterval => hashInt(c.months, hashLong(c.microseconds, seed)) case a: Array[Byte] => hashUnsafeBytes(a, Platform.BYTE_ARRAY_OFFSET, a.length, seed) - case s: UTF8String => hashUnsafeBytesBlock(s.getMemoryBlock(), seed) + case s: UTF8String => + hashUnsafeBytes(s.getBaseObject, s.getBaseOffset, s.numBytes(), seed) case array: ArrayData => val elementType = dataType match { @@ -583,15 +584,9 @@ object Murmur3HashFunction extends InterpretedHashFunction { Murmur3_x86_32.hashLong(l, seed.toInt) } - override protected def hashUnsafeBytes( - base: AnyRef, offset: Long, len: Int, seed: Long): Long = { + override protected def hashUnsafeBytes(base: AnyRef, offset: Long, len: Int, seed: Long): Long = { Murmur3_x86_32.hashUnsafeBytes(base, offset, len, seed.toInt) } - - override protected def hashUnsafeBytesBlock( - base: MemoryBlock, seed: Long): Long = { - Murmur3_x86_32.hashUnsafeBytesBlock(base, seed.toInt) - } } /** @@ -616,14 +611,9 @@ object XxHash64Function extends InterpretedHashFunction { override protected def hashLong(l: Long, seed: Long): Long = XXH64.hashLong(l, seed) - override protected def hashUnsafeBytes( - base: AnyRef, offset: Long, len: Int, seed: Long): Long = { + override protected def hashUnsafeBytes(base: AnyRef, offset: Long, len: Int, seed: Long): Long = { XXH64.hashUnsafeBytes(base, offset, len, seed) } - - override protected def hashUnsafeBytesBlock(base: MemoryBlock, seed: Long): Long = { - XXH64.hashUnsafeBytesBlock(base, seed) - } } /** @@ -730,7 +720,10 @@ case class HiveHash(children: Seq[Expression]) extends HashExpression[Int] { """ override protected def genHashString(input: String, result: String): String = { - s"$result = $hasherClassName.hashUTF8String($input);" + val baseObject = s"$input.getBaseObject()" + val baseOffset = s"$input.getBaseOffset()" + val numBytes = s"$input.numBytes()" + s"$result = $hasherClassName.hashUnsafeBytes($baseObject, $baseOffset, $numBytes);" } override protected def genHashForArray( @@ -824,14 +817,10 @@ object HiveHashFunction extends InterpretedHashFunction { HiveHasher.hashLong(l) } - override protected def hashUnsafeBytes( - base: AnyRef, offset: Long, len: Int, seed: Long): Long = { + override protected def hashUnsafeBytes(base: AnyRef, offset: Long, len: Int, seed: Long): Long = { HiveHasher.hashUnsafeBytes(base, offset, len) } - override protected def hashUnsafeBytesBlock( - base: MemoryBlock, seed: Long): Long = HiveHasher.hashUnsafeBytesBlock(base) - private val HIVE_DECIMAL_MAX_PRECISION = 38 private val HIVE_DECIMAL_MAX_SCALE = 38 diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala index 2bb6b20b944d..b07d9466ba0d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala @@ -209,9 +209,9 @@ trait MapBasedSimpleHigherOrderFunction extends SimpleHigherOrderFunction { examples = """ Examples: > SELECT _FUNC_(array(1, 2, 3), x -> x + 1); - array(2, 3, 4) + [2,3,4] > SELECT _FUNC_(array(1, 2, 3), (x, i) -> x + i); - array(1, 3, 5) + [1,3,5] """, since = "2.4.0") case class ArrayTransform( @@ -268,7 +268,7 @@ usage = "_FUNC_(expr, func) - Filters entries in a map using the function.", examples = """ Examples: > SELECT _FUNC_(map(1, 0, 2, 2, 3, 
-1), (k, v) -> k > v); - [1 -> 0, 3 -> -1] + {1:0,3:-1} """, since = "2.4.0") case class MapFilter( @@ -318,7 +318,7 @@ case class MapFilter( examples = """ Examples: > SELECT _FUNC_(array(1, 2, 3), x -> x % 2 == 1); - array(1, 3) + [1,3] """, since = "2.4.0") case class ArrayFilter( @@ -499,10 +499,10 @@ case class ArrayAggregate( usage = "_FUNC_(expr, func) - Transforms elements in a map using the function.", examples = """ Examples: - > SELECT _FUNC_(map(array(1, 2, 3), array(1, 2, 3)), (k, v) -> k + 1); - map(array(2, 3, 4), array(1, 2, 3)) - > SELECT _FUNC_(map(array(1, 2, 3), array(1, 2, 3)), (k, v) -> k + v); - map(array(2, 4, 6), array(1, 2, 3)) + > SELECT _FUNC_(map_from_arrays(array(1, 2, 3), array(1, 2, 3)), (k, v) -> k + 1); + {2:1,3:2,4:3} + > SELECT _FUNC_(map_from_arrays(array(1, 2, 3), array(1, 2, 3)), (k, v) -> k + v); + {2:1,4:2,6:3} """, since = "2.4.0") case class TransformKeys( @@ -549,10 +549,10 @@ case class TransformKeys( usage = "_FUNC_(expr, func) - Transforms values in the map using the function.", examples = """ Examples: - > SELECT _FUNC_(map(array(1, 2, 3), array(1, 2, 3)), (k, v) -> v + 1); - map(array(1, 2, 3), array(2, 3, 4)) - > SELECT _FUNC_(map(array(1, 2, 3), array(1, 2, 3)), (k, v) -> k + v); - map(array(1, 2, 3), array(2, 4, 6)) + > SELECT _FUNC_(map_from_arrays(array(1, 2, 3), array(1, 2, 3)), (k, v) -> v + 1); + {1:2,2:3,3:4} + > SELECT _FUNC_(map_from_arrays(array(1, 2, 3), array(1, 2, 3)), (k, v) -> k + v); + {1:2,2:4,3:6} """, since = "2.4.0") case class TransformValues( @@ -777,11 +777,11 @@ case class MapZipWith(left: Expression, right: Expression, function: Expression) examples = """ Examples: > SELECT _FUNC_(array(1, 2, 3), array('a', 'b', 'c'), (x, y) -> (y, x)); - array(('a', 1), ('b', 2), ('c', 3)) - > SELECT _FUNC_(array(1, 2), array(3, 4), (x, y) -> x + y)); - array(4, 6) + [{"y":"a","x":1},{"y":"b","x":2},{"y":"c","x":3}] + > SELECT _FUNC_(array(1, 2), array(3, 4), (x, y) -> x + y); + [4,6] > SELECT _FUNC_(array('a', 'b', 'c'), array('d', 'e', 'f'), (x, y) -> concat(x, y)); - array('ad', 'be', 'cf') + ["ad","be","cf"] """, since = "2.4.0") // scalastyle:on line.size.limit diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index bd9090a07471..f5297dde10ed 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -740,15 +740,31 @@ case class StructsToJson( examples = """ Examples: > SELECT _FUNC_('[{"col":0}]'); - array> + array> + > SELECT _FUNC_('[{"col":01}]', map('allowNumericLeadingZeros', 'true')); + array> """, since = "2.4.0") -case class SchemaOfJson(child: Expression) +case class SchemaOfJson( + child: Expression, + options: Map[String, String]) extends UnaryExpression with String2StringExpression with CodegenFallback { - private val jsonOptions = new JSONOptions(Map.empty, "UTC") - private val jsonFactory = new JsonFactory() - jsonOptions.setJacksonOptions(jsonFactory) + def this(child: Expression) = this(child, Map.empty[String, String]) + + def this(child: Expression, options: Expression) = this( + child = child, + options = JsonExprUtils.convertToMapData(options)) + + @transient + private lazy val jsonOptions = new JSONOptions(options, "UTC") + + @transient + private lazy val jsonFactory = { + val factory = new JsonFactory() + 
jsonOptions.setJacksonOptions(factory) + factory + } override def convert(v: UTF8String): UTF8String = { val dt = Utils.tryWithResource(CreateJacksonParser.utf8String(jsonFactory, v)) { parser => @@ -764,7 +780,7 @@ object JsonExprUtils { def evalSchemaExpr(exp: Expression): DataType = exp match { case Literal(s, StringType) => DataType.fromDDL(s.toString) - case e @ SchemaOfJson(_: Literal) => + case e @ SchemaOfJson(_: Literal, _) => val ddlSchema = e.eval().asInstanceOf[UTF8String] DataType.fromDDL(ddlSchema.toString) case e => throw new AnalysisException( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala index 0efd1224f1bc..2bcbb92f1a46 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala @@ -128,30 +128,36 @@ object Literal { val dataType = DataType.parseDataType(json \ "dataType") json \ "value" match { case JNull => Literal.create(null, dataType) - case JString(str) => - val value = dataType match { - case BooleanType => str.toBoolean - case ByteType => str.toByte - case ShortType => str.toShort - case IntegerType => str.toInt - case LongType => str.toLong - case FloatType => str.toFloat - case DoubleType => str.toDouble - case StringType => UTF8String.fromString(str) - case DateType => java.sql.Date.valueOf(str) - case TimestampType => java.sql.Timestamp.valueOf(str) - case CalendarIntervalType => CalendarInterval.fromString(str) - case t: DecimalType => - val d = Decimal(str) - assert(d.changePrecision(t.precision, t.scale)) - d - case _ => null - } - Literal.create(value, dataType) + case JString(str) => fromString(str, dataType) case other => sys.error(s"$other is not a valid Literal json value") } } + /** + * Constructs a Literal from a String + */ + def fromString(str: String, dataType: DataType): Literal = { + val value = dataType match { + case BooleanType => str.toBoolean + case ByteType => str.toByte + case ShortType => str.toShort + case IntegerType => str.toInt + case LongType => str.toLong + case FloatType => str.toFloat + case DoubleType => str.toDouble + case StringType => UTF8String.fromString(str) + case DateType => java.sql.Date.valueOf(str) + case TimestampType => java.sql.Timestamp.valueOf(str) + case CalendarIntervalType => CalendarInterval.fromString(str) + case t: DecimalType => + val d = Decimal(str) + assert(d.changePrecision(t.precision, t.scale)) + d + case _ => null + } + Literal.create(value, dataType) + } + def create(v: Any, dataType: DataType): Literal = { Literal(CatalystTypeConverters.convertToCatalyst(v), dataType) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/package.scala index 11dcc3ebf798..0083ee64653e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/package.scala @@ -86,24 +86,12 @@ package object expressions { } /** - * Converts a [[InternalRow]] to another Row given a sequence of expression that define each - * column of the new row. If the schema of the input row is specified, then the given expression - * will be bound to that schema. 
- * - * In contrast to a normal projection, a MutableProjection reuses the same underlying row object - * each time an input row is added. This significantly reduces the cost of calculating the - * projection, but means that it is not safe to hold on to a reference to a [[InternalRow]] after - * `next()` has been called on the [[Iterator]] that produced it. Instead, the user must call - * `InternalRow.copy()` and hold on to the returned [[InternalRow]] before calling `next()`. + * A helper function to bind given expressions to an input schema. */ - abstract class MutableProjection extends Projection { - def currentValue: InternalRow - - /** Uses the given row to store the output of the projection. */ - def target(row: InternalRow): MutableProjection + def toBoundExprs(exprs: Seq[Expression], inputSchema: Seq[Attribute]): Seq[Expression] = { + exprs.map(BindReferences.bindReference(_, inputSchema)) } - /** * Helper functions for working with `Seq[Attribute]`. */ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala index 149bd79278a5..2125340f38ee 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala @@ -144,7 +144,7 @@ case class Not(child: Expression) case class InSubquery(values: Seq[Expression], query: ListQuery) extends Predicate with Unevaluable { - @transient lazy val value: Expression = if (values.length > 1) { + @transient private lazy val value: Expression = if (values.length > 1) { CreateNamedStruct(values.zipWithIndex.flatMap { case (v: NamedExpression, _) => Seq(Literal(v.name), v) case (v, idx) => Seq(Literal(s"_$idx"), v) @@ -155,37 +155,35 @@ case class InSubquery(values: Seq[Expression], query: ListQuery) override def checkInputDataTypes(): TypeCheckResult = { - val mismatchOpt = !DataType.equalsStructurally(query.dataType, value.dataType, - ignoreNullability = true) - if (mismatchOpt) { - if (values.length != query.childOutputs.length) { - TypeCheckResult.TypeCheckFailure( - s""" - |The number of columns in the left hand side of an IN subquery does not match the - |number of columns in the output of subquery. - |#columns in left hand side: ${values.length}. - |#columns in right hand side: ${query.childOutputs.length}. - |Left side columns: - |[${values.map(_.sql).mkString(", ")}]. - |Right side columns: - |[${query.childOutputs.map(_.sql).mkString(", ")}].""".stripMargin) - } else { - val mismatchedColumns = values.zip(query.childOutputs).flatMap { - case (l, r) if l.dataType != r.dataType => - Seq(s"(${l.sql}:${l.dataType.catalogString}, ${r.sql}:${r.dataType.catalogString})") - case _ => None - } - TypeCheckResult.TypeCheckFailure( - s""" - |The data type of one or more elements in the left hand side of an IN subquery - |is not compatible with the data type of the output of the subquery - |Mismatched columns: - |[${mismatchedColumns.mkString(", ")}] - |Left side: - |[${values.map(_.dataType.catalogString).mkString(", ")}]. - |Right side: - |[${query.childOutputs.map(_.dataType.catalogString).mkString(", ")}].""".stripMargin) + if (values.length != query.childOutputs.length) { + TypeCheckResult.TypeCheckFailure( + s""" + |The number of columns in the left hand side of an IN subquery does not match the + |number of columns in the output of subquery. + |#columns in left hand side: ${values.length}. 
+ |#columns in right hand side: ${query.childOutputs.length}. + |Left side columns: + |[${values.map(_.sql).mkString(", ")}]. + |Right side columns: + |[${query.childOutputs.map(_.sql).mkString(", ")}].""".stripMargin) + } else if (!DataType.equalsStructurally( + query.dataType, value.dataType, ignoreNullability = true)) { + + val mismatchedColumns = values.zip(query.childOutputs).flatMap { + case (l, r) if l.dataType != r.dataType => + Seq(s"(${l.sql}:${l.dataType.catalogString}, ${r.sql}:${r.dataType.catalogString})") + case _ => None } + TypeCheckResult.TypeCheckFailure( + s""" + |The data type of one or more elements in the left hand side of an IN subquery + |is not compatible with the data type of the output of the subquery + |Mismatched columns: + |[${mismatchedColumns.mkString(", ")}] + |Left side: + |[${values.map(_.dataType.catalogString).mkString(", ")}]. + |Right side: + |[${query.childOutputs.map(_.dataType.catalogString).mkString(", ")}].""".stripMargin) } else { TypeUtils.checkForOrderingExpr(value.dataType, s"function $prettyName") } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index bf0c35fe6101..4f5ea1e95f83 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -157,7 +157,7 @@ case class Like(left: Expression, right: Expression) extends StringRegexExpressi arguments = """ Arguments: * str - a string expression - * regexp - a string expression. The pattern string should be a Java regular expression. + * regexp - a string expression. The regex string should be a Java regular expression. Since Spark 2.0, string literals (including regex patterns) are unescaped in our SQL parser. For example, to match "\abc", a regular expression for `regexp` can be @@ -229,33 +229,53 @@ case class RLike(left: Expression, right: Expression) extends StringRegexExpress /** - * Splits str around pat (pattern is a regular expression). + * Splits str around matches of the given regex. */ @ExpressionDescription( - usage = "_FUNC_(str, regex) - Splits `str` around occurrences that match `regex`.", + usage = "_FUNC_(str, regex, limit) - Splits `str` around occurrences that match `regex`" + + " and returns an array with a length of at most `limit`", + arguments = """ + Arguments: + * str - a string expression to split. + * regex - a string representing a regular expression. The regex string should be a + Java regular expression. + * limit - an integer expression which controls the number of times the regex is applied. + * limit > 0: The resulting array's length will not be more than `limit`, + and the resulting array's last entry will contain all input + beyond the last matched regex. + * limit <= 0: `regex` will be applied as many times as possible, and + the resulting array can be of any size. 
+ """, examples = """ Examples: > SELECT _FUNC_('oneAtwoBthreeC', '[ABC]'); ["one","two","three",""] + > SELECT _FUNC_('oneAtwoBthreeC', '[ABC]', -1); + ["one","two","three",""] + > SELECT _FUNC_('oneAtwoBthreeC', '[ABC]', 2); + ["one","twoBthreeC"] """) -case class StringSplit(str: Expression, pattern: Expression) - extends BinaryExpression with ImplicitCastInputTypes { +case class StringSplit(str: Expression, regex: Expression, limit: Expression) + extends TernaryExpression with ImplicitCastInputTypes { - override def left: Expression = str - override def right: Expression = pattern override def dataType: DataType = ArrayType(StringType) - override def inputTypes: Seq[DataType] = Seq(StringType, StringType) + override def inputTypes: Seq[DataType] = Seq(StringType, StringType, IntegerType) + override def children: Seq[Expression] = str :: regex :: limit :: Nil - override def nullSafeEval(string: Any, regex: Any): Any = { - val strings = string.asInstanceOf[UTF8String].split(regex.asInstanceOf[UTF8String], -1) + def this(exp: Expression, regex: Expression) = this(exp, regex, Literal(-1)); + + override def nullSafeEval(string: Any, regex: Any, limit: Any): Any = { + val strings = string.asInstanceOf[UTF8String].split( + regex.asInstanceOf[UTF8String], limit.asInstanceOf[Int]) new GenericArrayData(strings.asInstanceOf[Array[Any]]) } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val arrayClass = classOf[GenericArrayData].getName - nullSafeCodeGen(ctx, ev, (str, pattern) => + nullSafeCodeGen(ctx, ev, (str, regex, limit) => { // Array in java is covariant, so we don't need to cast UTF8String[] to Object[]. - s"""${ev.value} = new $arrayClass($str.split($pattern, -1));""") + s"""${ev.value} = new $arrayClass($str.split($regex,$limit));""".stripMargin + }) } override def prettyName: String = "split" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 14faa62bde7d..cd824ee87ca5 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -330,7 +330,9 @@ trait String2StringExpression extends ImplicitCastInputTypes { case class Upper(child: Expression) extends UnaryExpression with String2StringExpression { + // scalastyle:off caselocale override def convert(v: UTF8String): UTF8String = v.toUpperCase + // scalastyle:on caselocale override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { defineCodeGen(ctx, ev, c => s"($c).toUpperCase()") @@ -349,7 +351,9 @@ case class Upper(child: Expression) """) case class Lower(child: Expression) extends UnaryExpression with String2StringExpression { + // scalastyle:off caselocale override def convert(v: UTF8String): UTF8String = v.toLowerCase + // scalastyle:on caselocale override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { defineCodeGen(ctx, ev, c => s"($c).toLowerCase()") @@ -1389,7 +1393,9 @@ case class InitCap(child: Expression) extends UnaryExpression with ImplicitCastI override def dataType: DataType = StringType override def nullSafeEval(string: Any): Any = { + // scalastyle:off caselocale string.asInstanceOf[UTF8String].toLowerCase.toTitleCase + // scalastyle:on caselocale } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { defineCodeGen(ctx, ev, str => 
s"$str.toLowerCase().toTitleCase()") diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala index 47eeb70e0042..64152e04928d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala @@ -113,6 +113,11 @@ private[sql] class JSONOptions( } val lineSeparatorInWrite: String = lineSeparator.getOrElse("\n") + /** + * Generates JSON strings in a pretty representation if the parameter is enabled. + */ + val pretty: Boolean = parameters.get("pretty").map(_.toBoolean).getOrElse(false) + /** Sets config options on a Jackson [[JsonFactory]]. */ def setJacksonOptions(factory: JsonFactory): Unit = { factory.configure(JsonParser.Feature.ALLOW_COMMENTS, allowComments) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala index 9b86d865622d..d02a2be8ddad 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala @@ -70,7 +70,10 @@ private[sql] class JacksonGenerator( s"Initial type ${dataType.catalogString} must be a ${MapType.simpleString}") } - private val gen = new JsonFactory().createGenerator(writer).setRootValueSeparator(null) + private val gen = { + val generator = new JsonFactory().createGenerator(writer).setRootValueSeparator(null) + if (options.pretty) generator.useDefaultPrettyPrinter() else generator + } private val lineSeparator: String = options.lineSeparatorInWrite diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index e4b4f1ecbe21..da8009d50b5e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -165,7 +165,10 @@ abstract class Optimizer(sessionCatalog: SessionCatalog) Batch("LocalRelation", fixedPoint, ConvertToLocalRelation, PropagateEmptyRelation) :+ - // The following batch should be executed after batch "Join Reorder" and "LocalRelation". + Batch("Extract PythonUDF From JoinCondition", Once, + PullOutPythonUDFInJoinCondition) :+ + // The following batch should be executed after the batches "Join Reorder", "LocalRelation" and + // "Extract PythonUDF From JoinCondition". Batch("Check Cartesian Products", Once, CheckCartesianProducts) :+ Batch("RewriteSubquery", Once, @@ -202,7 +205,8 @@ abstract class Optimizer(sessionCatalog: SessionCatalog) ReplaceDistinctWithAggregate.ruleName :: PullupCorrelatedPredicates.ruleName :: RewriteCorrelatedScalarSubquery.ruleName :: - RewritePredicateSubquery.ruleName :: Nil + RewritePredicateSubquery.ruleName :: + PullOutPythonUDFInJoinCondition.ruleName :: Nil /** * Optimize all the subqueries inside expression. @@ -486,6 +490,10 @@ object PushProjectionThroughUnion extends Rule[LogicalPlan] with PredicateHelper private def pushToRight[A <: Expression](e: A, rewrites: AttributeMap[Attribute]) = { val result = e transform { case a: Attribute => rewrites(a) + } match { + // Make sure exprId is unique in each child of Union.
+ case Alias(child, alias) => Alias(child, alias)() + case other => other } // We must promise the compiler that we did not discard the names in the case of project @@ -528,12 +536,12 @@ object ColumnPruning extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = removeProjectBeforeFilter(plan transform { // Prunes the unused columns from project list of Project/Aggregate/Expand - case p @ Project(_, p2: Project) if (p2.outputSet -- p.references).nonEmpty => + case p @ Project(_, p2: Project) if !p2.outputSet.subsetOf(p.references) => p.copy(child = p2.copy(projectList = p2.projectList.filter(p.references.contains))) - case p @ Project(_, a: Aggregate) if (a.outputSet -- p.references).nonEmpty => + case p @ Project(_, a: Aggregate) if !a.outputSet.subsetOf(p.references) => p.copy( child = a.copy(aggregateExpressions = a.aggregateExpressions.filter(p.references.contains))) - case a @ Project(_, e @ Expand(_, _, grandChild)) if (e.outputSet -- a.references).nonEmpty => + case a @ Project(_, e @ Expand(_, _, grandChild)) if !e.outputSet.subsetOf(a.references) => val newOutput = e.output.filter(a.references.contains(_)) val newProjects = e.projections.map { proj => proj.zip(e.output).filter { case (_, a) => @@ -543,18 +551,18 @@ object ColumnPruning extends Rule[LogicalPlan] { a.copy(child = Expand(newProjects, newOutput, grandChild)) // Prunes the unused columns from child of `DeserializeToObject` - case d @ DeserializeToObject(_, _, child) if (child.outputSet -- d.references).nonEmpty => + case d @ DeserializeToObject(_, _, child) if !child.outputSet.subsetOf(d.references) => d.copy(child = prunedChild(child, d.references)) // Prunes the unused columns from child of Aggregate/Expand/Generate/ScriptTransformation - case a @ Aggregate(_, _, child) if (child.outputSet -- a.references).nonEmpty => + case a @ Aggregate(_, _, child) if !child.outputSet.subsetOf(a.references) => a.copy(child = prunedChild(child, a.references)) - case f @ FlatMapGroupsInPandas(_, _, _, child) if (child.outputSet -- f.references).nonEmpty => + case f @ FlatMapGroupsInPandas(_, _, _, child) if !child.outputSet.subsetOf(f.references) => f.copy(child = prunedChild(child, f.references)) - case e @ Expand(_, _, child) if (child.outputSet -- e.references).nonEmpty => + case e @ Expand(_, _, child) if !child.outputSet.subsetOf(e.references) => e.copy(child = prunedChild(child, e.references)) case s @ ScriptTransformation(_, _, _, child, _) - if (child.outputSet -- s.references).nonEmpty => + if !child.outputSet.subsetOf(s.references) => s.copy(child = prunedChild(child, s.references)) // prune unrequired references @@ -575,7 +583,7 @@ object ColumnPruning extends Rule[LogicalPlan] { case p @ Project(_, _: Distinct) => p // Eliminate unneeded attributes from children of Union. case p @ Project(_, u: Union) => - if ((u.outputSet -- p.references).nonEmpty) { + if (!u.outputSet.subsetOf(p.references)) { val firstChild = u.children.head val newOutput = prunedChild(firstChild, p.references).output // pruning the columns of all children based on the pruned first child. 
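For illustration only (not part of this patch): a small sketch, using plain Scala Sets in place of Catalyst's `AttributeSet`, of why the pruning guards in `ColumnPruning` can be rewritten this way. `(outputSet -- references).nonEmpty` and `!outputSet.subsetOf(references)` answer the same question (does the child produce an attribute the parent never references?), but `subsetOf` avoids materializing an intermediate difference set and can stop at the first unreferenced attribute. The object and method names below are invented for the example.

object PruningGuardSketch {
  // Old guard: materializes the set difference before checking whether it is empty.
  def needsPruningOld(outputSet: Set[String], references: Set[String]): Boolean =
    (outputSet -- references).nonEmpty

  // New guard: short-circuits as soon as one attribute is missing from `references`.
  def needsPruningNew(outputSet: Set[String], references: Set[String]): Boolean =
    !outputSet.subsetOf(references)

  def main(args: Array[String]): Unit = {
    val output = Set("a", "b", "c")
    val refs = Set("a", "b")
    // Both guards agree: attribute "c" is never referenced, so pruning is needed.
    assert(needsPruningOld(output, refs) == needsPruningNew(output, refs))
    println(needsPruningNew(output, refs)) // prints: true
  }
}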
@@ -591,7 +599,7 @@ } // Prune unnecessary window expressions - case p @ Project(_, w: Window) if (w.windowOutputSet -- p.references).nonEmpty => + case p @ Project(_, w: Window) if !w.windowOutputSet.subsetOf(p.references) => p.copy(child = w.copy( windowExpressions = w.windowExpressions.filter(p.references.contains))) @@ -607,7 +615,7 @@ // for all other logical plans that inherits the output from it's children case p @ Project(_, child) => val required = child.references ++ p.references - if ((child.inputSet -- required).nonEmpty) { + if (!child.inputSet.subsetOf(required)) { val newChildren = child.children.map(c => prunedChild(c, required)) p.copy(child = child.withNewChildren(newChildren)) } else { @@ -617,7 +625,7 @@ /** Applies a projection only when the child is producing unnecessary attributes */ private def prunedChild(c: LogicalPlan, allReferences: AttributeSet) = - if ((c.outputSet -- allReferences.filter(c.outputSet.contains)).nonEmpty) { + if (!c.outputSet.subsetOf(allReferences)) { Project(c.output.filter(allReferences.contains), c) } else { c @@ -734,6 +742,28 @@ object CollapseWindow extends Rule[LogicalPlan] { } } +/** + * Transposes adjacent window expressions. + * - If the partition spec of the parent Window expression is compatible with the partition spec + * of the child window expression, transpose them. + */ +object TransposeWindow extends Rule[LogicalPlan] { + private def compatiblePartitions(ps1: Seq[Expression], ps2: Seq[Expression]): Boolean = { + ps1.length < ps2.length && ps2.take(ps1.length).permutations.exists(ps1.zip(_).forall { + case (l, r) => l.semanticEquals(r) + }) + } + + def apply(plan: LogicalPlan): LogicalPlan = plan transformUp { + case w1 @ Window(we1, ps1, os1, w2 @ Window(we2, ps2, os2, grandChild)) + if w1.references.intersect(w2.windowOutputSet).isEmpty && + w1.expressions.forall(_.deterministic) && + w2.expressions.forall(_.deterministic) && + compatiblePartitions(ps1, ps2) => + Project(w1.output, Window(we2, ps2, os2, Window(we1, ps1, os1, grandChild))) + } +} + /** * Generate a list of additional filters from an operator's existing constraint but remove those * that are either already part of the operator's condition or are part of the operator's child diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala index 5629b7289422..f8037588fa71 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala @@ -263,10 +263,15 @@ object BooleanSimplification extends Rule[LogicalPlan] with PredicateHelper { case TrueLiteral Or _ => TrueLiteral case _ Or TrueLiteral => TrueLiteral - case a And b if Not(a).semanticEquals(b) => FalseLiteral - case a Or b if Not(a).semanticEquals(b) => TrueLiteral - case a And b if a.semanticEquals(Not(b)) => FalseLiteral - case a Or b if a.semanticEquals(Not(b)) => TrueLiteral + case a And b if Not(a).semanticEquals(b) => + If(IsNull(a), Literal.create(null, a.dataType), FalseLiteral) + case a And b if a.semanticEquals(Not(b)) => + If(IsNull(b), Literal.create(null, b.dataType), FalseLiteral) + + case a Or b if Not(a).semanticEquals(b) => + If(IsNull(a), Literal.create(null, a.dataType), TrueLiteral) + case a Or b
if a.semanticEquals(Not(b)) => + If(IsNull(b), Literal.create(null, b.dataType), TrueLiteral) case a And b if a.semanticEquals(b) => a case a Or b if a.semanticEquals(b) => a diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala index edbeaf273fd6..7149edee0173 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.optimizer import scala.annotation.tailrec +import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.ExtractFiltersAndInnerJoins import org.apache.spark.sql.catalyst.plans._ @@ -152,3 +153,51 @@ object EliminateOuterJoin extends Rule[LogicalPlan] with PredicateHelper { if (j.joinType == newJoinType) f else Filter(condition, j.copy(joinType = newJoinType)) } } + +/** + * A PythonUDF in a join condition cannot be evaluated, so this rule detects such PythonUDFs + * and pulls them out of the join condition. Python UDFs that access attributes from only one side + * are pushed down by the operator push-down rules. If not (e.g. the user disables the filter + * push-down rules), we need to pull them out in this rule too. + */ +object PullOutPythonUDFInJoinCondition extends Rule[LogicalPlan] with PredicateHelper { + def hasPythonUDF(expression: Expression): Boolean = { + expression.collectFirst { case udf: PythonUDF => udf }.isDefined + } + + override def apply(plan: LogicalPlan): LogicalPlan = plan transformUp { + case j @ Join(_, _, joinType, condition) + if condition.isDefined && hasPythonUDF(condition.get) => + if (!joinType.isInstanceOf[InnerLike] && joinType != LeftSemi) { + // The current strategy only supports InnerLike and LeftSemi joins because, for other join + // types, running the join condition as a filter after the join breaks SQL semantics. If we + // passed the plan on here, it would still fail with an invalid PythonUDF RuntimeException + // with the message `requires attributes from more than one child`, so we throw here first + // to give a more readable error. + throw new AnalysisException("Using PythonUDF in join condition of join type" + + s" $joinType is not supported.") + } + // If the condition expression contains a Python UDF, it will be moved out from + // the new join conditions.
+ val (udf, rest) = + splitConjunctivePredicates(condition.get).partition(hasPythonUDF) + val newCondition = if (rest.isEmpty) { + logWarning(s"The join condition:$condition of the join plan contains PythonUDF only," + + s" it will be moved out and the join plan will be turned to cross join.") + None + } else { + Some(rest.reduceLeft(And)) + } + val newJoin = j.copy(condition = newCondition) + joinType match { + case _: InnerLike => Filter(udf.reduceLeft(And), newJoin) + case LeftSemi => + Project( + j.left.output.map(_.toAttribute), + Filter(udf.reduceLeft(And), newJoin.copy(joinType = Inner))) + case _ => + throw new AnalysisException("Using PythonUDF in join condition of join type" + + s" $joinType is not supported.") + } + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 7bc1f63e3054..ba0b72e747fc 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -663,7 +663,9 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging UnresolvedGenerator(visitFunctionName(ctx.qualifiedName), expressions), unrequiredChildIndex = Nil, outer = ctx.OUTER != null, + // scalastyle:off caselocale Some(ctx.tblName.getText.toLowerCase), + // scalastyle:on caselocale ctx.colName.asScala.map(_.getText).map(UnresolvedAttribute.apply), query) } @@ -699,7 +701,7 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging // Resolve the join type and join condition val (joinType, condition) = Option(join.joinCriteria) match { case Some(c) if c.USING != null => - (UsingJoin(baseJoinType, c.identifier.asScala.map(_.getText)), None) + (UsingJoin(baseJoinType, visitIdentifierList(c.identifierList)), None) case Some(c) if c.booleanExpression != null => (baseJoinType, Option(expression(c.booleanExpression))) case None if join.NATURAL != null => @@ -1157,7 +1159,7 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging case SqlBaseParser.PERCENT => Remainder(left, right) case SqlBaseParser.DIV => - Cast(Divide(left, right), LongType) + IntegralDivide(left, right) case SqlBaseParser.PLUS => Add(left, right) case SqlBaseParser.MINUS => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/QueryPlanner.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/QueryPlanner.scala index bc41dd0465e3..6fa5203a06f7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/QueryPlanner.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/QueryPlanner.scala @@ -81,7 +81,7 @@ abstract class QueryPlanner[PhysicalPlan <: TreeNode[PhysicalPlan]] { childPlans.map { childPlan => // Replace the placeholder by the child plan candidateWithPlaceholders.transformUp { - case p if p == placeholder => childPlan + case p if p.eq(placeholder) => childPlan } } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala index b1ffdca09146..ca0cea6ba7de 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala @@ -42,7 +42,7 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] 
extends TreeNode[PlanT * All Attributes that appear in expressions from this operator. Note that this set does not * include attributes that are implicitly referenced by being passed through to the output tuple. */ - def references: AttributeSet = AttributeSet(expressions.flatMap(_.references)) + def references: AttributeSet = AttributeSet.fromAttributeSets(expressions.map(_.references)) /** * The set of all attributes that are input to this operator by its children. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala index 0e4456ac0e6a..5f136629eb15 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala @@ -159,7 +159,7 @@ abstract class UnaryNode extends LogicalPlan { var allConstraints = child.constraints.asInstanceOf[Set[Expression]] projectList.foreach { case a @ Alias(l: Literal, _) => - allConstraints += EqualTo(a.toAttribute, l) + allConstraints += EqualNullSafe(a.toAttribute, l) case a @ Alias(e, _) => // For every alias in `projectList`, replace the reference in constraints by its attribute. allConstraints ++= allConstraints.map(_ transform { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala index cd28c733f361..cc1a5e835d9c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala @@ -17,8 +17,6 @@ package org.apache.spark.sql.catalyst.plans.physical -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.types.{DataType, IntegerType} @@ -208,18 +206,6 @@ case object SinglePartition extends Partitioning { } } -/** - * Represents a partitioning where rows are only serialized/deserialized locally. The number - * of partitions are not changed and also the distribution of rows. This is mainly used to - * obtain some statistics of map tasks such as number of outputs. - */ -case class LocalPartitioning(childRDD: RDD[InternalRow]) extends Partitioning { - val numPartitions = childRDD.getNumPartitions - - // We will perform this partitioning no matter what the data distribution is. - override def satisfies0(required: Distribution): Boolean = false -} - /** * Represents a partitioning where rows are split up across partitions based on the hash * of `expressions`. 
All rows where `expressions` evaluate to the same values are guaranteed to be diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala index dccb44ddebfa..e991a2dc7462 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala @@ -21,6 +21,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.trees.TreeNode import org.apache.spark.sql.catalyst.util.sideBySide +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.util.Utils object RuleExecutor { @@ -72,6 +73,7 @@ abstract class RuleExecutor[TreeType <: TreeNode[_]] extends Logging { def execute(plan: TreeType): TreeType = { var curPlan = plan val queryExecutionMetrics = RuleExecutor.queryExecutionMeter + val planChangeLogger = new PlanChangeLogger() batches.foreach { batch => val batchStartPlan = curPlan @@ -90,11 +92,7 @@ abstract class RuleExecutor[TreeType <: TreeNode[_]] extends Logging { if (!result.fastEquals(plan)) { queryExecutionMetrics.incNumEffectiveExecution(rule.ruleName) queryExecutionMetrics.incTimeEffectiveExecutionBy(rule.ruleName, runTime) - logTrace( - s""" - |=== Applying Rule ${rule.ruleName} === - |${sideBySide(plan.treeString, result.treeString).mkString("\n")} - """.stripMargin) + planChangeLogger.log(rule.ruleName, plan, result) } queryExecutionMetrics.incExecutionTimeBy(rule.ruleName, runTime) queryExecutionMetrics.incNumExecution(rule.ruleName) @@ -143,4 +141,29 @@ abstract class RuleExecutor[TreeType <: TreeNode[_]] extends Logging { curPlan } + + private class PlanChangeLogger { + + private val logLevel = SQLConf.get.optimizerPlanChangeLogLevel + + private val logRules = SQLConf.get.optimizerPlanChangeRules.map(Utils.stringToSeq) + + def log(ruleName: String, oldPlan: TreeType, newPlan: TreeType): Unit = { + if (logRules.isEmpty || logRules.get.contains(ruleName)) { + lazy val message = + s""" + |=== Applying Rule ${ruleName} === + |${sideBySide(oldPlan.treeString, newPlan.treeString).mkString("\n")} + """.stripMargin + logLevel match { + case "TRACE" => logTrace(message) + case "DEBUG" => logDebug(message) + case "INFO" => logInfo(message) + case "WARN" => logWarning(message) + case "ERROR" => logError(message) + case _ => logTrace(message) + } + } + } + } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/CaseInsensitiveMap.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/CaseInsensitiveMap.scala index bb2c5926ae9b..06f95989f2e3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/CaseInsensitiveMap.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/CaseInsensitiveMap.scala @@ -24,6 +24,8 @@ import java.util.Locale * case-sensitive information is required. The primary constructor is marked private to avoid * nested case-insensitive map creation, otherwise the keys in the original map will become * case-insensitive in this scenario. + * Note: CaseInsensitiveMap is serializable. However, after transformation, e.g. `filterKeys()`, + * it may become not serializable. 
*/ class CaseInsensitiveMap[T] private (val originalMap: Map[String, T]) extends Map[String, T] with Serializable { @@ -42,7 +44,7 @@ class CaseInsensitiveMap[T] private (val originalMap: Map[String, T]) extends Ma override def iterator: Iterator[(String, T)] = keyLowerCasedMap.iterator override def -(key: String): Map[String, T] = { - new CaseInsensitiveMap(originalMap.filterKeys(!_.equalsIgnoreCase(key))) + new CaseInsensitiveMap(originalMap.filter(!_._1.equalsIgnoreCase(key))) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index 02813d393979..81d7274607ac 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -300,28 +300,10 @@ object DateTimeUtils { * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]+[h]h:[m]m` */ def stringToTimestamp(s: UTF8String): Option[SQLTimestamp] = { - stringToTimestamp(s, defaultTimeZone(), rejectTzInString = false) + stringToTimestamp(s, defaultTimeZone()) } def stringToTimestamp(s: UTF8String, timeZone: TimeZone): Option[SQLTimestamp] = { - stringToTimestamp(s, timeZone, rejectTzInString = false) - } - - /** - * Converts a timestamp string to microseconds from the unix epoch, w.r.t. the given timezone. - * Returns None if the input string is not a valid timestamp format. - * - * @param s the input timestamp string. - * @param timeZone the timezone of the timestamp string, will be ignored if the timestamp string - * already contains timezone information and `forceTimezone` is false. - * @param rejectTzInString if true, rejects timezone in the input string, i.e., if the - * timestamp string contains timezone, like `2000-10-10 00:00:00+00:00`, - * return None. 
- */ - def stringToTimestamp( - s: UTF8String, - timeZone: TimeZone, - rejectTzInString: Boolean): Option[SQLTimestamp] = { if (s == null) { return None } @@ -439,8 +421,6 @@ object DateTimeUtils { return None } - if (tz.isDefined && rejectTzInString) return None - val c = if (tz.isEmpty) { Calendar.getInstance(timeZone) } else { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala index ca22ea24207e..bc861a805ce6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala @@ -62,8 +62,10 @@ object StringUtils { private[this] val trueStrings = Set("t", "true", "y", "yes", "1").map(UTF8String.fromString) private[this] val falseStrings = Set("f", "false", "n", "no", "0").map(UTF8String.fromString) + // scalastyle:off caselocale def isTrueString(s: UTF8String): Boolean = trueStrings.contains(s.toLowerCase) def isFalseString(s: UTF8String): Boolean = falseStrings.contains(s.toLowerCase) + // scalastyle:on caselocale /** * This utility can be used for filtering pattern in the "Like" of "Show Tables / Functions" DDL diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 738d8fee891d..b699707d8523 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -171,6 +171,26 @@ object SQLConf { .intConf .createWithDefault(10) + val OPTIMIZER_PLAN_CHANGE_LOG_LEVEL = buildConf("spark.sql.optimizer.planChangeLog.level") + .internal() + .doc("Configures the log level for logging the change from the original plan to the new " + + "plan after a rule is applied. The value can be 'trace', 'debug', 'info', 'warn', or " + + "'error'. The default log level is 'trace'.") + .stringConf + .transform(_.toUpperCase(Locale.ROOT)) + .checkValue(logLevel => Set("TRACE", "DEBUG", "INFO", "WARN", "ERROR").contains(logLevel), + "Invalid value for 'spark.sql.optimizer.planChangeLog.level'. Valid values are " + + "'trace', 'debug', 'info', 'warn' and 'error'.") + .createWithDefault("trace") + + val OPTIMIZER_PLAN_CHANGE_LOG_RULES = buildConf("spark.sql.optimizer.planChangeLog.rules") + .internal() + .doc("If this configuration is set, the optimizer will only log plan changes caused by " + + "applying the rules specified in this configuration. The value can be a list of rule " + + "names separated by comma.") + .stringConf + .createOptional + val COMPRESS_CACHED = buildConf("spark.sql.inMemoryColumnarStorage.compressed") .doc("When set to true Spark SQL will automatically select a compression codec for each " + "column based on statistics of the data.") @@ -235,13 +255,6 @@ object SQLConf { .intConf .createWithDefault(4) - val LIMIT_FLAT_GLOBAL_LIMIT = buildConf("spark.sql.limit.flatGlobalLimit") - .internal() - .doc("During global limit, try to evenly distribute limited rows across data " + - "partitions. 
If disabled, scanning data partitions sequentially until reaching limit number.") - .booleanConf - .createWithDefault(true) - val ADVANCED_PARTITION_PREDICATE_PUSHDOWN = buildConf("spark.sql.hive.advancedPartitionPredicatePushdown.enabled") .internal() @@ -249,22 +262,6 @@ object SQLConf { .booleanConf .createWithDefault(true) - val ENABLE_FALL_BACK_TO_HDFS_FOR_STATS = - buildConf("spark.sql.statistics.fallBackToHdfs") - .doc("If the table statistics are not available from table metadata enable fall back to hdfs." + - " This is useful in determining if a table is small enough to use auto broadcast joins.") - .booleanConf - .createWithDefault(false) - - val DEFAULT_SIZE_IN_BYTES = buildConf("spark.sql.defaultSizeInBytes") - .internal() - .doc("The default table size used in query planning. By default, it is set to Long.MaxValue " + - "which is larger than `spark.sql.autoBroadcastJoinThreshold` to be more conservative. " + - "That is to say by default the optimizer will not choose to broadcast a table unless it " + - "knows for sure its size is small enough.") - .longConf - .createWithDefault(Long.MaxValue) - val SHUFFLE_PARTITIONS = buildConf("spark.sql.shuffle.partitions") .doc("The default number of partitions to use when shuffling data for joins or aggregations.") .intConf @@ -454,8 +451,11 @@ object SQLConf { .createWithDefault(10) val PARQUET_WRITE_LEGACY_FORMAT = buildConf("spark.sql.parquet.writeLegacyFormat") - .doc("Whether to be compatible with the legacy Parquet format adopted by Spark 1.4 and prior " + - "versions, when converting Parquet schema to Spark SQL schema and vice versa.") + .doc("If true, data will be written in a way of Spark 1.4 and earlier. For example, decimal " + + "values will be written in Apache Parquet's fixed-length byte array format, which other " + + "systems such as Apache Hive and Apache Impala use. If false, the newer format in Parquet " + + "will be used. For example, decimals will be written in int-based format. If Parquet " + + "output is intended for use with systems that do not support this newer format, set to true.") .booleanConf .createWithDefault(false) @@ -580,16 +580,6 @@ object SQLConf { .checkValues(HiveCaseSensitiveInferenceMode.values.map(_.toString)) .createWithDefault(HiveCaseSensitiveInferenceMode.INFER_AND_SAVE.toString) - val TYPECOERCION_COMPARE_DATE_TIMESTAMP_IN_TIMESTAMP = - buildConf("spark.sql.typeCoercion.compareDateTimestampInTimestamp") - .internal() - .doc("When true (default), compare Date with Timestamp after converting both sides to " + - "Timestamp. This behavior is compatible with Hive 2.2 or later. See HIVE-15236. " + - "When false, restore the behavior prior to Spark 2.4. Compare Date with Timestamp after " + - "converting both sides to string. This config will be removed in spark 3.0") - .booleanConf - .createWithDefault(true) - val OPTIMIZER_METADATA_ONLY = buildConf("spark.sql.optimizer.metadataOnly") .doc("When true, enable the metadata-only query optimization that use the table's metadata " + "to produce the partition columns instead of table scans. It applies when all the columns " + @@ -608,7 +598,7 @@ object SQLConf { .internal() .doc("When true, force the output schema of the from_json() function to be nullable " + "(including all the fields). Otherwise, the schema might not be compatible with" + - "actual data, which leads to curruptions.") + "actual data, which leads to corruptions. 
This config will be removed in Spark 3.0.") .booleanConf .createWithDefault(true) @@ -984,8 +974,9 @@ object SQLConf { "Note: This configuration cannot be changed between query restarts from the same " + "checkpoint location.") .stringConf + .transform(_.toLowerCase(Locale.ROOT)) .checkValue( - str => Set("min", "max").contains(str.toLowerCase), + str => Set("min", "max").contains(str), "Invalid value for 'spark.sql.streaming.multipleWatermarkPolicy'. " + "Valid values are 'min' and 'max'") .createWithDefault("min") // must be same as MultipleWatermarkPolicy.DEFAULT_POLICY_NAME @@ -1072,7 +1063,7 @@ object SQLConf { .createWithDefault(10000L) val STREAMING_NO_DATA_MICRO_BATCHES_ENABLED = - buildConf("spark.sql.streaming.noDataMicroBatchesEnabled") + buildConf("spark.sql.streaming.noDataMicroBatches.enabled") .doc( "Whether streaming micro-batch engine will execute batches without data " + "for eager state management for stateful streaming queries.") @@ -1098,6 +1089,30 @@ object SQLConf { .internal() .stringConf + val PARALLEL_FILE_LISTING_IN_STATS_COMPUTATION = + buildConf("spark.sql.statistics.parallelFileListingInStatsComputation.enabled") + .internal() + .doc("When true, SQL commands use parallel file listing, " + + "as opposed to single thread listing." + + "This usually speeds up commands that need to list many directories.") + .booleanConf + .createWithDefault(true) + + val ENABLE_FALL_BACK_TO_HDFS_FOR_STATS = buildConf("spark.sql.statistics.fallBackToHdfs") + .doc("If the table statistics are not available from table metadata enable fall back to hdfs." + + " This is useful in determining if a table is small enough to use auto broadcast joins.") + .booleanConf + .createWithDefault(false) + + val DEFAULT_SIZE_IN_BYTES = buildConf("spark.sql.defaultSizeInBytes") + .internal() + .doc("The default table size used in query planning. By default, it is set to Long.MaxValue " + + "which is larger than `spark.sql.autoBroadcastJoinThreshold` to be more conservative. " + + "That is to say by default the optimizer will not choose to broadcast a table unless it " + + "knows for sure its size is small enough.") + .longConf + .createWithDefault(Long.MaxValue) + val NDV_MAX_ERROR = buildConf("spark.sql.statistics.ndv.maxError") .internal() @@ -1284,15 +1299,15 @@ object SQLConf { .booleanConf .createWithDefault(true) - val PANDAS_GROUPED_MAP_ASSIGN_COLUMNS_BY_POSITION = - buildConf("spark.sql.execution.pandas.groupedMap.assignColumnsByPosition") + val PANDAS_GROUPED_MAP_ASSIGN_COLUMNS_BY_NAME = + buildConf("spark.sql.legacy.execution.pandas.groupedMap.assignColumnsByName") .internal() - .doc("When true, a grouped map Pandas UDF will assign columns from the returned " + - "Pandas DataFrame based on position, regardless of column label type. When false, " + - "columns will be looked up by name if labeled with a string and fallback to use " + - "position if not. This configuration will be deprecated in future releases.") + .doc("When true, columns will be looked up by name if labeled with a string and fallback " + + "to use position if not. When false, a grouped map Pandas UDF will assign columns from " + + "the returned Pandas DataFrame based on position, regardless of column label type. 
" + + "This configuration will be deprecated in future releases.") .booleanConf - .createWithDefault(false) + .createWithDefault(true) val REPLACE_EXCEPT_WITH_FILTER = buildConf("spark.sql.optimizer.replaceExceptWithFilter") .internal() @@ -1317,6 +1332,15 @@ object SQLConf { .booleanConf .createWithDefault(true) + val LITERAL_PICK_MINIMUM_PRECISION = + buildConf("spark.sql.legacy.literal.pickMinimumPrecision") + .internal() + .doc("When integral literal is used in decimal operations, pick a minimum precision " + + "required by the literal if this config is true, to make the resulting precision and/or " + + "scale smaller. This can reduce the possibility of precision lose and/or overflow.") + .booleanConf + .createWithDefault(true) + val SQL_OPTIONS_REDACTION_PATTERN = buildConf("spark.sql.redaction.options.regex") .doc("Regex to decide which keys in a Spark SQL command's options map contain sensitive " + @@ -1347,7 +1371,7 @@ object SQLConf { .createWithDefault(false) val ALLOW_CREATING_MANAGED_TABLE_USING_NONEMPTY_LOCATION = - buildConf("spark.sql.allowCreatingManagedTableUsingNonemptyLocation") + buildConf("spark.sql.legacy.allowCreatingManagedTableUsingNonemptyLocation") .internal() .doc("When this option is set to true, creating managed tables with nonempty location " + "is allowed. Otherwise, an analysis exception is thrown. ") @@ -1387,13 +1411,6 @@ object SQLConf { .stringConf .createWithDefault("") - val REJECT_TIMEZONE_IN_STRING = buildConf("spark.sql.function.rejectTimezoneInString") - .internal() - .doc("If true, `to_utc_timestamp` and `from_utc_timestamp` return null if the input string " + - "contains a timezone part, e.g. `2000-10-10 00:00:00+00:00`.") - .booleanConf - .createWithDefault(true) - object PartitionOverwriteMode extends Enumeration { val STATIC, DYNAMIC = Value } @@ -1429,7 +1446,7 @@ object SQLConf { .createWithDefault(true) val NESTED_SCHEMA_PRUNING_ENABLED = - buildConf("spark.sql.nestedSchemaPruning.enabled") + buildConf("spark.sql.optimizer.nestedSchemaPruning.enabled") .internal() .doc("Prune nested fields from a logical relation's output which are unnecessary in " + "satisfying a query. This optimization allows columnar file format readers to avoid " + @@ -1462,12 +1479,6 @@ object SQLConf { .booleanConf .createWithDefault(true) - val LEGACY_SIZE_OF_NULL = buildConf("spark.sql.legacy.sizeOfNull") - .doc("If it is set to true, size of null returns -1. This behavior was inherited from Hive. " + - "The size function returns null for null input if the flag is disabled.") - .booleanConf - .createWithDefault(true) - val REPL_EAGER_EVAL_ENABLED = buildConf("spark.sql.repl.eagerEval.enabled") .doc("Enables eager evaluation or not. When true, the top K rows of Dataset will be " + "displayed if and only if the REPL supports the eager evaluation. Currently, the " + @@ -1517,6 +1528,22 @@ object SQLConf { .checkValues((1 to 9).toSet + Deflater.DEFAULT_COMPRESSION) .createWithDefault(Deflater.DEFAULT_COMPRESSION) + val COMPARE_DATE_TIMESTAMP_IN_TIMESTAMP = + buildConf("spark.sql.legacy.compareDateTimestampInTimestamp") + .internal() + .doc("When true (default), compare Date with Timestamp after converting both sides to " + + "Timestamp. This behavior is compatible with Hive 2.2 or later. See HIVE-15236. " + + "When false, restore the behavior prior to Spark 2.4. Compare Date with Timestamp after " + + "converting both sides to string. 
This config will be removed in Spark 3.0.") + .booleanConf + .createWithDefault(true) + + val LEGACY_SIZE_OF_NULL = buildConf("spark.sql.legacy.sizeOfNull") + .doc("If it is set to true, size of null returns -1. This behavior was inherited from Hive. " + + "The size function returns null for null input if the flag is disabled.") + .booleanConf + .createWithDefault(true) + val LEGACY_REPLACE_DATABRICKS_SPARK_AVRO_ENABLED = buildConf("spark.sql.legacy.replaceDatabricksSparkAvro.enabled") .doc("If it is set to true, the data source provider com.databricks.spark.avro is mapped " + @@ -1534,14 +1561,12 @@ object SQLConf { .booleanConf .createWithDefault(false) - val PARALLEL_FILE_LISTING_IN_STATS_COMPUTATION = - buildConf("spark.sql.parallelFileListingInStatsComputation.enabled") - .internal() - .doc("When true, SQL commands use parallel file listing, " + - "as opposed to single thread listing." + - "This usually speeds up commands that need to list many directories.") - .booleanConf - .createWithDefault(true) + val LEGACY_INTEGRALDIVIDE_RETURN_LONG = buildConf("spark.sql.legacy.integralDivide.returnBigint") + .doc("If it is set to true, the div operator returns always a bigint. This behavior was " + + "inherited from Hive. Otherwise, the return type is the data type of the operands.") + .internal() + .booleanConf + .createWithDefault(false) } /** @@ -1570,6 +1595,10 @@ class SQLConf extends Serializable with Logging { def optimizerInSetConversionThreshold: Int = getConf(OPTIMIZER_INSET_CONVERSION_THRESHOLD) + def optimizerPlanChangeLogLevel: String = getConf(OPTIMIZER_PLAN_CHANGE_LOG_LEVEL) + + def optimizerPlanChangeRules: Option[String] = getConf(OPTIMIZER_PLAN_CHANGE_LOG_RULES) + def stateStoreProviderClass: String = getConf(STATE_STORE_PROVIDER_CLASS) def stateStoreMinDeltasForSnapshot: Int = getConf(STATE_STORE_MIN_DELTAS_FOR_SNAPSHOT) @@ -1675,8 +1704,7 @@ class SQLConf extends Serializable with Logging { def caseSensitiveInferenceMode: HiveCaseSensitiveInferenceMode.Value = HiveCaseSensitiveInferenceMode.withName(getConf(HIVE_CASE_SENSITIVE_INFERENCE)) - def compareDateTimestampInTimestamp : Boolean = - getConf(TYPECOERCION_COMPARE_DATE_TIMESTAMP_IN_TIMESTAMP) + def compareDateTimestampInTimestamp : Boolean = getConf(COMPARE_DATE_TIMESTAMP_IN_TIMESTAMP) def gatherFastStats: Boolean = getConf(GATHER_FASTSTAT) @@ -1741,19 +1769,13 @@ class SQLConf extends Serializable with Logging { def limitScaleUpFactor: Int = getConf(LIMIT_SCALE_UP_FACTOR) - def limitFlatGlobalLimit: Boolean = getConf(LIMIT_FLAT_GLOBAL_LIMIT) - def advancedPartitionPredicatePushdownEnabled: Boolean = getConf(ADVANCED_PARTITION_PREDICATE_PUSHDOWN) - def fallBackToHdfsForStatsEnabled: Boolean = getConf(ENABLE_FALL_BACK_TO_HDFS_FOR_STATS) - def preferSortMergeJoin: Boolean = getConf(PREFER_SORTMERGEJOIN) def enableRadixSort: Boolean = getConf(RADIX_SORT_ENABLED) - def defaultSizeInBytes: Long = getConf(DEFAULT_SIZE_IN_BYTES) - def isParquetSchemaMergingEnabled: Boolean = getConf(PARQUET_SCHEMA_MERGING_ENABLED) def isParquetSchemaRespectSummaries: Boolean = getConf(PARQUET_SCHEMA_RESPECT_SUMMARIES) @@ -1845,6 +1867,13 @@ class SQLConf extends Serializable with Logging { def sessionLocalTimeZone: String = getConf(SQLConf.SESSION_LOCAL_TIMEZONE) + def parallelFileListingInStatsComputation: Boolean = + getConf(SQLConf.PARALLEL_FILE_LISTING_IN_STATS_COMPUTATION) + + def fallBackToHdfsForStatsEnabled: Boolean = getConf(ENABLE_FALL_BACK_TO_HDFS_FOR_STATS) + + def defaultSizeInBytes: Long = getConf(DEFAULT_SIZE_IN_BYTES) + def 
ndvMaxError: Double = getConf(NDV_MAX_ERROR) def histogramEnabled: Boolean = getConf(HISTOGRAM_ENABLED) @@ -1899,13 +1928,15 @@ class SQLConf extends Serializable with Logging { def pandasRespectSessionTimeZone: Boolean = getConf(PANDAS_RESPECT_SESSION_LOCAL_TIMEZONE) - def pandasGroupedMapAssignColumnssByPosition: Boolean = - getConf(SQLConf.PANDAS_GROUPED_MAP_ASSIGN_COLUMNS_BY_POSITION) + def pandasGroupedMapAssignColumnsByName: Boolean = + getConf(SQLConf.PANDAS_GROUPED_MAP_ASSIGN_COLUMNS_BY_NAME) def replaceExceptWithFilter: Boolean = getConf(REPLACE_EXCEPT_WITH_FILTER) def decimalOperationsAllowPrecisionLoss: Boolean = getConf(DECIMAL_OPERATIONS_ALLOW_PREC_LOSS) + def literalPickMinimumPrecision: Boolean = getConf(LITERAL_PICK_MINIMUM_PRECISION) + def continuousStreamingExecutorQueueSize: Int = getConf(CONTINUOUS_STREAMING_EXECUTOR_QUEUE_SIZE) def continuousStreamingExecutorPollIntervalMs: Long = @@ -1947,8 +1978,7 @@ class SQLConf extends Serializable with Logging { def setOpsPrecedenceEnforced: Boolean = getConf(SQLConf.LEGACY_SETOPS_PRECEDENCE_ENABLED) - def parallelFileListingInStatsComputation: Boolean = - getConf(SQLConf.PARALLEL_FILE_LISTING_IN_STATS_COMPUTATION) + def integralDivideReturnLong: Boolean = getConf(SQLConf.LEGACY_INTEGRALDIVIDE_RETURN_LONG) /** ********************** SQLConf functionality methods ************ */ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructField.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructField.scala index 902cae9150ed..35f9970a0aae 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructField.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructField.scala @@ -79,6 +79,8 @@ case class StructField( /** * Returns a string containing a schema in DDL format. For example, the following value: * `StructField("eventId", IntegerType)` will be converted to `eventId` INT. + * + * @since 2.4.0 */ def toDDL: String = { val comment = getComment() diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala index c5ca169c955d..06289b148320 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala @@ -365,6 +365,8 @@ case class StructType(fields: Array[StructField]) extends DataType with Seq[Stru * `StructType(Seq(StructField("eventId", IntegerType), StructField("s", StringType)))` * will be converted to `eventId` INT, `s` STRING. * The returned DDL schema can be used in a table creation. + * + * @since 2.4.0 */ def toDDL: String = fields.map(_.toDDL).mkString(",") @@ -441,6 +443,8 @@ object StructType extends AbstractDataType { /** * Creates StructType for a given DDL-formatted string, which is a comma separated list of field * definitions, e.g., a INT, b STRING. 
+ * + * @since 2.2.0 */ def fromDDL(ddl: String): StructType = CatalystSqlParser.parseTableSchema(ddl) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/util/SchemaUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/util/SchemaUtils.scala index 41ca270095ff..052014ab8674 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/util/SchemaUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/util/SchemaUtils.scala @@ -77,7 +77,9 @@ private[spark] object SchemaUtils { */ def checkColumnNameDuplication( columnNames: Seq[String], colType: String, caseSensitiveAnalysis: Boolean): Unit = { + // scalastyle:off caselocale val names = if (caseSensitiveAnalysis) columnNames else columnNames.map(_.toLowerCase) + // scalastyle:on caselocale if (names.distinct.length != names.length) { val duplicateColumns = names.groupBy(identity).collect { case (x, ys) if ys.length > 1 => s"`$x`" diff --git a/sql/catalyst/src/test/java/org/apache/spark/sql/catalyst/expressions/HiveHasherSuite.java b/sql/catalyst/src/test/java/org/apache/spark/sql/catalyst/expressions/HiveHasherSuite.java index 76930f936851..b67c6f3e6e85 100644 --- a/sql/catalyst/src/test/java/org/apache/spark/sql/catalyst/expressions/HiveHasherSuite.java +++ b/sql/catalyst/src/test/java/org/apache/spark/sql/catalyst/expressions/HiveHasherSuite.java @@ -17,8 +17,7 @@ package org.apache.spark.sql.catalyst.expressions; -import org.apache.spark.unsafe.memory.ByteArrayMemoryBlock; -import org.apache.spark.unsafe.memory.MemoryBlock; +import org.apache.spark.unsafe.Platform; import org.apache.spark.unsafe.types.UTF8String; import org.junit.Assert; import org.junit.Test; @@ -54,7 +53,7 @@ public void testKnownStringAndIntInputs() { for (int i = 0; i < inputs.length; i++) { UTF8String s = UTF8String.fromString("val_" + inputs[i]); - int hash = HiveHasher.hashUnsafeBytesBlock(s.getMemoryBlock()); + int hash = HiveHasher.hashUnsafeBytes(s.getBaseObject(), s.getBaseOffset(), s.numBytes()); Assert.assertEquals(expected[i], ((31 * inputs[i]) + hash)); } } @@ -90,13 +89,13 @@ public void randomizedStressTestBytes() { int byteArrSize = rand.nextInt(100) * 8; byte[] bytes = new byte[byteArrSize]; rand.nextBytes(bytes); - MemoryBlock mb = ByteArrayMemoryBlock.fromArray(bytes); Assert.assertEquals( - HiveHasher.hashUnsafeBytesBlock(mb), - HiveHasher.hashUnsafeBytesBlock(mb)); + HiveHasher.hashUnsafeBytes(bytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize), + HiveHasher.hashUnsafeBytes(bytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize)); - hashcodes.add(HiveHasher.hashUnsafeBytesBlock(mb)); + hashcodes.add(HiveHasher.hashUnsafeBytes( + bytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize)); } // A very loose bound. @@ -113,13 +112,13 @@ public void randomizedStressTestPaddedStrings() { byte[] strBytes = String.valueOf(i).getBytes(StandardCharsets.UTF_8); byte[] paddedBytes = new byte[byteArrSize]; System.arraycopy(strBytes, 0, paddedBytes, 0, strBytes.length); - MemoryBlock mb = ByteArrayMemoryBlock.fromArray(paddedBytes); Assert.assertEquals( - HiveHasher.hashUnsafeBytesBlock(mb), - HiveHasher.hashUnsafeBytesBlock(mb)); + HiveHasher.hashUnsafeBytes(paddedBytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize), + HiveHasher.hashUnsafeBytes(paddedBytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize)); - hashcodes.add(HiveHasher.hashUnsafeBytesBlock(mb)); + hashcodes.add(HiveHasher.hashUnsafeBytes( + paddedBytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize)); } // A very loose bound. 
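(Illustrative aside, not part of the patch.) The `@since` tags added above for `StructField.toDDL`, `StructType.toDDL`, and `StructType.fromDDL` document a DDL round trip. A minimal Scala sketch of that round trip follows; the exact spacing and backtick quoting of the generated DDL string is an assumption rather than verified output:

```scala
import org.apache.spark.sql.types._

// Build a small schema, render it as DDL, and parse that DDL back into a schema.
val schema = new StructType()
  .add("eventId", IntegerType)
  .add("s", StringType)

val ddl = schema.toDDL                    // roughly "`eventId` INT,`s` STRING" (spacing assumed)
val roundTripped = StructType.fromDDL(ddl)

// The parsed schema should carry the same field names as the original.
assert(roundTripped.fieldNames.sameElements(schema.fieldNames))
```

Since `fromDDL` delegates to `CatalystSqlParser.parseTableSchema`, the same DDL string form can be reused wherever a comma-separated field list is accepted, for example in table creation.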
diff --git a/sql/catalyst/src/test/java/org/apache/spark/sql/catalyst/expressions/RowBasedKeyValueBatchSuite.java b/sql/catalyst/src/test/java/org/apache/spark/sql/catalyst/expressions/RowBasedKeyValueBatchSuite.java index 2da87113c622..8da778800bb9 100644 --- a/sql/catalyst/src/test/java/org/apache/spark/sql/catalyst/expressions/RowBasedKeyValueBatchSuite.java +++ b/sql/catalyst/src/test/java/org/apache/spark/sql/catalyst/expressions/RowBasedKeyValueBatchSuite.java @@ -123,9 +123,8 @@ public void tearDown() { @Test public void emptyBatch() throws Exception { - RowBasedKeyValueBatch batch = RowBasedKeyValueBatch.allocate(keySchema, - valueSchema, taskMemoryManager, DEFAULT_CAPACITY); - try { + try (RowBasedKeyValueBatch batch = RowBasedKeyValueBatch.allocate(keySchema, + valueSchema, taskMemoryManager, DEFAULT_CAPACITY)) { Assert.assertEquals(0, batch.numRows()); try { batch.getKeyRow(-1); @@ -152,31 +151,24 @@ public void emptyBatch() throws Exception { // Expected exception; do nothing. } Assert.assertFalse(batch.rowIterator().next()); - } finally { - batch.close(); } } @Test - public void batchType() throws Exception { - RowBasedKeyValueBatch batch1 = RowBasedKeyValueBatch.allocate(keySchema, - valueSchema, taskMemoryManager, DEFAULT_CAPACITY); - RowBasedKeyValueBatch batch2 = RowBasedKeyValueBatch.allocate(fixedKeySchema, - valueSchema, taskMemoryManager, DEFAULT_CAPACITY); - try { + public void batchType() { + try (RowBasedKeyValueBatch batch1 = RowBasedKeyValueBatch.allocate(keySchema, + valueSchema, taskMemoryManager, DEFAULT_CAPACITY); + RowBasedKeyValueBatch batch2 = RowBasedKeyValueBatch.allocate(fixedKeySchema, + valueSchema, taskMemoryManager, DEFAULT_CAPACITY)) { Assert.assertEquals(batch1.getClass(), VariableLengthRowBasedKeyValueBatch.class); Assert.assertEquals(batch2.getClass(), FixedLengthRowBasedKeyValueBatch.class); - } finally { - batch1.close(); - batch2.close(); } } @Test public void setAndRetrieve() { - RowBasedKeyValueBatch batch = RowBasedKeyValueBatch.allocate(keySchema, - valueSchema, taskMemoryManager, DEFAULT_CAPACITY); - try { + try (RowBasedKeyValueBatch batch = RowBasedKeyValueBatch.allocate(keySchema, + valueSchema, taskMemoryManager, DEFAULT_CAPACITY)) { UnsafeRow ret1 = appendRow(batch, makeKeyRow(1, "A"), makeValueRow(1, 1)); Assert.assertTrue(checkValue(ret1, 1, 1)); UnsafeRow ret2 = appendRow(batch, makeKeyRow(2, "B"), makeValueRow(2, 2)); @@ -204,33 +196,27 @@ public void setAndRetrieve() { } catch (AssertionError e) { // Expected exception; do nothing. 
} - } finally { - batch.close(); } } @Test public void setUpdateAndRetrieve() { - RowBasedKeyValueBatch batch = RowBasedKeyValueBatch.allocate(keySchema, - valueSchema, taskMemoryManager, DEFAULT_CAPACITY); - try { + try (RowBasedKeyValueBatch batch = RowBasedKeyValueBatch.allocate(keySchema, + valueSchema, taskMemoryManager, DEFAULT_CAPACITY)) { appendRow(batch, makeKeyRow(1, "A"), makeValueRow(1, 1)); Assert.assertEquals(1, batch.numRows()); UnsafeRow retrievedValue = batch.getValueRow(0); updateValueRow(retrievedValue, 2, 2); UnsafeRow retrievedValue2 = batch.getValueRow(0); Assert.assertTrue(checkValue(retrievedValue2, 2, 2)); - } finally { - batch.close(); } } @Test public void iteratorTest() throws Exception { - RowBasedKeyValueBatch batch = RowBasedKeyValueBatch.allocate(keySchema, - valueSchema, taskMemoryManager, DEFAULT_CAPACITY); - try { + try (RowBasedKeyValueBatch batch = RowBasedKeyValueBatch.allocate(keySchema, + valueSchema, taskMemoryManager, DEFAULT_CAPACITY)) { appendRow(batch, makeKeyRow(1, "A"), makeValueRow(1, 1)); appendRow(batch, makeKeyRow(2, "B"), makeValueRow(2, 2)); appendRow(batch, makeKeyRow(3, "C"), makeValueRow(3, 3)); @@ -253,16 +239,13 @@ public void iteratorTest() throws Exception { Assert.assertTrue(checkKey(key3, 3, "C")); Assert.assertTrue(checkValue(value3, 3, 3)); Assert.assertFalse(iterator.next()); - } finally { - batch.close(); } } @Test public void fixedLengthTest() throws Exception { - RowBasedKeyValueBatch batch = RowBasedKeyValueBatch.allocate(fixedKeySchema, - valueSchema, taskMemoryManager, DEFAULT_CAPACITY); - try { + try (RowBasedKeyValueBatch batch = RowBasedKeyValueBatch.allocate(fixedKeySchema, + valueSchema, taskMemoryManager, DEFAULT_CAPACITY)) { appendRow(batch, makeKeyRow(11, 11), makeValueRow(1, 1)); appendRow(batch, makeKeyRow(22, 22), makeValueRow(2, 2)); appendRow(batch, makeKeyRow(33, 33), makeValueRow(3, 3)); @@ -293,16 +276,13 @@ public void fixedLengthTest() throws Exception { Assert.assertTrue(checkKey(key3, 33, 33)); Assert.assertTrue(checkValue(value3, 3, 3)); Assert.assertFalse(iterator.next()); - } finally { - batch.close(); } } @Test public void appendRowUntilExceedingCapacity() throws Exception { - RowBasedKeyValueBatch batch = RowBasedKeyValueBatch.allocate(keySchema, - valueSchema, taskMemoryManager, 10); - try { + try (RowBasedKeyValueBatch batch = RowBasedKeyValueBatch.allocate(keySchema, + valueSchema, taskMemoryManager, 10)) { UnsafeRow key = makeKeyRow(1, "A"); UnsafeRow value = makeValueRow(1, 1); for (int i = 0; i < 10; i++) { @@ -321,8 +301,6 @@ public void appendRowUntilExceedingCapacity() throws Exception { Assert.assertTrue(checkValue(value1, 1, 1)); } Assert.assertFalse(iterator.next()); - } finally { - batch.close(); } } @@ -330,9 +308,8 @@ public void appendRowUntilExceedingCapacity() throws Exception { public void appendRowUntilExceedingPageSize() throws Exception { // Use default size or spark.buffer.pageSize if specified int pageSizeToUse = (int) memoryManager.pageSizeBytes(); - RowBasedKeyValueBatch batch = RowBasedKeyValueBatch.allocate(keySchema, - valueSchema, taskMemoryManager, pageSizeToUse); //enough capacity - try { + try (RowBasedKeyValueBatch batch = RowBasedKeyValueBatch.allocate(keySchema, + valueSchema, taskMemoryManager, pageSizeToUse)) { UnsafeRow key = makeKeyRow(1, "A"); UnsafeRow value = makeValueRow(1, 1); int recordLength = 8 + key.getSizeInBytes() + value.getSizeInBytes() + 8; @@ -356,49 +333,44 @@ public void appendRowUntilExceedingPageSize() throws Exception { 
Assert.assertTrue(checkValue(value1, 1, 1)); } Assert.assertFalse(iterator.next()); - } finally { - batch.close(); } } @Test public void failureToAllocateFirstPage() throws Exception { memoryManager.limit(1024); - RowBasedKeyValueBatch batch = RowBasedKeyValueBatch.allocate(keySchema, - valueSchema, taskMemoryManager, DEFAULT_CAPACITY); - try { + try (RowBasedKeyValueBatch batch = RowBasedKeyValueBatch.allocate(keySchema, + valueSchema, taskMemoryManager, DEFAULT_CAPACITY)) { UnsafeRow key = makeKeyRow(1, "A"); UnsafeRow value = makeValueRow(11, 11); UnsafeRow ret = appendRow(batch, key, value); Assert.assertNull(ret); Assert.assertFalse(batch.rowIterator().next()); - } finally { - batch.close(); } } @Test public void randomizedTest() { - RowBasedKeyValueBatch batch = RowBasedKeyValueBatch.allocate(keySchema, - valueSchema, taskMemoryManager, DEFAULT_CAPACITY); - int numEntry = 100; - long[] expectedK1 = new long[numEntry]; - String[] expectedK2 = new String[numEntry]; - long[] expectedV1 = new long[numEntry]; - long[] expectedV2 = new long[numEntry]; - - for (int i = 0; i < numEntry; i++) { - long k1 = rand.nextLong(); - String k2 = getRandomString(rand.nextInt(256)); - long v1 = rand.nextLong(); - long v2 = rand.nextLong(); - appendRow(batch, makeKeyRow(k1, k2), makeValueRow(v1, v2)); - expectedK1[i] = k1; - expectedK2[i] = k2; - expectedV1[i] = v1; - expectedV2[i] = v2; - } - try { + try (RowBasedKeyValueBatch batch = RowBasedKeyValueBatch.allocate(keySchema, + valueSchema, taskMemoryManager, DEFAULT_CAPACITY)) { + int numEntry = 100; + long[] expectedK1 = new long[numEntry]; + String[] expectedK2 = new String[numEntry]; + long[] expectedV1 = new long[numEntry]; + long[] expectedV2 = new long[numEntry]; + + for (int i = 0; i < numEntry; i++) { + long k1 = rand.nextLong(); + String k2 = getRandomString(rand.nextInt(256)); + long v1 = rand.nextLong(); + long v2 = rand.nextLong(); + appendRow(batch, makeKeyRow(k1, k2), makeValueRow(v1, v2)); + expectedK1[i] = k1; + expectedK2[i] = k2; + expectedV1[i] = v1; + expectedV2[i] = v2; + } + for (int j = 0; j < 10000; j++) { int rowId = rand.nextInt(numEntry); if (rand.nextBoolean()) { @@ -410,8 +382,6 @@ public void randomizedTest() { Assert.assertTrue(checkValue(value, expectedV1[rowId], expectedV2[rowId])); } } - } finally { - batch.close(); } } } diff --git a/sql/catalyst/src/test/java/org/apache/spark/sql/catalyst/expressions/XXH64Suite.java b/sql/catalyst/src/test/java/org/apache/spark/sql/catalyst/expressions/XXH64Suite.java index cd8bce623c5d..1baee91b3439 100644 --- a/sql/catalyst/src/test/java/org/apache/spark/sql/catalyst/expressions/XXH64Suite.java +++ b/sql/catalyst/src/test/java/org/apache/spark/sql/catalyst/expressions/XXH64Suite.java @@ -24,8 +24,6 @@ import java.util.Set; import org.apache.spark.unsafe.Platform; -import org.apache.spark.unsafe.memory.ByteArrayMemoryBlock; -import org.apache.spark.unsafe.memory.MemoryBlock; import org.junit.Assert; import org.junit.Test; @@ -144,13 +142,13 @@ public void randomizedStressTestBytes() { int byteArrSize = rand.nextInt(100) * 8; byte[] bytes = new byte[byteArrSize]; rand.nextBytes(bytes); - MemoryBlock mb = ByteArrayMemoryBlock.fromArray(bytes); Assert.assertEquals( - hasher.hashUnsafeWordsBlock(mb), - hasher.hashUnsafeWordsBlock(mb)); + hasher.hashUnsafeWords(bytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize), + hasher.hashUnsafeWords(bytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize)); - hashcodes.add(hasher.hashUnsafeWordsBlock(mb)); + hashcodes.add(hasher.hashUnsafeWords( + bytes, 
Platform.BYTE_ARRAY_OFFSET, byteArrSize)); } // A very loose bound. @@ -167,13 +165,13 @@ public void randomizedStressTestPaddedStrings() { byte[] strBytes = String.valueOf(i).getBytes(StandardCharsets.UTF_8); byte[] paddedBytes = new byte[byteArrSize]; System.arraycopy(strBytes, 0, paddedBytes, 0, strBytes.length); - MemoryBlock mb = ByteArrayMemoryBlock.fromArray(paddedBytes); Assert.assertEquals( - hasher.hashUnsafeWordsBlock(mb), - hasher.hashUnsafeWordsBlock(mb)); + hasher.hashUnsafeWords(paddedBytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize), + hasher.hashUnsafeWords(paddedBytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize)); - hashcodes.add(hasher.hashUnsafeWordsBlock(mb)); + hashcodes.add(hasher.hashUnsafeWords( + paddedBytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize)); } // A very loose bound. diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/HashBenchmark.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/HashBenchmark.scala index 9a89e6290e69..4226ab3773fe 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/HashBenchmark.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/HashBenchmark.scala @@ -17,103 +17,96 @@ package org.apache.spark.sql +import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.GenerateSafeProjection import org.apache.spark.sql.types._ -import org.apache.spark.util.Benchmark /** * Benchmark for the previous interpreted hash function(InternalRow.hashCode) vs codegened * hash expressions (Murmur3Hash/xxHash64). + * To run this benchmark: + * {{{ + * 1. without sbt: + * bin/spark-submit --class --jars + * 2. build/sbt "catalyst/test:runMain " + * 3. generate result: + * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "catalyst/test:runMain " + * Results will be written to "benchmarks/HashBenchmark-results.txt". + * }}} */ -object HashBenchmark { +object HashBenchmark extends BenchmarkBase { def test(name: String, schema: StructType, numRows: Int, iters: Int): Unit = { - val generator = RandomDataGenerator.forType(schema, nullable = false).get - val encoder = RowEncoder(schema) - val attrs = schema.toAttributes - val safeProjection = GenerateSafeProjection.generate(attrs, attrs) + runBenchmark(name) { + val generator = RandomDataGenerator.forType(schema, nullable = false).get + val encoder = RowEncoder(schema) + val attrs = schema.toAttributes + val safeProjection = GenerateSafeProjection.generate(attrs, attrs) - val rows = (1 to numRows).map(_ => - // The output of encoder is UnsafeRow, use safeProjection to turn in into safe format. - safeProjection(encoder.toRow(generator().asInstanceOf[Row])).copy() - ).toArray + val rows = (1 to numRows).map(_ => + // The output of encoder is UnsafeRow, use safeProjection to turn in into safe format. 
+ safeProjection(encoder.toRow(generator().asInstanceOf[Row])).copy() + ).toArray - val benchmark = new Benchmark("Hash For " + name, iters * numRows.toLong) - benchmark.addCase("interpreted version") { _: Int => - var sum = 0 - for (_ <- 0L until iters) { - var i = 0 - while (i < numRows) { - sum += rows(i).hashCode() - i += 1 + val benchmark = new Benchmark("Hash For " + name, iters * numRows.toLong, output = output) + benchmark.addCase("interpreted version") { _: Int => + var sum = 0 + for (_ <- 0L until iters) { + var i = 0 + while (i < numRows) { + sum += rows(i).hashCode() + i += 1 + } } } - } - val getHashCode = UnsafeProjection.create(new Murmur3Hash(attrs) :: Nil, attrs) - benchmark.addCase("codegen version") { _: Int => - var sum = 0 - for (_ <- 0L until iters) { - var i = 0 - while (i < numRows) { - sum += getHashCode(rows(i)).getInt(0) - i += 1 + val getHashCode = UnsafeProjection.create(new Murmur3Hash(attrs) :: Nil, attrs) + benchmark.addCase("codegen version") { _: Int => + var sum = 0 + for (_ <- 0L until iters) { + var i = 0 + while (i < numRows) { + sum += getHashCode(rows(i)).getInt(0) + i += 1 + } } } - } - val getHashCode64b = UnsafeProjection.create(new XxHash64(attrs) :: Nil, attrs) - benchmark.addCase("codegen version 64-bit") { _: Int => - var sum = 0 - for (_ <- 0L until iters) { - var i = 0 - while (i < numRows) { - sum += getHashCode64b(rows(i)).getInt(0) - i += 1 + val getHashCode64b = UnsafeProjection.create(new XxHash64(attrs) :: Nil, attrs) + benchmark.addCase("codegen version 64-bit") { _: Int => + var sum = 0 + for (_ <- 0L until iters) { + var i = 0 + while (i < numRows) { + sum += getHashCode64b(rows(i)).getInt(0) + i += 1 + } } } - } - val getHiveHashCode = UnsafeProjection.create(new HiveHash(attrs) :: Nil, attrs) - benchmark.addCase("codegen HiveHash version") { _: Int => - var sum = 0 - for (_ <- 0L until iters) { - var i = 0 - while (i < numRows) { - sum += getHiveHashCode(rows(i)).getInt(0) - i += 1 + val getHiveHashCode = UnsafeProjection.create(new HiveHash(attrs) :: Nil, attrs) + benchmark.addCase("codegen HiveHash version") { _: Int => + var sum = 0 + for (_ <- 0L until iters) { + var i = 0 + while (i < numRows) { + sum += getHiveHashCode(rows(i)).getInt(0) + i += 1 + } } } - } - benchmark.run() + benchmark.run() + } } - def main(args: Array[String]): Unit = { + override def runBenchmarkSuite(): Unit = { val singleInt = new StructType().add("i", IntegerType) - /* - Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz - Hash For single ints: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - interpreted version 3262 / 3267 164.6 6.1 1.0X - codegen version 6448 / 6718 83.3 12.0 0.5X - codegen version 64-bit 6088 / 6154 88.2 11.3 0.5X - codegen HiveHash version 4732 / 4745 113.5 8.8 0.7X - */ test("single ints", singleInt, 1 << 15, 1 << 14) val singleLong = new StructType().add("i", LongType) - /* - Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz - Hash For single longs: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - interpreted version 3716 / 3726 144.5 6.9 1.0X - codegen version 7706 / 7732 69.7 14.4 0.5X - codegen version 64-bit 6370 / 6399 84.3 11.9 0.6X - codegen HiveHash version 4924 / 5026 109.0 9.2 0.8X - */ test("single longs", singleLong, 1 << 15, 1 << 14) val normal = new StructType() @@ -131,45 +124,18 @@ object HashBenchmark { .add("binary", BinaryType) 
.add("date", DateType) .add("timestamp", TimestampType) - /* - Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz - Hash For normal: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - interpreted version 2985 / 3013 0.7 1423.4 1.0X - codegen version 2422 / 2434 0.9 1155.1 1.2X - codegen version 64-bit 856 / 920 2.5 408.0 3.5X - codegen HiveHash version 4501 / 4979 0.5 2146.4 0.7X - */ test("normal", normal, 1 << 10, 1 << 11) val arrayOfInt = ArrayType(IntegerType) val array = new StructType() .add("array", arrayOfInt) .add("arrayOfArray", ArrayType(arrayOfInt)) - /* - Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz - Hash For array: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - interpreted version 3100 / 3555 0.0 23651.8 1.0X - codegen version 5779 / 5865 0.0 44088.4 0.5X - codegen version 64-bit 4738 / 4821 0.0 36151.7 0.7X - codegen HiveHash version 2200 / 2246 0.1 16785.9 1.4X - */ test("array", array, 1 << 8, 1 << 9) val mapOfInt = MapType(IntegerType, IntegerType) val map = new StructType() .add("map", mapOfInt) .add("mapOfMap", MapType(IntegerType, mapOfInt)) - /* - Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz - Hash For map: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - interpreted version 0 / 0 48.1 20.8 1.0X - codegen version 257 / 275 0.0 62768.7 0.0X - codegen version 64-bit 226 / 240 0.0 55224.5 0.0X - codegen HiveHash version 89 / 96 0.0 21708.8 0.0X - */ test("map", map, 1 << 6, 1 << 6) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/HashByteArrayBenchmark.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/HashByteArrayBenchmark.scala index f6c8111f5bc5..7dc865d85af0 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/HashByteArrayBenchmark.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/HashByteArrayBenchmark.scala @@ -19,15 +19,24 @@ package org.apache.spark.sql import java.util.Random +import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} import org.apache.spark.sql.catalyst.expressions.{HiveHasher, XXH64} import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.hash.Murmur3_x86_32 -import org.apache.spark.util.Benchmark /** * Synthetic benchmark for MurMurHash 3 and xxHash64. + * To run this benchmark: + * {{{ + * 1. without sbt: + * bin/spark-submit --class --jars + * 2. build/sbt "catalyst/test:runMain " + * 3. generate result: + * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "catalyst/test:runMain " + * Results will be written to "benchmarks/HashByteArrayBenchmark-results.txt". 
+ * }}} */ -object HashByteArrayBenchmark { +object HashByteArrayBenchmark extends BenchmarkBase { def test(length: Int, seed: Long, numArrays: Int, iters: Int): Unit = { val random = new Random(seed) val arrays = Array.fill[Array[Byte]](numArrays) { @@ -36,8 +45,8 @@ object HashByteArrayBenchmark { bytes } - val benchmark = - new Benchmark("Hash byte arrays with length " + length, iters * numArrays.toLong) + val benchmark = new Benchmark( + "Hash byte arrays with length " + length, iters * numArrays.toLong, output = output) benchmark.addCase("Murmur3_x86_32") { _: Int => var sum = 0L for (_ <- 0L until iters) { @@ -74,96 +83,17 @@ object HashByteArrayBenchmark { benchmark.run() } - def main(args: Array[String]): Unit = { - /* - Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz - Hash byte arrays with length 8: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Murmur3_x86_32 12 / 16 174.3 5.7 1.0X - xxHash 64-bit 17 / 22 120.0 8.3 0.7X - HiveHasher 13 / 15 162.1 6.2 0.9X - */ - test(8, 42L, 1 << 10, 1 << 11) - - /* - Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz - Hash byte arrays with length 16: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Murmur3_x86_32 19 / 22 107.6 9.3 1.0X - xxHash 64-bit 20 / 24 104.6 9.6 1.0X - HiveHasher 24 / 28 87.0 11.5 0.8X - */ - test(16, 42L, 1 << 10, 1 << 11) - - /* - Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz - Hash byte arrays with length 24: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Murmur3_x86_32 28 / 32 74.8 13.4 1.0X - xxHash 64-bit 24 / 29 87.3 11.5 1.2X - HiveHasher 36 / 41 57.7 17.3 0.8X - */ - test(24, 42L, 1 << 10, 1 << 11) - - // Add 31 to all arrays to create worse case alignment for xxHash. 
- /* - Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz - Hash byte arrays with length 31: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Murmur3_x86_32 41 / 45 51.1 19.6 1.0X - xxHash 64-bit 36 / 44 58.8 17.0 1.2X - HiveHasher 49 / 54 42.6 23.5 0.8X - */ - test(31, 42L, 1 << 10, 1 << 11) - - /* - Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz - Hash byte arrays with length 95: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Murmur3_x86_32 100 / 110 21.0 47.7 1.0X - xxHash 64-bit 74 / 78 28.2 35.5 1.3X - HiveHasher 189 / 196 11.1 90.3 0.5X - */ - test(64 + 31, 42L, 1 << 10, 1 << 11) - - /* - Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz - Hash byte arrays with length 287: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Murmur3_x86_32 299 / 311 7.0 142.4 1.0X - xxHash 64-bit 113 / 122 18.5 54.1 2.6X - HiveHasher 620 / 624 3.4 295.5 0.5X - */ - test(256 + 31, 42L, 1 << 10, 1 << 11) - - /* - Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz - Hash byte arrays with length 1055: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Murmur3_x86_32 1068 / 1070 2.0 509.1 1.0X - xxHash 64-bit 306 / 315 6.9 145.9 3.5X - HiveHasher 2316 / 2369 0.9 1104.3 0.5X - */ - test(1024 + 31, 42L, 1 << 10, 1 << 11) - - /* - Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz - Hash byte arrays with length 2079: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Murmur3_x86_32 2252 / 2274 0.9 1074.1 1.0X - xxHash 64-bit 534 / 580 3.9 254.6 4.2X - HiveHasher 4739 / 4786 0.4 2259.8 0.5X - */ - test(2048 + 31, 42L, 1 << 10, 1 << 11) - - /* - Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz - Hash byte arrays with length 8223: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Murmur3_x86_32 9249 / 9586 0.2 4410.5 1.0X - xxHash 64-bit 2897 / 3241 0.7 1381.6 3.2X - HiveHasher 19392 / 20211 0.1 9246.6 0.5X - */ - test(8192 + 31, 42L, 1 << 10, 1 << 11) + override def runBenchmarkSuite(): Unit = { + runBenchmark("Benchmark for MurMurHash 3 and xxHash64") { + test(8, 42L, 1 << 10, 1 << 11) + test(16, 42L, 1 << 10, 1 << 11) + test(24, 42L, 1 << 10, 1 << 11) + test(31, 42L, 1 << 10, 1 << 11) + test(64 + 31, 42L, 1 << 10, 1 << 11) + test(256 + 31, 42L, 1 << 10, 1 << 11) + test(1024 + 31, 42L, 1 << 10, 1 << 11) + test(2048 + 31, 42L, 1 << 10, 1 << 11) + test(8192 + 31, 42L, 1 << 10, 1 << 11) + } } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/UnsafeProjectionBenchmark.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/UnsafeProjectionBenchmark.scala index 6c6376994531..e7a99485cdf0 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/UnsafeProjectionBenchmark.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/UnsafeProjectionBenchmark.scala @@ -17,16 +17,23 @@ package org.apache.spark.sql +import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions.UnsafeProjection import 
org.apache.spark.sql.types._ -import org.apache.spark.util.Benchmark /** * Benchmark `UnsafeProjection` for fixed-length/primitive-type fields. + * {{{ + * To run this benchmark: + * 1. without sbt: bin/spark-submit --class + * 2. build/sbt "sql/test:runMain " + * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain " + * Results will be written to "benchmarks/UnsafeProjectionBenchmark-results.txt". + * }}} */ -object UnsafeProjectionBenchmark { +object UnsafeProjectionBenchmark extends BenchmarkBase { def generateRows(schema: StructType, numRows: Int): Array[InternalRow] = { val generator = RandomDataGenerator.forType(schema, nullable = false).get @@ -34,103 +41,92 @@ object UnsafeProjectionBenchmark { (1 to numRows).map(_ => encoder.toRow(generator().asInstanceOf[Row]).copy()).toArray } - def main(args: Array[String]) { - val iters = 1024 * 16 - val numRows = 1024 * 16 - - val benchmark = new Benchmark("unsafe projection", iters * numRows.toLong) - - - val schema1 = new StructType().add("l", LongType, false) - val attrs1 = schema1.toAttributes - val rows1 = generateRows(schema1, numRows) - val projection1 = UnsafeProjection.create(attrs1, attrs1) - - benchmark.addCase("single long") { _ => - for (_ <- 1 to iters) { - var sum = 0L - var i = 0 - while (i < numRows) { - sum += projection1(rows1(i)).getLong(0) - i += 1 + override def runBenchmarkSuite(): Unit = { + runBenchmark("unsafe projection") { + val iters = 1024 * 16 + val numRows = 1024 * 16 + + val benchmark = new Benchmark("unsafe projection", iters * numRows.toLong, output = output) + + val schema1 = new StructType().add("l", LongType, false) + val attrs1 = schema1.toAttributes + val rows1 = generateRows(schema1, numRows) + val projection1 = UnsafeProjection.create(attrs1, attrs1) + + benchmark.addCase("single long") { _ => + for (_ <- 1 to iters) { + var sum = 0L + var i = 0 + while (i < numRows) { + sum += projection1(rows1(i)).getLong(0) + i += 1 + } } } - } - - val schema2 = new StructType().add("l", LongType, true) - val attrs2 = schema2.toAttributes - val rows2 = generateRows(schema2, numRows) - val projection2 = UnsafeProjection.create(attrs2, attrs2) - benchmark.addCase("single nullable long") { _ => - for (_ <- 1 to iters) { - var sum = 0L - var i = 0 - while (i < numRows) { - sum += projection2(rows2(i)).getLong(0) - i += 1 + val schema2 = new StructType().add("l", LongType, true) + val attrs2 = schema2.toAttributes + val rows2 = generateRows(schema2, numRows) + val projection2 = UnsafeProjection.create(attrs2, attrs2) + + benchmark.addCase("single nullable long") { _ => + for (_ <- 1 to iters) { + var sum = 0L + var i = 0 + while (i < numRows) { + sum += projection2(rows2(i)).getLong(0) + i += 1 + } } } - } - - val schema3 = new StructType() - .add("boolean", BooleanType, false) - .add("byte", ByteType, false) - .add("short", ShortType, false) - .add("int", IntegerType, false) - .add("long", LongType, false) - .add("float", FloatType, false) - .add("double", DoubleType, false) - val attrs3 = schema3.toAttributes - val rows3 = generateRows(schema3, numRows) - val projection3 = UnsafeProjection.create(attrs3, attrs3) - - benchmark.addCase("7 primitive types") { _ => - for (_ <- 1 to iters) { - var sum = 0L - var i = 0 - while (i < numRows) { - sum += projection3(rows3(i)).getLong(0) - i += 1 + val schema3 = new StructType() + .add("boolean", BooleanType, false) + .add("byte", ByteType, false) + .add("short", ShortType, false) + .add("int", IntegerType, false) + .add("long", LongType, false) + 
.add("float", FloatType, false) + .add("double", DoubleType, false) + val attrs3 = schema3.toAttributes + val rows3 = generateRows(schema3, numRows) + val projection3 = UnsafeProjection.create(attrs3, attrs3) + + benchmark.addCase("7 primitive types") { _ => + for (_ <- 1 to iters) { + var sum = 0L + var i = 0 + while (i < numRows) { + sum += projection3(rows3(i)).getLong(0) + i += 1 + } } } - } - - - val schema4 = new StructType() - .add("boolean", BooleanType, true) - .add("byte", ByteType, true) - .add("short", ShortType, true) - .add("int", IntegerType, true) - .add("long", LongType, true) - .add("float", FloatType, true) - .add("double", DoubleType, true) - val attrs4 = schema4.toAttributes - val rows4 = generateRows(schema4, numRows) - val projection4 = UnsafeProjection.create(attrs4, attrs4) - benchmark.addCase("7 nullable primitive types") { _ => - for (_ <- 1 to iters) { - var sum = 0L - var i = 0 - while (i < numRows) { - sum += projection4(rows4(i)).getLong(0) - i += 1 + val schema4 = new StructType() + .add("boolean", BooleanType, true) + .add("byte", ByteType, true) + .add("short", ShortType, true) + .add("int", IntegerType, true) + .add("long", LongType, true) + .add("float", FloatType, true) + .add("double", DoubleType, true) + val attrs4 = schema4.toAttributes + val rows4 = generateRows(schema4, numRows) + val projection4 = UnsafeProjection.create(attrs4, attrs4) + + benchmark.addCase("7 nullable primitive types") { _ => + for (_ <- 1 to iters) { + var sum = 0L + var i = 0 + while (i < numRows) { + sum += projection4(rows4(i)).getLong(0) + i += 1 + } } } - } - - /* - Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz - unsafe projection: Avg Time(ms) Avg Rate(M/s) Relative Rate - ------------------------------------------------------------------------------- - single long 1533.34 175.07 1.00 X - single nullable long 2306.73 116.37 0.66 X - primitive types 8403.93 31.94 0.18 X - nullable primitive types 12448.39 21.56 0.12 X - */ - benchmark.run() + benchmark.run() + } } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala index 3b3edac0a314..f9facbb71a4e 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala @@ -32,6 +32,8 @@ import org.apache.spark.sql.catalyst.plans.{Cross, Inner} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning, RangePartitioning, RoundRobinPartitioning} +import org.apache.spark.sql.catalyst.util._ +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ @@ -586,4 +588,20 @@ class AnalysisSuite extends AnalysisTest with Matchers { listRelation.select(MultiAlias(MultiAlias( PosExplode('list), Seq("first_pos", "first_val")), Seq("second_pos", "second_val")))) } + + test("SPARK-24151: CURRENT_DATE, CURRENT_TIMESTAMP should be case insensitive") { + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { + val input = Project(Seq( + UnresolvedAttribute("current_date"), + UnresolvedAttribute("CURRENT_DATE"), + UnresolvedAttribute("CURRENT_TIMESTAMP"), + UnresolvedAttribute("current_timestamp")), testRelation) + val expected = Project(Seq( + Alias(CurrentDate(), toPrettySQL(CurrentDate()))(), + Alias(CurrentDate(), toPrettySQL(CurrentDate()))(), + Alias(CurrentTimestamp(), 
toPrettySQL(CurrentTimestamp()))(), + Alias(CurrentTimestamp(), toPrettySQL(CurrentTimestamp()))()), testRelation).analyze + checkAnalysis(input, expected) + } + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala index 461eda4334bb..0eba1c537d67 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala @@ -257,12 +257,43 @@ class TypeCoercionSuite extends AnalysisTest { shouldNotCast(checkedType, IntegralType) } - test("implicit type cast - MapType(StringType, StringType)") { - val checkedType = MapType(StringType, StringType) - checkTypeCasting(checkedType, castableTypes = Seq(checkedType)) - shouldNotCast(checkedType, DecimalType) - shouldNotCast(checkedType, NumericType) - shouldNotCast(checkedType, IntegralType) + test("implicit type cast between two Map types") { + val sourceType = MapType(IntegerType, IntegerType, true) + val castableTypes = numericTypes ++ Seq(StringType).filter(!Cast.forceNullable(IntegerType, _)) + val targetTypes = numericTypes.filter(!Cast.forceNullable(IntegerType, _)).map { t => + MapType(t, sourceType.valueType, valueContainsNull = true) + } + val nonCastableTargetTypes = allTypes.filterNot(castableTypes.contains(_)).map {t => + MapType(t, sourceType.valueType, valueContainsNull = true) + } + + // Tests that its possible to setup implicit casts between two map types when + // source map's key type is integer and the target map's key type are either Byte, Short, + // Long, Double, Float, Decimal(38, 18) or String. + targetTypes.foreach { targetType => + shouldCast(sourceType, targetType, targetType) + } + + // Tests that its not possible to setup implicit casts between two map types when + // source map's key type is integer and the target map's key type are either Binary, + // Boolean, Date, Timestamp, Array, Struct, CaleandarIntervalType or NullType + nonCastableTargetTypes.foreach { targetType => + shouldNotCast(sourceType, targetType) + } + + // Tests that its not possible to cast from nullable map type to not nullable map type. 
+ val targetNotNullableTypes = allTypes.filterNot(_ == IntegerType).map { t => + MapType(t, sourceType.valueType, valueContainsNull = false) + } + val sourceMapExprWithValueNull = + CreateMap(Seq(Literal.default(sourceType.keyType), + Literal.create(null, sourceType.valueType))) + targetNotNullableTypes.foreach { targetType => + val castDefault = + TypeCoercion.ImplicitTypeCasts.implicitCast(sourceMapExprWithValueNull, targetType) + assert(castDefault.isEmpty, + s"Should not be able to cast $sourceType to $targetType, but got $castDefault") + } } test("implicit type cast - StructType().add(\"a1\", StringType)") { @@ -680,11 +711,11 @@ class TypeCoercionSuite extends AnalysisTest { test("cast NullType for expressions that implement ExpectsInputTypes") { import TypeCoercionSuite._ - ruleTest(new TypeCoercion.ImplicitTypeCasts(conf), + ruleTest(TypeCoercion.ImplicitTypeCasts, AnyTypeUnaryExpression(Literal.create(null, NullType)), AnyTypeUnaryExpression(Literal.create(null, NullType))) - ruleTest(new TypeCoercion.ImplicitTypeCasts(conf), + ruleTest(TypeCoercion.ImplicitTypeCasts, NumericTypeUnaryExpression(Literal.create(null, NullType)), NumericTypeUnaryExpression(Literal.create(null, DoubleType))) } @@ -692,11 +723,11 @@ class TypeCoercionSuite extends AnalysisTest { test("cast NullType for binary operators") { import TypeCoercionSuite._ - ruleTest(new TypeCoercion.ImplicitTypeCasts(conf), + ruleTest(TypeCoercion.ImplicitTypeCasts, AnyTypeBinaryOperator(Literal.create(null, NullType), Literal.create(null, NullType)), AnyTypeBinaryOperator(Literal.create(null, NullType), Literal.create(null, NullType))) - ruleTest(new TypeCoercion.ImplicitTypeCasts(conf), + ruleTest(TypeCoercion.ImplicitTypeCasts, NumericTypeBinaryOperator(Literal.create(null, NullType), Literal.create(null, NullType)), NumericTypeBinaryOperator(Literal.create(null, DoubleType), Literal.create(null, DoubleType))) } @@ -976,7 +1007,7 @@ class TypeCoercionSuite extends AnalysisTest { } test("type coercion for CaseKeyWhen") { - ruleTest(new TypeCoercion.ImplicitTypeCasts(conf), + ruleTest(TypeCoercion.ImplicitTypeCasts, CaseKeyWhen(Literal(1.toShort), Seq(Literal(1), Literal("a"))), CaseKeyWhen(Cast(Literal(1.toShort), IntegerType), Seq(Literal(1), Literal("a"))) ) @@ -1436,7 +1467,7 @@ class TypeCoercionSuite extends AnalysisTest { } test("SPARK-17117 null type coercion in divide") { - val rules = Seq(FunctionArgumentConversion, Division, new ImplicitTypeCasts(conf)) + val rules = Seq(FunctionArgumentConversion, Division, ImplicitTypeCasts) val nullLit = Literal.create(null, NullType) ruleTest(rules, Divide(1L, nullLit), Divide(Cast(1L, DoubleType), Cast(nullLit, DoubleType))) ruleTest(rules, Divide(nullLit, 1L), Divide(Cast(nullLit, DoubleType), Cast(1L, DoubleType))) @@ -1459,7 +1490,7 @@ class TypeCoercionSuite extends AnalysisTest { DoubleType))) Seq(true, false).foreach { convertToTS => withSQLConf( - "spark.sql.typeCoercion.compareDateTimestampInTimestamp" -> convertToTS.toString) { + "spark.sql.legacy.compareDateTimestampInTimestamp" -> convertToTS.toString) { val date0301 = Literal(java.sql.Date.valueOf("2017-03-01")) val timestamp0301000000 = Literal(Timestamp.valueOf("2017-03-01 00:00:00")) val timestamp0301000001 = Literal(Timestamp.valueOf("2017-03-01 00:00:01")) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala index 89fabd477406..19e8c0334689 100644 --- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala @@ -1427,6 +1427,7 @@ abstract class SessionCatalogSuite extends AnalysisTest { Seq(true, false) foreach { caseSensitive => val conf = new SQLConf().copy(SQLConf.CASE_SENSITIVE -> caseSensitive) val catalog = new SessionCatalog(newBasicCatalog(), new SimpleFunctionRegistry, conf) + catalog.setCurrentDatabase("db1") try { val analyzer = new Analyzer(catalog, conf) @@ -1440,6 +1441,8 @@ abstract class SessionCatalogSuite extends AnalysisTest { } assert(cause.getMessage.contains("Undefined function: 'undefined_fn'")) + // SPARK-21318: the error message should contains the current database name + assert(cause.getMessage.contains("db1")) } finally { catalog.reset() } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala index 9a752af523ff..1318ab185983 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala @@ -24,6 +24,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.TypeCheckFailure import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ class ArithmeticExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { @@ -143,16 +144,25 @@ class ArithmeticExpressionSuite extends SparkFunSuite with ExpressionEvalHelper } } - // By fixing SPARK-15776, Divide's inputType is required to be DoubleType of DecimalType. - // TODO: in future release, we should add a IntegerDivide to support integral types. 
- ignore("/ (Divide) for integral type") { - checkEvaluation(Divide(Literal(1.toByte), Literal(2.toByte)), 0.toByte) - checkEvaluation(Divide(Literal(1.toShort), Literal(2.toShort)), 0.toShort) - checkEvaluation(Divide(Literal(1), Literal(2)), 0) - checkEvaluation(Divide(Literal(1.toLong), Literal(2.toLong)), 0.toLong) - checkEvaluation(Divide(positiveShortLit, negativeShortLit), 0.toShort) - checkEvaluation(Divide(positiveIntLit, negativeIntLit), 0) - checkEvaluation(Divide(positiveLongLit, negativeLongLit), 0L) + test("/ (Divide) for integral type") { + withSQLConf(SQLConf.LEGACY_INTEGRALDIVIDE_RETURN_LONG.key -> "false") { + checkEvaluation(IntegralDivide(Literal(1.toByte), Literal(2.toByte)), 0.toByte) + checkEvaluation(IntegralDivide(Literal(1.toShort), Literal(2.toShort)), 0.toShort) + checkEvaluation(IntegralDivide(Literal(1), Literal(2)), 0) + checkEvaluation(IntegralDivide(Literal(1.toLong), Literal(2.toLong)), 0.toLong) + checkEvaluation(IntegralDivide(positiveShortLit, negativeShortLit), 0.toShort) + checkEvaluation(IntegralDivide(positiveIntLit, negativeIntLit), 0) + checkEvaluation(IntegralDivide(positiveLongLit, negativeLongLit), 0L) + } + withSQLConf(SQLConf.LEGACY_INTEGRALDIVIDE_RETURN_LONG.key -> "true") { + checkEvaluation(IntegralDivide(Literal(1.toByte), Literal(2.toByte)), 0L) + checkEvaluation(IntegralDivide(Literal(1.toShort), Literal(2.toShort)), 0L) + checkEvaluation(IntegralDivide(Literal(1), Literal(2)), 0L) + checkEvaluation(IntegralDivide(Literal(1.toLong), Literal(2.toLong)), 0L) + checkEvaluation(IntegralDivide(positiveShortLit, negativeShortLit), 0L) + checkEvaluation(IntegralDivide(positiveIntLit, negativeIntLit), 0L) + checkEvaluation(IntegralDivide(positiveLongLit, negativeLongLit), 0L) + } } test("% (Remainder)") { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala index d9f32c000a88..90c0bf7d8b3d 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala @@ -20,6 +20,8 @@ package org.apache.spark.sql.catalyst.expressions import java.sql.{Date, Timestamp} import java.util.{Calendar, Locale, TimeZone} +import scala.util.Random + import org.apache.spark.SparkFunSuite import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.InternalRow @@ -110,7 +112,7 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper { } test("cast string to timestamp") { - for (tz <- ALL_TIMEZONES) { + for (tz <- Random.shuffle(ALL_TIMEZONES).take(50)) { def checkCastStringToTimestamp(str: String, expected: Timestamp): Unit = { checkEvaluation(cast(Literal(str), TimestampType, Option(tz.getID)), expected) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala index c383eec3d56b..5e8113ac8658 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala @@ -346,6 +346,16 @@ class CodeGenerationSuite extends SparkFunSuite with ExpressionEvalHelper { projection(row) } + test("SPARK-22226: splitExpressions should not generate codes beyond 64KB") { + val colNumber = 10000 + val attrs = (1 to 
colNumber).map(colIndex => AttributeReference(s"_$colIndex", IntegerType)()) + val lit = Literal(1000) + val exprs = attrs.flatMap { a => + Seq(If(lit < a, lit, a), sqrt(a)) + } + UnsafeProjection.create(exprs, attrs) + } + test("SPARK-22543: split large predicates into blocks due to JVM code size limit") { val length = 600 diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGeneratorWithInterpretedFallbackSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGeneratorWithInterpretedFallbackSuite.scala index 28edd85ab6e8..6ea3b05ff9c1 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGeneratorWithInterpretedFallbackSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGeneratorWithInterpretedFallbackSuite.scala @@ -20,13 +20,18 @@ package org.apache.spark.sql.catalyst.expressions import java.util.concurrent.ExecutionException import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.aggregate.NoOp import org.apache.spark.sql.catalyst.expressions.codegen.{CodeAndComment, CodeGenerator} import org.apache.spark.sql.catalyst.plans.PlanTestBase import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.IntegerType +import org.apache.spark.sql.types.{IntegerType, StructType} class CodeGeneratorWithInterpretedFallbackSuite extends SparkFunSuite with PlanTestBase { + val codegenOnly = CodegenObjectFactoryMode.CODEGEN_ONLY.toString + val noCodegen = CodegenObjectFactoryMode.NO_CODEGEN.toString + object FailedCodegenProjection extends CodeGeneratorWithInterpretedFallback[Seq[Expression], UnsafeProjection] { @@ -44,19 +49,30 @@ class CodeGeneratorWithInterpretedFallbackSuite extends SparkFunSuite with PlanT test("UnsafeProjection with codegen factory mode") { val input = Seq(BoundReference(0, IntegerType, nullable = true)) - val codegenOnly = CodegenObjectFactoryMode.CODEGEN_ONLY.toString withSQLConf(SQLConf.CODEGEN_FACTORY_MODE.key -> codegenOnly) { val obj = UnsafeProjection.createObject(input) assert(obj.getClass.getName.contains("GeneratedClass$SpecificUnsafeProjection")) } - val noCodegen = CodegenObjectFactoryMode.NO_CODEGEN.toString withSQLConf(SQLConf.CODEGEN_FACTORY_MODE.key -> noCodegen) { val obj = UnsafeProjection.createObject(input) assert(obj.isInstanceOf[InterpretedUnsafeProjection]) } } + test("MutableProjection with codegen factory mode") { + val input = Seq(BoundReference(0, IntegerType, nullable = true)) + withSQLConf(SQLConf.CODEGEN_FACTORY_MODE.key -> codegenOnly) { + val obj = MutableProjection.createObject(input) + assert(obj.getClass.getName.contains("GeneratedClass$SpecificMutableProjection")) + } + + withSQLConf(SQLConf.CODEGEN_FACTORY_MODE.key -> noCodegen) { + val obj = MutableProjection.createObject(input) + assert(obj.isInstanceOf[InterpretedMutableProjection]) + } + } + test("fallback to the interpreter mode") { val input = Seq(BoundReference(0, IntegerType, nullable = true)) val fallback = CodegenObjectFactoryMode.FALLBACK.toString @@ -69,11 +85,25 @@ class CodeGeneratorWithInterpretedFallbackSuite extends SparkFunSuite with PlanT test("codegen failures in the CODEGEN_ONLY mode") { val errMsg = intercept[ExecutionException] { val input = Seq(BoundReference(0, IntegerType, nullable = true)) - val codegenOnly = CodegenObjectFactoryMode.CODEGEN_ONLY.toString withSQLConf(SQLConf.CODEGEN_FACTORY_MODE.key -> codegenOnly) { 
FailedCodegenProjection.createObject(input) } }.getMessage assert(errMsg.contains("failed to compile: org.codehaus.commons.compiler.CompileException:")) } + + test("SPARK-25358 Correctly handles NoOp in MutableProjection") { + val exprs = Seq(Add(BoundReference(0, IntegerType, nullable = true), Literal.create(1)), NoOp) + val input = InternalRow.fromSeq(1 :: 1 :: Nil) + val expected = 2 :: null :: Nil + withSQLConf(SQLConf.CODEGEN_FACTORY_MODE.key -> codegenOnly) { + val proj = MutableProjection.createObject(exprs) + assert(proj(input).toSeq(StructType.fromDDL("c0 int, c1 int")) === expected) + } + + withSQLConf(SQLConf.CODEGEN_FACTORY_MODE.key -> noCodegen) { + val proj = MutableProjection.createObject(exprs) + assert(proj(input).toSeq(StructType.fromDDL("c0 int, c1 int")) === expected) + } + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala index c7db4ec9e16b..2e0adbb46500 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala @@ -1510,16 +1510,16 @@ class CollectionExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper val seed1 = Some(r.nextLong()) assert(evaluateWithoutCodegen(Shuffle(ai0, seed1)) === evaluateWithoutCodegen(Shuffle(ai0, seed1))) - assert(evaluateWithGeneratedMutableProjection(Shuffle(ai0, seed1)) === - evaluateWithGeneratedMutableProjection(Shuffle(ai0, seed1))) + assert(evaluateWithMutableProjection(Shuffle(ai0, seed1)) === + evaluateWithMutableProjection(Shuffle(ai0, seed1))) assert(evaluateWithUnsafeProjection(Shuffle(ai0, seed1)) === evaluateWithUnsafeProjection(Shuffle(ai0, seed1))) val seed2 = Some(r.nextLong()) assert(evaluateWithoutCodegen(Shuffle(ai0, seed1)) !== evaluateWithoutCodegen(Shuffle(ai0, seed2))) - assert(evaluateWithGeneratedMutableProjection(Shuffle(ai0, seed1)) !== - evaluateWithGeneratedMutableProjection(Shuffle(ai0, seed2))) + assert(evaluateWithMutableProjection(Shuffle(ai0, seed1)) !== + evaluateWithMutableProjection(Shuffle(ai0, seed2))) assert(evaluateWithUnsafeProjection(Shuffle(ai0, seed1)) !== evaluateWithUnsafeProjection(Shuffle(ai0, seed2))) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala index 63b24fb9eb13..c9d733726ff2 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala @@ -273,9 +273,9 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { for (tz <- Seq(TimeZoneGMT, TimeZonePST, TimeZoneJST)) { val timeZoneId = Option(tz.getID) c.setTimeZone(tz) - (0 to 24).foreach { h => - (0 to 60 by 15).foreach { m => - (0 to 60 by 15).foreach { s => + (0 to 24 by 6).foreach { h => + (0 to 60 by 30).foreach { m => + (0 to 60 by 30).foreach { s => c.set(2015, 18, 3, h, m, s) checkEvaluation( Hour(Literal(new Timestamp(c.getTimeInMillis)), timeZoneId), diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala 
index 6684e5ce18d4..b5986aac6555 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala @@ -60,7 +60,7 @@ trait ExpressionEvalHelper extends GeneratorDrivenPropertyChecks with PlanTestBa def expr = prepareEvaluation(expression) val catalystValue = CatalystTypeConverters.convertToCatalyst(expected) checkEvaluationWithoutCodegen(expr, catalystValue, inputRow) - checkEvaluationWithGeneratedMutableProjection(expr, catalystValue, inputRow) + checkEvaluationWithMutableProjection(expr, catalystValue, inputRow) if (GenerateUnsafeProjection.canSupport(expr.dataType)) { checkEvaluationWithUnsafeProjection(expr, catalystValue, inputRow) } @@ -136,7 +136,7 @@ trait ExpressionEvalHelper extends GeneratorDrivenPropertyChecks with PlanTestBa // Make it as method to obtain fresh expression everytime. def expr = prepareEvaluation(expression) checkException(evaluateWithoutCodegen(expr, inputRow), "non-codegen mode") - checkException(evaluateWithGeneratedMutableProjection(expr, inputRow), "codegen mode") + checkException(evaluateWithMutableProjection(expr, inputRow), "codegen mode") if (GenerateUnsafeProjection.canSupport(expr.dataType)) { checkException(evaluateWithUnsafeProjection(expr, inputRow), "unsafe mode") } @@ -183,22 +183,28 @@ trait ExpressionEvalHelper extends GeneratorDrivenPropertyChecks with PlanTestBa } } - protected def checkEvaluationWithGeneratedMutableProjection( - expression: Expression, + protected def checkEvaluationWithMutableProjection( + expression: => Expression, expected: Any, inputRow: InternalRow = EmptyRow): Unit = { - val actual = evaluateWithGeneratedMutableProjection(expression, inputRow) - if (!checkResult(actual, expected, expression.dataType)) { - val input = if (inputRow == EmptyRow) "" else s", input: $inputRow" - fail(s"Incorrect evaluation: $expression, actual: $actual, expected: $expected$input") + val modes = Seq(CodegenObjectFactoryMode.CODEGEN_ONLY, CodegenObjectFactoryMode.NO_CODEGEN) + for (fallbackMode <- modes) { + withSQLConf(SQLConf.CODEGEN_FACTORY_MODE.key -> fallbackMode.toString) { + val actual = evaluateWithMutableProjection(expression, inputRow) + if (!checkResult(actual, expected, expression.dataType)) { + val input = if (inputRow == EmptyRow) "" else s", input: $inputRow" + fail(s"Incorrect evaluation (fallback mode = $fallbackMode): $expression, " + + s"actual: $actual, expected: $expected$input") + } + } } } - protected def evaluateWithGeneratedMutableProjection( - expression: Expression, + protected def evaluateWithMutableProjection( + expression: => Expression, inputRow: InternalRow = EmptyRow): Any = { val plan = generateProject( - GenerateMutableProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil), + MutableProjection.create(Alias(expression, s"Optimized($expression)")() :: Nil), expression) plan.initialize(0) @@ -218,7 +224,7 @@ trait ExpressionEvalHelper extends GeneratorDrivenPropertyChecks with PlanTestBa if (expected == null) { if (!unsafeRow.isNullAt(0)) { val expectedRow = InternalRow(expected, expected) - fail("Incorrect evaluation in unsafe mode: " + + fail(s"Incorrect evaluation in unsafe mode (fallback mode = $fallbackMode): " + s"$expression, actual: $unsafeRow, expected: $expectedRow$input") } } else { @@ -226,7 +232,7 @@ trait ExpressionEvalHelper extends GeneratorDrivenPropertyChecks with PlanTestBa val expectedRow = 
UnsafeProjection.create(Array(expression.dataType, expression.dataType)).apply(lit) if (unsafeRow != expectedRow) { - fail("Incorrect evaluation in unsafe mode: " + + fail(s"Incorrect evaluation in unsafe mode (fallback mode = $fallbackMode): " + s"$expression, actual: $unsafeRow, expected: $expectedRow$input") } } @@ -266,7 +272,7 @@ trait ExpressionEvalHelper extends GeneratorDrivenPropertyChecks with PlanTestBa expected: Spread[Double], inputRow: InternalRow = EmptyRow): Unit = { checkEvaluationWithoutCodegen(expression, expected) - checkEvaluationWithGeneratedMutableProjection(expression, expected) + checkEvaluationWithMutableProjection(expression, expected) checkEvaluationWithOptimization(expression, expected) var plan = generateProject( diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala index 0e9c8abec33e..81ab7d690396 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala @@ -244,6 +244,13 @@ class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper with "1234") } + test("some big value") { + val value = "x" * 3000 + checkEvaluation( + GetJsonObject(NonFoldableLiteral((s"""{"big": "$value"}""")), + NonFoldableLiteral("$.big")), value) + } + val jsonTupleQuery = Literal("f1") :: Literal("f2") :: Literal("f3") :: @@ -707,9 +714,17 @@ class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper with } test("SPARK-24709: infer schema of json strings") { - checkEvaluation(SchemaOfJson(Literal.create("""{"col":0}""")), "struct") + checkEvaluation(new SchemaOfJson(Literal.create("""{"col":0}""")), + "struct") checkEvaluation( - SchemaOfJson(Literal.create("""{"col0":["a"], "col1": {"col2": "b"}}""")), + new SchemaOfJson(Literal.create("""{"col0":["a"], "col1": {"col2": "b"}}""")), "struct,col1:struct>") } + + test("infer schema of JSON strings by using options") { + checkEvaluation( + new SchemaOfJson(Literal.create("""{"col":01}"""), + CreateMap(Seq(Literal.create("allowNumericLeadingZeros"), Literal.create("true")))), + "struct") + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala index 86f80fe66d28..3ea6bfac9ddc 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala @@ -226,4 +226,25 @@ class LiteralExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(Literal('\u0000'), "\u0000") checkEvaluation(Literal.create('\n'), "\n") } + + test("fromString converts String/DataType input correctly") { + checkEvaluation(Literal.fromString(false.toString, BooleanType), false) + checkEvaluation(Literal.fromString(null, NullType), null) + checkEvaluation(Literal.fromString(Int.MaxValue.toByte.toString, ByteType), Int.MaxValue.toByte) + checkEvaluation(Literal.fromString(Short.MaxValue.toShort.toString, ShortType), Short.MaxValue + .toShort) + checkEvaluation(Literal.fromString(Int.MaxValue.toString, IntegerType), Int.MaxValue) + checkEvaluation(Literal.fromString(Long.MaxValue.toString, LongType), 
Long.MaxValue) + checkEvaluation(Literal.fromString(Float.MaxValue.toString, FloatType), Float.MaxValue) + checkEvaluation(Literal.fromString(Double.MaxValue.toString, DoubleType), Double.MaxValue) + checkEvaluation(Literal.fromString("1.23456", DecimalType(10, 5)), Decimal(1.23456)) + checkEvaluation(Literal.fromString("Databricks", StringType), "Databricks") + val dateString = "1970-01-01" + checkEvaluation(Literal.fromString(dateString, DateType), java.sql.Date.valueOf(dateString)) + val timestampString = "0000-01-01 00:00:00" + checkEvaluation(Literal.fromString(timestampString, TimestampType), + java.sql.Timestamp.valueOf(timestampString)) + val calInterval = new CalendarInterval(1, 1) + checkEvaluation(Literal.fromString(calInterval.toString, CalendarIntervalType), calInterval) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscExpressionsSuite.scala index b6c269348b00..4b2d153a28cc 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscExpressionsSuite.scala @@ -48,15 +48,15 @@ class MiscExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { val r = new Random() val seed1 = Some(r.nextLong()) assert(evaluateWithoutCodegen(Uuid(seed1)) === evaluateWithoutCodegen(Uuid(seed1))) - assert(evaluateWithGeneratedMutableProjection(Uuid(seed1)) === - evaluateWithGeneratedMutableProjection(Uuid(seed1))) + assert(evaluateWithMutableProjection(Uuid(seed1)) === + evaluateWithMutableProjection(Uuid(seed1))) assert(evaluateWithUnsafeProjection(Uuid(seed1)) === evaluateWithUnsafeProjection(Uuid(seed1))) val seed2 = Some(r.nextLong()) assert(evaluateWithoutCodegen(Uuid(seed1)) !== evaluateWithoutCodegen(Uuid(seed2))) - assert(evaluateWithGeneratedMutableProjection(Uuid(seed1)) !== - evaluateWithGeneratedMutableProjection(Uuid(seed2))) + assert(evaluateWithMutableProjection(Uuid(seed1)) !== + evaluateWithMutableProjection(Uuid(seed2))) assert(evaluateWithUnsafeProjection(Uuid(seed1)) !== evaluateWithUnsafeProjection(Uuid(seed2))) @@ -79,7 +79,7 @@ class MiscExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { val outputEval = errorStream.toString errorStream.reset() // check with codegen - checkEvaluationWithGeneratedMutableProjection(PrintToStderr(inputExpr), 1) + checkEvaluationWithMutableProjection(PrintToStderr(inputExpr), 1) val outputCodegen = errorStream.toString (outputEval, outputCodegen) } finally { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ObjectExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ObjectExpressionsSuite.scala index b0af9e07d1d1..d145fd0aaba4 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ObjectExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ObjectExpressionsSuite.scala @@ -72,7 +72,7 @@ class ObjectExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { val cls = classOf[Tuple2[Boolean, java.lang.Integer]] val inputObject = BoundReference(0, ObjectType(cls), nullable = true) val invoke = Invoke(inputObject, "_2", IntegerType) - checkEvaluationWithGeneratedMutableProjection(invoke, null, inputRow) + checkEvaluationWithMutableProjection(invoke, null, inputRow) } test("MapObjects should make copies of 
unsafe-backed data") { @@ -233,13 +233,13 @@ class ObjectExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { Literal.fromObject(new TestBean), Map("setNonPrimitive" -> Literal(null))) evaluateWithoutCodegen(initializeBean, InternalRow.fromSeq(Seq())) - evaluateWithGeneratedMutableProjection(initializeBean, InternalRow.fromSeq(Seq())) + evaluateWithMutableProjection(initializeBean, InternalRow.fromSeq(Seq())) val initializeBean2 = InitializeJavaBean( Literal.fromObject(new TestBean), Map("setNonPrimitive" -> Literal("string"))) evaluateWithoutCodegen(initializeBean2, InternalRow.fromSeq(Seq())) - evaluateWithGeneratedMutableProjection(initializeBean2, InternalRow.fromSeq(Seq())) + evaluateWithMutableProjection(initializeBean2, InternalRow.fromSeq(Seq())) } test("SPARK-23585: UnwrapOption should support interpreted execution") { @@ -273,7 +273,7 @@ class ObjectExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { val resolver = ResolveTimeZone(new SQLConf) val expr = resolver.resolveTimeZones(serializer.deserialize(serializer.serialize(expression))) checkEvaluationWithoutCodegen(expr, expected, inputRow) - checkEvaluationWithGeneratedMutableProjection(expr, expected, inputRow) + checkEvaluationWithMutableProjection(expr, expected, inputRow) if (GenerateUnsafeProjection.canSupport(expr.dataType)) { checkEvaluationWithUnsafeProjection( expr, diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala index d532dc4f7719..06fb73ad8392 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala @@ -225,11 +225,18 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { val row3 = create_row("aa2bb3cc", null) checkEvaluation( - StringSplit(Literal("aa2bb3cc"), Literal("[1-9]+")), Seq("aa", "bb", "cc"), row1) + StringSplit(Literal("aa2bb3cc"), Literal("[1-9]+"), -1), Seq("aa", "bb", "cc"), row1) checkEvaluation( - StringSplit(s1, s2), Seq("aa", "bb", "cc"), row1) - checkEvaluation(StringSplit(s1, s2), null, row2) - checkEvaluation(StringSplit(s1, s2), null, row3) + StringSplit(Literal("aa2bb3cc"), Literal("[1-9]+"), 2), Seq("aa", "bb3cc"), row1) + // limit = 0 should behave just like limit = -1 + checkEvaluation( + StringSplit(Literal("aacbbcddc"), Literal("c"), 0), Seq("aa", "bb", "dd", ""), row1) + checkEvaluation( + StringSplit(Literal("aacbbcddc"), Literal("c"), -1), Seq("aa", "bb", "dd", ""), row1) + checkEvaluation( + StringSplit(s1, s2, -1), Seq("aa", "bb", "cc"), row1) + checkEvaluation(StringSplit(s1, s2, -1), null, row2) + checkEvaluation(StringSplit(s1, s2, -1), null, row3) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/UnsafeRowWriterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/UnsafeRowWriterSuite.scala new file mode 100644 index 000000000000..fb651b76fc16 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/UnsafeRowWriterSuite.scala @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.expressions.codegen + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.types.Decimal + +class UnsafeRowWriterSuite extends SparkFunSuite { + + def checkDecimalSizeInBytes(decimal: Decimal, numBytes: Int): Unit = { + assert(decimal.toJavaBigDecimal.unscaledValue().toByteArray.length == numBytes) + } + + test("SPARK-25538: zero-out all bits for decimals") { + val decimal1 = Decimal(0.431) + decimal1.changePrecision(38, 18) + checkDecimalSizeInBytes(decimal1, 8) + + val decimal2 = Decimal(123456789.1232456789) + decimal2.changePrecision(38, 18) + checkDecimalSizeInBytes(decimal2, 11) + // On an UnsafeRowWriter we write decimal2 first and then decimal1 + val unsafeRowWriter1 = new UnsafeRowWriter(1) + unsafeRowWriter1.resetRowWriter() + unsafeRowWriter1.write(0, decimal2, decimal2.precision, decimal2.scale) + unsafeRowWriter1.reset() + unsafeRowWriter1.write(0, decimal1, decimal1.precision, decimal1.scale) + val res1 = unsafeRowWriter1.getRow + // On a second UnsafeRowWriter we write decimal1 directly + val unsafeRowWriter2 = new UnsafeRowWriter(1) + unsafeRowWriter2.resetRowWriter() + unsafeRowWriter2.write(0, decimal1, decimal1.precision, decimal1.scale) + val res2 = unsafeRowWriter2.getRow + // The two rows should be equal + assert(res1 == res2) + } + +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala index 653c07f1835c..6cd1108eef33 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala @@ -27,6 +27,7 @@ import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.BooleanType class BooleanSimplificationSuite extends PlanTest with PredicateHelper { @@ -37,6 +38,7 @@ class BooleanSimplificationSuite extends PlanTest with PredicateHelper { Batch("Constant Folding", FixedPoint(50), NullPropagation, ConstantFolding, + SimplifyConditionals, BooleanSimplification, PruneFilters) :: Nil } @@ -48,6 +50,14 @@ class BooleanSimplificationSuite extends PlanTest with PredicateHelper { testRelation.output, Seq(Row(1, 2, 3, "abc")) ) + val testNotNullableRelation = LocalRelation('a.int.notNull, 'b.int.notNull, 'c.int.notNull, + 'd.string.notNull, 'e.boolean.notNull, 'f.boolean.notNull, 'g.boolean.notNull, + 'h.boolean.notNull) + + val testNotNullableRelationWithData = LocalRelation.fromExternalRows( + testNotNullableRelation.output, Seq(Row(1, 2, 3, "abc")) + ) + private def checkCondition(input: Expression, expected: LogicalPlan): Unit = { val plan =
testRelationWithData.where(input).analyze val actual = Optimize.execute(plan) @@ -61,6 +71,13 @@ class BooleanSimplificationSuite extends PlanTest with PredicateHelper { comparePlans(actual, correctAnswer) } + private def checkConditionInNotNullableRelation( + input: Expression, expected: LogicalPlan): Unit = { + val plan = testNotNullableRelationWithData.where(input).analyze + val actual = Optimize.execute(plan) + comparePlans(actual, expected) + } + test("a && a => a") { checkCondition(Literal(1) < 'a && Literal(1) < 'a, Literal(1) < 'a) checkCondition(Literal(1) < 'a && Literal(1) < 'a && Literal(1) < 'a, Literal(1) < 'a) @@ -174,10 +191,30 @@ class BooleanSimplificationSuite extends PlanTest with PredicateHelper { } test("Complementation Laws") { - checkCondition('a && !'a, testRelation) - checkCondition(!'a && 'a, testRelation) + checkConditionInNotNullableRelation('e && !'e, testNotNullableRelation) + checkConditionInNotNullableRelation(!'e && 'e, testNotNullableRelation) + + checkConditionInNotNullableRelation('e || !'e, testNotNullableRelationWithData) + checkConditionInNotNullableRelation(!'e || 'e, testNotNullableRelationWithData) + } + + test("Complementation Laws - null handling") { + checkCondition('e && !'e, + testRelationWithData.where(If('e.isNull, Literal.create(null, BooleanType), false)).analyze) + checkCondition(!'e && 'e, + testRelationWithData.where(If('e.isNull, Literal.create(null, BooleanType), false)).analyze) + + checkCondition('e || !'e, + testRelationWithData.where(If('e.isNull, Literal.create(null, BooleanType), true)).analyze) + checkCondition(!'e || 'e, + testRelationWithData.where(If('e.isNull, Literal.create(null, BooleanType), true)).analyze) + } + + test("Complementation Laws - negative case") { + checkCondition('e && !'f, testRelationWithData.where('e && !'f).analyze) + checkCondition(!'f && 'e, testRelationWithData.where(!'f && 'e).analyze) - checkCondition('a || !'a, testRelationWithData) - checkCondition(!'a || 'a, testRelationWithData) + checkCondition('e || !'f, testRelationWithData.where('e || !'f).analyze) + checkCondition(!'f || 'e, testRelationWithData.where(!'f || 'e).analyze) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromConstraintsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromConstraintsSuite.scala index e4671f0d1cce..a40ba2dc38b7 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromConstraintsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromConstraintsSuite.scala @@ -196,7 +196,7 @@ class InferFiltersFromConstraintsSuite extends PlanTest { test("constraints should be inferred from aliased literals") { val originalLeft = testRelation.subquery('left).as("left") - val optimizedLeft = testRelation.subquery('left).where(IsNotNull('a) && 'a === 2).as("left") + val optimizedLeft = testRelation.subquery('left).where(IsNotNull('a) && 'a <=> 2).as("left") val right = Project(Seq(Literal(2).as("two")), testRelation.subquery('right)).as("right") val condition = Some("left.a".attr === "right.two".attr) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizerLoggingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizerLoggingSuite.scala new file mode 100644 index 000000000000..915f408089fe --- /dev/null +++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizerLoggingSuite.scala @@ -0,0 +1,148 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.optimizer + +import scala.collection.mutable.ArrayBuffer + +import org.apache.log4j.{Appender, AppenderSkeleton, Level, Logger} +import org.apache.log4j.spi.LoggingEvent + +import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.catalyst.dsl.plans._ +import org.apache.spark.sql.catalyst.plans.PlanTest +import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} +import org.apache.spark.sql.catalyst.rules.RuleExecutor +import org.apache.spark.sql.internal.SQLConf + +class OptimizerLoggingSuite extends PlanTest { + + object Optimize extends RuleExecutor[LogicalPlan] { + val batches = Batch("Optimizer Batch", FixedPoint(100), + PushDownPredicate, + ColumnPruning, + CollapseProject) :: Nil + } + + class MockAppender extends AppenderSkeleton { + val loggingEvents = new ArrayBuffer[LoggingEvent]() + + override def append(loggingEvent: LoggingEvent): Unit = { + if (loggingEvent.getRenderedMessage().contains("Applying Rule")) { + loggingEvents.append(loggingEvent) + } + } + + override def close(): Unit = {} + override def requiresLayout(): Boolean = false + } + + private def withLogLevelAndAppender(level: Level, appender: Appender)(f: => Unit): Unit = { + val logger = Logger.getLogger(Optimize.getClass.getName.dropRight(1)) + val restoreLevel = logger.getLevel + logger.setLevel(level) + logger.addAppender(appender) + try f finally { + logger.setLevel(restoreLevel) + logger.removeAppender(appender) + } + } + + private def verifyLog(expectedLevel: Level, expectedRules: Seq[String]): Unit = { + val logAppender = new MockAppender() + withLogLevelAndAppender(Level.TRACE, logAppender) { + val input = LocalRelation('a.int, 'b.string, 'c.double) + val query = input.select('a, 'b).select('a).where('a > 1).analyze + val expected = input.where('a > 1).select('a).analyze + comparePlans(Optimize.execute(query), expected) + } + val logMessages = logAppender.loggingEvents.map(_.getRenderedMessage) + assert(expectedRules.forall(rule => logMessages.exists(_.contains(rule)))) + assert(logAppender.loggingEvents.forall(_.getLevel == expectedLevel)) + } + + test("test log level") { + val levels = Seq( + "TRACE" -> Level.TRACE, + "trace" -> Level.TRACE, + "DEBUG" -> Level.DEBUG, + "debug" -> Level.DEBUG, + "INFO" -> Level.INFO, + "info" -> Level.INFO, + "WARN" -> Level.WARN, + "warn" -> Level.WARN, + "ERROR" -> Level.ERROR, + "error" -> Level.ERROR, + "deBUG" -> Level.DEBUG) + + levels.foreach { level => + withSQLConf(SQLConf.OPTIMIZER_PLAN_CHANGE_LOG_LEVEL.key -> level._1) { + verifyLog( + level._2, + Seq( + 
PushDownPredicate.ruleName, + ColumnPruning.ruleName, + CollapseProject.ruleName)) + } + } + } + + test("test invalid log level conf") { + val levels = Seq( + "", + "*d_", + "infoo") + + levels.foreach { level => + val error = intercept[IllegalArgumentException] { + withSQLConf(SQLConf.OPTIMIZER_PLAN_CHANGE_LOG_LEVEL.key -> level) {} + } + assert(error.getMessage.contains( + "Invalid value for 'spark.sql.optimizer.planChangeLog.level'.")) + } + } + + test("test log rules") { + val rulesSeq = Seq( + Seq(PushDownPredicate.ruleName, + ColumnPruning.ruleName, + CollapseProject.ruleName).reduce(_ + "," + _) -> + Seq(PushDownPredicate.ruleName, + ColumnPruning.ruleName, + CollapseProject.ruleName), + Seq(PushDownPredicate.ruleName, + ColumnPruning.ruleName).reduce(_ + "," + _) -> + Seq(PushDownPredicate.ruleName, + ColumnPruning.ruleName), + CollapseProject.ruleName -> + Seq(CollapseProject.ruleName), + Seq(ColumnPruning.ruleName, + "DummyRule").reduce(_ + "," + _) -> + Seq(ColumnPruning.ruleName), + "DummyRule" -> Seq(), + "" -> Seq() + ) + + rulesSeq.foreach { case (rulesConf, expectedRules) => + withSQLConf( + SQLConf.OPTIMIZER_PLAN_CHANGE_LOG_RULES.key -> rulesConf, + SQLConf.OPTIMIZER_PLAN_CHANGE_LOG_LEVEL.key -> "INFO") { + verifyLog(Level.INFO, expectedRules) + } + } + } +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/PropagateEmptyRelationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/PropagateEmptyRelationSuite.scala index f1ce7543ffdc..d395bba105a7 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/PropagateEmptyRelationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/PropagateEmptyRelationSuite.scala @@ -147,7 +147,7 @@ class PropagateEmptyRelationSuite extends PlanTest { .where(false) .select('a) .where('a > 1) - .where('a != 200) + .where('a =!= 200) .orderBy('a.asc) val optimized = Optimize.execute(query.analyze) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/PushProjectThroughUnionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/PushProjectThroughUnionSuite.scala new file mode 100644 index 000000000000..294d29842b04 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/PushProjectThroughUnionSuite.scala @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.optimizer + +import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.catalyst.dsl.plans._ +import org.apache.spark.sql.catalyst.plans.PlanTest +import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} +import org.apache.spark.sql.catalyst.rules.RuleExecutor + +class PushProjectThroughUnionSuite extends PlanTest { + + object Optimize extends RuleExecutor[LogicalPlan] { + val batches = Batch("Optimizer Batch", FixedPoint(100), + PushProjectionThroughUnion, + FoldablePropagation) :: Nil + } + + test("SPARK-25450 PushProjectThroughUnion rule uses the same exprId for project expressions " + + "in each Union child, causing mistakes in constant propagation") { + val testRelation1 = LocalRelation('a.string, 'b.int, 'c.string) + val testRelation2 = LocalRelation('d.string, 'e.int, 'f.string) + val query = testRelation1 + .union(testRelation2.select("bar".as("d"), 'e, 'f)) + .select('a.as("n")) + .select('n, "dummy").analyze + val optimized = Optimize.execute(query) + + val expected = testRelation1 + .select('a.as("n")) + .select('n, "dummy") + .union(testRelation2 + .select("bar".as("d"), 'e, 'f) + .select("bar".as("n")) + .select("bar".as("n"), "dummy")).analyze + + comparePlans(optimized, expected) + } +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/TransposeWindowSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/TransposeWindowSuite.scala new file mode 100644 index 000000000000..58b3d1c98f3c --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/TransposeWindowSuite.scala @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.optimizer + +import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.catalyst.dsl.plans._ +import org.apache.spark.sql.catalyst.expressions.Rand +import org.apache.spark.sql.catalyst.plans.PlanTest +import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} +import org.apache.spark.sql.catalyst.rules.RuleExecutor + +class TransposeWindowSuite extends PlanTest { + object Optimize extends RuleExecutor[LogicalPlan] { + val batches = + Batch("CollapseProject", FixedPoint(100), CollapseProject, RemoveRedundantProject) :: + Batch("FlipWindow", Once, CollapseWindow, TransposeWindow) :: Nil + } + + val testRelation = LocalRelation('a.string, 'b.string, 'c.int, 'd.string) + + val a = testRelation.output(0) + val b = testRelation.output(1) + val c = testRelation.output(2) + val d = testRelation.output(3) + + val partitionSpec1 = Seq(a) + val partitionSpec2 = Seq(a, b) + val partitionSpec3 = Seq(d) + val partitionSpec4 = Seq(b, a, d) + + val orderSpec1 = Seq(d.asc) + val orderSpec2 = Seq(d.desc) + + test("transpose two adjacent windows with compatible partitions") { + val query = testRelation + .window(Seq(sum(c).as('sum_a_2)), partitionSpec2, orderSpec2) + .window(Seq(sum(c).as('sum_a_1)), partitionSpec1, orderSpec1) + + val analyzed = query.analyze + val optimized = Optimize.execute(analyzed) + + val correctAnswer = testRelation + .window(Seq(sum(c).as('sum_a_1)), partitionSpec1, orderSpec1) + .window(Seq(sum(c).as('sum_a_2)), partitionSpec2, orderSpec2) + .select('a, 'b, 'c, 'd, 'sum_a_2, 'sum_a_1) + + comparePlans(optimized, correctAnswer.analyze) + } + + test("transpose two adjacent windows with differently ordered compatible partitions") { + val query = testRelation + .window(Seq(sum(c).as('sum_a_2)), partitionSpec4, Seq.empty) + .window(Seq(sum(c).as('sum_a_1)), partitionSpec2, Seq.empty) + + val analyzed = query.analyze + val optimized = Optimize.execute(analyzed) + + val correctAnswer = testRelation + .window(Seq(sum(c).as('sum_a_1)), partitionSpec2, Seq.empty) + .window(Seq(sum(c).as('sum_a_2)), partitionSpec4, Seq.empty) + .select('a, 'b, 'c, 'd, 'sum_a_2, 'sum_a_1) + + comparePlans(optimized, correctAnswer.analyze) + } + + test("don't transpose two adjacent windows with incompatible partitions") { + val query = testRelation + .window(Seq(sum(c).as('sum_a_2)), partitionSpec3, Seq.empty) + .window(Seq(sum(c).as('sum_a_1)), partitionSpec1, Seq.empty) + + val analyzed = query.analyze + val optimized = Optimize.execute(analyzed) + + comparePlans(optimized, analyzed) + } + + test("don't transpose two adjacent windows with intersection of partition and output set") { + val query = testRelation + .window(Seq(('a + 'b).as('e), sum(c).as('sum_a_2)), partitionSpec3, Seq.empty) + .window(Seq(sum(c).as('sum_a_1)), Seq(a, 'e), Seq.empty) + + val analyzed = query.analyze + val optimized = Optimize.execute(analyzed) + + comparePlans(optimized, analyzed) + } + + test("don't transpose two adjacent windows with non-deterministic expressions") { + val query = testRelation + .window(Seq(Rand(0).as('e), sum(c).as('sum_a_2)), partitionSpec3, Seq.empty) + .window(Seq(sum(c).as('sum_a_1)), partitionSpec1, Seq.empty) + + val analyzed = query.analyze + val optimized = Optimize.execute(analyzed) + + comparePlans(optimized, analyzed) + } + +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala index 781fc1e957ae..b4df22c5b29f 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala @@ -203,7 +203,7 @@ class ExpressionParserSuite extends PlanTest { // Simple operations assertEqual("a * b", 'a * 'b) assertEqual("a / b", 'a / 'b) - assertEqual("a DIV b", ('a / 'b).cast(LongType)) + assertEqual("a DIV b", 'a div 'b) assertEqual("a % b", 'a % 'b) assertEqual("a + b", 'a + 'b) assertEqual("a - b", 'a - 'b) @@ -214,7 +214,7 @@ class ExpressionParserSuite extends PlanTest { // Check precedences assertEqual( "a * t | b ^ c & d - e + f % g DIV h / i * k", - 'a * 't | ('b ^ ('c & ('d - 'e + (('f % 'g / 'h).cast(LongType) / 'i * 'k))))) + 'a * 't | ('b ^ ('c & ('d - 'e + (('f % 'g div 'h) / 'i * 'k))))) } test("unary arithmetic expressions") { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala index 67740c316647..3081ff935f04 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala @@ -22,7 +22,6 @@ import org.scalatest.Suite import org.scalatest.Tag import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.analysis.SimpleAnalyzer import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.CodegenObjectFactoryMode @@ -57,7 +56,7 @@ trait CodegenInterpretedPlanTest extends PlanTest { * Provides helper methods for comparing plans, but without the overhead of * mandating a FunSuite. */ -trait PlanTestBase extends PredicateHelper { self: Suite => +trait PlanTestBase extends PredicateHelper with SQLHelper { self: Suite => // TODO(gatorsmile): remove this from PlanTest and all the analyzer rules protected def conf = SQLConf.get @@ -174,32 +173,4 @@ trait PlanTestBase extends PredicateHelper { self: Suite => plan1 == plan2 } } - - /** - * Sets all SQL configurations specified in `pairs`, calls `f`, and then restores all SQL - * configurations. - */ - protected def withSQLConf(pairs: (String, String)*)(f: => Unit): Unit = { - val conf = SQLConf.get - val (keys, values) = pairs.unzip - val currentValues = keys.map { key => - if (conf.contains(key)) { - Some(conf.getConfString(key)) - } else { - None - } - } - (keys, values).zipped.foreach { (k, v) => - if (SQLConf.staticConfKeys.contains(k)) { - throw new AnalysisException(s"Cannot modify the value of a static config: $k") - } - conf.setConfString(k, v) - } - try f finally { - keys.zip(currentValues).foreach { - case (key, Some(value)) => conf.setConfString(key, value) - case (key, None) => conf.unsetConf(key) - } - } - } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/SQLHelper.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/SQLHelper.scala new file mode 100644 index 000000000000..4d869d79ad59 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/SQLHelper.scala @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.catalyst.plans + +import java.io.File + +import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.util.Utils + +trait SQLHelper { + + /** + * Sets all SQL configurations specified in `pairs`, calls `f`, and then restores all SQL + * configurations. + */ + protected def withSQLConf(pairs: (String, String)*)(f: => Unit): Unit = { + val conf = SQLConf.get + val (keys, values) = pairs.unzip + val currentValues = keys.map { key => + if (conf.contains(key)) { + Some(conf.getConfString(key)) + } else { + None + } + } + (keys, values).zipped.foreach { (k, v) => + if (SQLConf.staticConfKeys.contains(k)) { + throw new AnalysisException(s"Cannot modify the value of a static config: $k") + } + conf.setConfString(k, v) + } + try f finally { + keys.zip(currentValues).foreach { + case (key, Some(value)) => conf.setConfString(key, value) + case (key, None) => conf.unsetConf(key) + } + } + } + + /** + * Generates a temporary path without creating the actual file/directory, then passes it to `f`. If + * a file/directory is created there by `f`, it will be deleted after `f` returns. + */ + protected def withTempPath(f: File => Unit): Unit = { + val path = Utils.createTempDir() + path.delete() + try f(path) finally Utils.deleteRecursively(path) + } +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/CaseInsensitiveMapSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/CaseInsensitiveMapSuite.scala new file mode 100644 index 000000000000..a8bb1d0afdb8 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/CaseInsensitiveMapSuite.scala @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package org.apache.spark.sql.catalyst.util + +import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.serializer.JavaSerializer + +class CaseInsensitiveMapSuite extends SparkFunSuite { + private def shouldBeSerializable(m: Map[String, String]): Unit = { + new JavaSerializer(new SparkConf()).newInstance().serialize(m) + } + + test("Keys are case insensitive") { + val m = CaseInsensitiveMap(Map("a" -> "b", "foO" -> "bar")) + assert(m("FOO") == "bar") + assert(m("fOo") == "bar") + assert(m("A") == "b") + shouldBeSerializable(m) + } + + test("CaseInsensitiveMap should be serializable after '-' operator") { + val m = CaseInsensitiveMap(Map("a" -> "b", "foo" -> "bar")) - "a" + assert(m == Map("foo" -> "bar")) + shouldBeSerializable(m) + } + + test("CaseInsensitiveMap should be serializable after '+' operator") { + val m = CaseInsensitiveMap(Map("a" -> "b", "foo" -> "bar")) + ("x" -> "y") + assert(m == Map("a" -> "b", "foo" -> "bar", "x" -> "y")) + shouldBeSerializable(m) + } +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/UnsafeArraySuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/UnsafeArraySuite.scala index 8f75c14192c9..755c8897cada 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/UnsafeArraySuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/UnsafeArraySuite.scala @@ -114,7 +114,7 @@ class UnsafeArraySuite extends SparkFunSuite { assert(unsafeDate.isInstanceOf[UnsafeArrayData]) assert(unsafeDate.numElements == dateArray.length) dateArray.zipWithIndex.map { case (e, i) => - assert(unsafeDate.get(i, DateType) == e) + assert(unsafeDate.get(i, DateType).asInstanceOf[Int] == e) } val unsafeTimestamp = ExpressionEncoder[Array[Long]].resolveAndBind(). 
@@ -122,7 +122,7 @@ class UnsafeArraySuite extends SparkFunSuite { assert(unsafeTimestamp.isInstanceOf[UnsafeArrayData]) assert(unsafeTimestamp.numElements == timestampArray.length) timestampArray.zipWithIndex.map { case (e, i) => - assert(unsafeTimestamp.get(i, TimestampType) == e) + assert(unsafeTimestamp.get(i, TimestampType).asInstanceOf[Long] == e) } Seq(decimalArray4_1, decimalArray20_20).map { decimalArray => diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/util/SchemaUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/util/SchemaUtilsSuite.scala index a25be2fe61db..2f576a4031e9 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/util/SchemaUtilsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/util/SchemaUtilsSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.util +import java.util.Locale + import org.apache.spark.SparkFunSuite import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.analysis._ @@ -39,7 +41,7 @@ class SchemaUtilsSuite extends SparkFunSuite { test(s"Check column name duplication in $testType cases") { def checkExceptionCases(schemaStr: String, duplicatedColumns: Seq[String]): Unit = { val expectedErrorMsg = "Found duplicate column(s) in SchemaUtilsSuite: " + - duplicatedColumns.map(c => s"`${c.toLowerCase}`").mkString(", ") + duplicatedColumns.map(c => s"`${c.toLowerCase(Locale.ROOT)}`").mkString(", ") val schema = StructType.fromDDL(schemaStr) var msg = intercept[AnalysisException] { SchemaUtils.checkSchemaColumnNameDuplication( diff --git a/sql/core/benchmarks/AggregateBenchmark-results.txt b/sql/core/benchmarks/AggregateBenchmark-results.txt new file mode 100644 index 000000000000..19e524777692 --- /dev/null +++ b/sql/core/benchmarks/AggregateBenchmark-results.txt @@ -0,0 +1,143 @@ +================================================================================================ +aggregate without grouping +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +agg w/o group: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +agg w/o group wholestage off 65374 / 70665 32.1 31.2 1.0X +agg w/o group wholestage on 1178 / 1209 1779.8 0.6 55.5X + + +================================================================================================ +stat functions +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +stddev: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +stddev wholestage off 8667 / 8851 12.1 82.7 1.0X +stddev wholestage on 1266 / 1273 82.8 12.1 6.8X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +kurtosis: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +kurtosis wholestage off 41218 / 41231 2.5 393.1 1.0X +kurtosis wholestage on 1347 / 1357 77.8 12.8 30.6X + + +================================================================================================ +aggregate with linear 
keys +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Aggregate w keys: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +codegen = F 9309 / 9389 9.0 111.0 1.0X +codegen = T hashmap = F 4417 / 4435 19.0 52.7 2.1X +codegen = T hashmap = T 1289 / 1298 65.1 15.4 7.2X + + +================================================================================================ +aggregate with randomized keys +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Aggregate w keys: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +codegen = F 11424 / 11426 7.3 136.2 1.0X +codegen = T hashmap = F 6441 / 6496 13.0 76.8 1.8X +codegen = T hashmap = T 2333 / 2344 36.0 27.8 4.9X + + +================================================================================================ +aggregate with string key +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Aggregate w string key: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +codegen = F 4751 / 4890 4.4 226.5 1.0X +codegen = T hashmap = F 3146 / 3182 6.7 150.0 1.5X +codegen = T hashmap = T 2211 / 2261 9.5 105.4 2.1X + + +================================================================================================ +aggregate with decimal key +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Aggregate w decimal key: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +codegen = F 3029 / 3062 6.9 144.4 1.0X +codegen = T hashmap = F 1534 / 1569 13.7 73.2 2.0X +codegen = T hashmap = T 575 / 578 36.5 27.4 5.3X + + +================================================================================================ +aggregate with multiple key types +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Aggregate w multiple keys: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +codegen = F 7506 / 7521 2.8 357.9 1.0X +codegen = T hashmap = F 4791 / 4808 4.4 228.5 1.6X +codegen = T hashmap = T 3553 / 3585 5.9 169.4 2.1X + + +================================================================================================ +max function bytecode size of wholestagecodegen +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 
v2 @ 2.50GHz +max function bytecode size: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +codegen = F 608 / 656 1.1 927.1 1.0X +codegen = T hugeMethodLimit = 10000 402 / 419 1.6 613.5 1.5X +codegen = T hugeMethodLimit = 1500 616 / 619 1.1 939.9 1.0X + + +================================================================================================ +cube +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +cube: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +cube wholestage off 3229 / 3237 1.6 615.9 1.0X +cube wholestage on 1285 / 1306 4.1 245.2 2.5X + + +================================================================================================ +hash and BytesToBytesMap +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +BytesToBytesMap: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +UnsafeRowhash 328 / 330 64.0 15.6 1.0X +murmur3 hash 167 / 167 125.4 8.0 2.0X +fast hash 84 / 85 249.0 4.0 3.9X +arrayEqual 192 / 192 109.3 9.1 1.7X +Java HashMap (Long) 144 / 147 145.9 6.9 2.3X +Java HashMap (two ints) 147 / 153 142.3 7.0 2.2X +Java HashMap (UnsafeRow) 785 / 788 26.7 37.4 0.4X +LongToUnsafeRowMap (opt=false) 456 / 457 46.0 21.8 0.7X +LongToUnsafeRowMap (opt=true) 125 / 125 168.3 5.9 2.6X +BytesToBytesMap (off Heap) 885 / 885 23.7 42.2 0.4X +BytesToBytesMap (on Heap) 860 / 864 24.4 41.0 0.4X +Aggregate HashMap 56 / 56 373.9 2.7 5.8X + + diff --git a/sql/core/benchmarks/BloomFilterBenchmark-results.txt b/sql/core/benchmarks/BloomFilterBenchmark-results.txt new file mode 100644 index 000000000000..2eeb26c899b4 --- /dev/null +++ b/sql/core/benchmarks/BloomFilterBenchmark-results.txt @@ -0,0 +1,24 @@ +================================================================================================ +ORC Write +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Write 100M rows: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Without bloom filter 16765 / 17587 6.0 167.7 1.0X +With bloom filter 20060 / 20626 5.0 200.6 0.8X + + +================================================================================================ +ORC Read +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Read a row from 100M rows: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Without bloom filter 1857 / 1904 53.9 18.6 1.0X +With bloom filter 1399 / 1437 71.5 14.0 1.3X + + diff --git a/sql/core/benchmarks/ColumnarBatchBenchmark-results.txt 
b/sql/core/benchmarks/ColumnarBatchBenchmark-results.txt new file mode 100644 index 000000000000..59637162f0a1 --- /dev/null +++ b/sql/core/benchmarks/ColumnarBatchBenchmark-results.txt @@ -0,0 +1,59 @@ +================================================================================================ +Int Read/Write +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Int Read/Write: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Java Array 244 / 244 1342.3 0.7 1.0X +ByteBuffer Unsafe 445 / 445 736.5 1.4 0.5X +ByteBuffer API 2124 / 2125 154.3 6.5 0.1X +DirectByteBuffer 750 / 750 437.2 2.3 0.3X +Unsafe Buffer 234 / 236 1401.3 0.7 1.0X +Column(on heap) 245 / 245 1335.6 0.7 1.0X +Column(off heap) 489 / 489 670.3 1.5 0.5X +Column(off heap direct) 236 / 236 1388.1 0.7 1.0X +UnsafeRow (on heap) 532 / 534 616.0 1.6 0.5X +UnsafeRow (off heap) 564 / 565 580.7 1.7 0.4X +Column On Heap Append 489 / 489 670.6 1.5 0.5X + + +================================================================================================ +Boolean Read/Write +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Boolean Read/Write: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Bitset 879 / 879 381.9 2.6 1.0X +Byte Array 794 / 794 422.6 2.4 1.1X + + +================================================================================================ +String Read/Write +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +String Read/Write: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +On Heap 449 / 449 36.5 27.4 1.0X +Off Heap 679 / 679 24.1 41.4 0.7X + + +================================================================================================ +Array Vector Read +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Array Vector Read: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +On Heap Read Size Only 713 / 713 229.8 4.4 1.0X +Off Heap Read Size Only 757 / 757 216.5 4.6 0.9X +On Heap Read Elements 3648 / 3650 44.9 22.3 0.2X +Off Heap Read Elements 5263 / 5265 31.1 32.1 0.1X + + diff --git a/sql/core/benchmarks/CompressionSchemeBenchmark-results.txt b/sql/core/benchmarks/CompressionSchemeBenchmark-results.txt new file mode 100644 index 000000000000..caa9378301f5 --- /dev/null +++ b/sql/core/benchmarks/CompressionSchemeBenchmark-results.txt @@ -0,0 +1,137 @@ +================================================================================================ +Compression Scheme Benchmark 
+================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +BOOLEAN Encode: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +PassThrough(1.000) 4 / 4 17998.9 0.1 1.0X +RunLengthEncoding(2.501) 680 / 680 98.7 10.1 0.0X +BooleanBitSet(0.125) 365 / 365 183.9 5.4 0.0X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +BOOLEAN Decode: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +PassThrough 144 / 144 466.5 2.1 1.0X +RunLengthEncoding 679 / 679 98.9 10.1 0.2X +BooleanBitSet 1425 / 1431 47.1 21.2 0.1X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SHORT Encode (Lower Skew): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +PassThrough(1.000) 7 / 7 10115.0 0.1 1.0X +RunLengthEncoding(1.494) 1671 / 1672 40.2 24.9 0.0X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SHORT Decode (Lower Skew): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +PassThrough 1128 / 1128 59.5 16.8 1.0X +RunLengthEncoding 1630 / 1633 41.2 24.3 0.7X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SHORT Encode (Higher Skew): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +PassThrough(1.000) 7 / 7 10164.2 0.1 1.0X +RunLengthEncoding(1.989) 1562 / 1563 43.0 23.3 0.0X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SHORT Decode (Higher Skew): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +PassThrough 1127 / 1127 59.6 16.8 1.0X +RunLengthEncoding 1629 / 1631 41.2 24.3 0.7X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +INT Encode (Lower Skew): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +PassThrough(1.000) 22 / 23 2983.2 0.3 1.0X +RunLengthEncoding(1.003) 2426 / 2427 27.7 36.1 0.0X +DictionaryEncoding(0.500) 958 / 958 70.1 14.3 0.0X +IntDelta(0.250) 286 / 286 235.0 4.3 0.1X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +INT Decode (Lower Skew): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +PassThrough 1268 / 1269 52.9 18.9 1.0X +RunLengthEncoding 1906 / 1911 35.2 28.4 0.7X +DictionaryEncoding 981 / 982 68.4 14.6 1.3X +IntDelta 812 / 817 82.6 12.1 1.6X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +INT Encode 
(Higher Skew): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +PassThrough(1.000) 23 / 23 2926.9 0.3 1.0X +RunLengthEncoding(1.326) 2614 / 2614 25.7 38.9 0.0X +DictionaryEncoding(0.501) 1024 / 1024 65.5 15.3 0.0X +IntDelta(0.250) 286 / 286 234.7 4.3 0.1X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +INT Decode (Higher Skew): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +PassThrough 1433 / 1433 46.8 21.4 1.0X +RunLengthEncoding 1923 / 1926 34.9 28.6 0.7X +DictionaryEncoding 1285 / 1285 52.2 19.2 1.1X +IntDelta 1129 / 1137 59.4 16.8 1.3X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +LONG Encode (Lower Skew): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +PassThrough(1.000) 45 / 45 1495.6 0.7 1.0X +RunLengthEncoding(0.738) 2662 / 2663 25.2 39.7 0.0X +DictionaryEncoding(0.250) 1269 / 1269 52.9 18.9 0.0X +LongDelta(0.125) 450 / 450 149.1 6.7 0.1X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +LONG Decode (Lower Skew): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +PassThrough 1483 / 1483 45.3 22.1 1.0X +RunLengthEncoding 1875 / 1875 35.8 27.9 0.8X +DictionaryEncoding 1213 / 1214 55.3 18.1 1.2X +LongDelta 816 / 817 82.2 12.2 1.8X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +LONG Encode (Higher Skew): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +PassThrough(1.000) 45 / 45 1489.3 0.7 1.0X +RunLengthEncoding(1.003) 2906 / 2906 23.1 43.3 0.0X +DictionaryEncoding(0.251) 1610 / 1610 41.7 24.0 0.0X +LongDelta(0.125) 451 / 451 148.7 6.7 0.1X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +LONG Decode (Higher Skew): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +PassThrough 1485 / 1485 45.2 22.1 1.0X +RunLengthEncoding 1889 / 1890 35.5 28.2 0.8X +DictionaryEncoding 1215 / 1216 55.2 18.1 1.2X +LongDelta 1107 / 1110 60.6 16.5 1.3X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +STRING Encode: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +PassThrough(1.000) 67 / 68 994.5 1.0 1.0X +RunLengthEncoding(0.894) 5877 / 5882 11.4 87.6 0.0X +DictionaryEncoding(0.167) 3597 / 3602 18.7 53.6 0.0X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +STRING Decode: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +PassThrough 3243 / 3244 20.7 48.3 1.0X +RunLengthEncoding 3598 / 3601 18.7 53.6 0.9X +DictionaryEncoding 3182 / 
3182 21.1 47.4 1.0X + + diff --git a/sql/core/benchmarks/DatasetBenchmark-results.txt b/sql/core/benchmarks/DatasetBenchmark-results.txt new file mode 100644 index 000000000000..dcc190eb45c0 --- /dev/null +++ b/sql/core/benchmarks/DatasetBenchmark-results.txt @@ -0,0 +1,46 @@ +================================================================================================ +Dataset Benchmark +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +back-to-back map long: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +RDD 11800 / 12042 8.5 118.0 1.0X +DataFrame 1927 / 2189 51.9 19.3 6.1X +Dataset 2483 / 2605 40.3 24.8 4.8X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +back-to-back map: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +RDD 16286 / 16301 6.1 162.9 1.0X +DataFrame 8101 / 8104 12.3 81.0 2.0X +Dataset 17445 / 17811 5.7 174.4 0.9X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +back-to-back filter Long: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +RDD 2971 / 3184 33.7 29.7 1.0X +DataFrame 1243 / 1296 80.5 12.4 2.4X +Dataset 3062 / 3091 32.7 30.6 1.0X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +back-to-back filter: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +RDD 5253 / 5269 19.0 52.5 1.0X +DataFrame 211 / 234 473.4 2.1 24.9X +Dataset 9550 / 9552 10.5 95.5 0.6X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +aggregate: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +RDD sum 5086 / 5108 19.7 50.9 1.0X +DataFrame sum 65 / 73 1548.9 0.6 78.8X +Dataset sum using Aggregator 9024 / 9320 11.1 90.2 0.6X +Dataset complex Aggregator 15079 / 15171 6.6 150.8 0.3X + + diff --git a/sql/core/benchmarks/FilterPushdownBenchmark-results.txt b/sql/core/benchmarks/FilterPushdownBenchmark-results.txt index a75a15c99328..e680ddff53dd 100644 --- a/sql/core/benchmarks/FilterPushdownBenchmark-results.txt +++ b/sql/core/benchmarks/FilterPushdownBenchmark-results.txt @@ -2,737 +2,669 @@ Pushdown for many distinct value case ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz - +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 0 string row (value IS NULL): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 8970 / 9122 1.8 570.3 1.0X -Parquet Vectorized (Pushdown) 471 / 491 33.4 30.0 19.0X -Native ORC Vectorized 7661 / 7853 
2.1 487.0 1.2X -Native ORC Vectorized (Pushdown) 1134 / 1161 13.9 72.1 7.9X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 11405 / 11485 1.4 725.1 1.0X +Parquet Vectorized (Pushdown) 675 / 690 23.3 42.9 16.9X +Native ORC Vectorized 7127 / 7170 2.2 453.1 1.6X +Native ORC Vectorized (Pushdown) 519 / 541 30.3 33.0 22.0X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 0 string row ('7864320' < value < '7864320'): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 9246 / 9297 1.7 587.8 1.0X -Parquet Vectorized (Pushdown) 480 / 488 32.8 30.5 19.3X -Native ORC Vectorized 7838 / 7850 2.0 498.3 1.2X -Native ORC Vectorized (Pushdown) 1054 / 1118 14.9 67.0 8.8X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 11457 / 11473 1.4 728.4 1.0X +Parquet Vectorized (Pushdown) 656 / 686 24.0 41.7 17.5X +Native ORC Vectorized 7328 / 7342 2.1 465.9 1.6X +Native ORC Vectorized (Pushdown) 539 / 565 29.2 34.2 21.3X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 1 string row (value = '7864320'): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 8989 / 9100 1.7 571.5 1.0X -Parquet Vectorized (Pushdown) 448 / 467 35.1 28.5 20.1X -Native ORC Vectorized 7680 / 7768 2.0 488.3 1.2X -Native ORC Vectorized (Pushdown) 1067 / 1118 14.7 67.8 8.4X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 11878 / 11888 1.3 755.2 1.0X +Parquet Vectorized (Pushdown) 630 / 654 25.0 40.1 18.9X +Native ORC Vectorized 7342 / 7362 2.1 466.8 1.6X +Native ORC Vectorized (Pushdown) 519 / 537 30.3 33.0 22.9X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 1 string row (value <=> '7864320'): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 9115 / 9266 1.7 579.5 1.0X -Parquet Vectorized (Pushdown) 466 / 492 33.7 29.7 19.5X -Native ORC Vectorized 7800 / 7914 2.0 495.9 1.2X -Native ORC Vectorized (Pushdown) 1075 / 1102 14.6 68.4 8.5X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 11423 / 11440 1.4 726.2 1.0X +Parquet Vectorized (Pushdown) 625 / 643 25.2 39.7 18.3X +Native ORC Vectorized 7315 / 7335 2.2 465.1 1.6X +Native ORC Vectorized (Pushdown) 507 / 520 31.0 32.2 22.5X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 1 string row ('7864320' <= value <= '7864320'): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 9099 / 9237 1.7 578.5 1.0X -Parquet Vectorized (Pushdown) 462 / 475 34.1 29.3 19.7X -Native ORC Vectorized 7847 / 7925 2.0 498.9 1.2X -Native ORC Vectorized (Pushdown) 1078 / 1114 14.6 68.5 8.4X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 
on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 11440 / 11478 1.4 727.3 1.0X +Parquet Vectorized (Pushdown) 634 / 652 24.8 40.3 18.0X +Native ORC Vectorized 7311 / 7324 2.2 464.8 1.6X +Native ORC Vectorized (Pushdown) 517 / 548 30.4 32.8 22.1X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select all string rows (value IS NOT NULL): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 19303 / 19547 0.8 1227.3 1.0X -Parquet Vectorized (Pushdown) 19924 / 20089 0.8 1266.7 1.0X -Native ORC Vectorized 18725 / 19079 0.8 1190.5 1.0X -Native ORC Vectorized (Pushdown) 19310 / 19492 0.8 1227.7 1.0X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 20750 / 20872 0.8 1319.3 1.0X +Parquet Vectorized (Pushdown) 21002 / 21032 0.7 1335.3 1.0X +Native ORC Vectorized 16714 / 16742 0.9 1062.6 1.2X +Native ORC Vectorized (Pushdown) 16926 / 16965 0.9 1076.1 1.2X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 0 int row (value IS NULL): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 8117 / 8323 1.9 516.1 1.0X -Parquet Vectorized (Pushdown) 484 / 494 32.5 30.8 16.8X -Native ORC Vectorized 6811 / 7036 2.3 433.0 1.2X -Native ORC Vectorized (Pushdown) 1061 / 1082 14.8 67.5 7.6X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 10510 / 10532 1.5 668.2 1.0X +Parquet Vectorized (Pushdown) 642 / 665 24.5 40.8 16.4X +Native ORC Vectorized 6609 / 6618 2.4 420.2 1.6X +Native ORC Vectorized (Pushdown) 502 / 512 31.4 31.9 21.0X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 0 int row (7864320 < value < 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 8105 / 8140 1.9 515.3 1.0X -Parquet Vectorized (Pushdown) 478 / 505 32.9 30.4 17.0X -Native ORC Vectorized 6914 / 7211 2.3 439.6 1.2X -Native ORC Vectorized (Pushdown) 1044 / 1064 15.1 66.4 7.8X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 10505 / 10514 1.5 667.9 1.0X +Parquet Vectorized (Pushdown) 659 / 673 23.9 41.9 15.9X +Native ORC Vectorized 6634 / 6641 2.4 421.8 1.6X +Native ORC Vectorized (Pushdown) 513 / 526 30.7 32.6 20.5X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 1 int row (value = 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 7983 / 8116 2.0 507.6 1.0X -Parquet Vectorized (Pushdown) 464 / 487 33.9 29.5 17.2X -Native ORC Vectorized 6703 / 6774 2.3 426.1 1.2X -Native ORC Vectorized (Pushdown) 1017 / 1058 15.5 64.6 7.9X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 10555 / 10570 1.5 671.1 1.0X +Parquet Vectorized 
(Pushdown) 651 / 668 24.2 41.4 16.2X +Native ORC Vectorized 6721 / 6728 2.3 427.3 1.6X +Native ORC Vectorized (Pushdown) 508 / 519 31.0 32.3 20.8X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 1 int row (value <=> 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 7942 / 7983 2.0 504.9 1.0X -Parquet Vectorized (Pushdown) 468 / 479 33.6 29.7 17.0X -Native ORC Vectorized 6677 / 6779 2.4 424.5 1.2X -Native ORC Vectorized (Pushdown) 1021 / 1068 15.4 64.9 7.8X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 10556 / 10566 1.5 671.1 1.0X +Parquet Vectorized (Pushdown) 647 / 654 24.3 41.1 16.3X +Native ORC Vectorized 6716 / 6728 2.3 427.0 1.6X +Native ORC Vectorized (Pushdown) 510 / 521 30.9 32.4 20.7X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 1 int row (7864320 <= value <= 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 7909 / 7958 2.0 502.8 1.0X -Parquet Vectorized (Pushdown) 485 / 494 32.4 30.8 16.3X -Native ORC Vectorized 6751 / 6846 2.3 429.2 1.2X -Native ORC Vectorized (Pushdown) 1043 / 1077 15.1 66.3 7.6X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 10556 / 10565 1.5 671.1 1.0X +Parquet Vectorized (Pushdown) 649 / 654 24.2 41.3 16.3X +Native ORC Vectorized 6700 / 6712 2.3 426.0 1.6X +Native ORC Vectorized (Pushdown) 509 / 520 30.9 32.3 20.8X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 1 int row (7864319 < value < 7864321): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 8010 / 8033 2.0 509.2 1.0X -Parquet Vectorized (Pushdown) 472 / 489 33.3 30.0 17.0X -Native ORC Vectorized 6655 / 6808 2.4 423.1 1.2X -Native ORC Vectorized (Pushdown) 1015 / 1067 15.5 64.5 7.9X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 10547 / 10566 1.5 670.5 1.0X +Parquet Vectorized (Pushdown) 649 / 653 24.2 41.3 16.3X +Native ORC Vectorized 6703 / 6713 2.3 426.2 1.6X +Native ORC Vectorized (Pushdown) 510 / 520 30.8 32.5 20.7X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 10% int rows (value < 1572864): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 8983 / 9035 1.8 571.1 1.0X -Parquet Vectorized (Pushdown) 2204 / 2231 7.1 140.1 4.1X -Native ORC Vectorized 7864 / 8011 2.0 500.0 1.1X -Native ORC Vectorized (Pushdown) 2674 / 2789 5.9 170.0 3.4X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 11478 / 11525 1.4 729.7 1.0X +Parquet Vectorized (Pushdown) 2576 / 2587 6.1 163.8 4.5X +Native ORC Vectorized 7633 / 7657 2.1 485.3 1.5X +Native ORC Vectorized (Pushdown) 2076 / 2096 7.6 132.0 
5.5X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 50% int rows (value < 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 12723 / 12903 1.2 808.9 1.0X -Parquet Vectorized (Pushdown) 9112 / 9282 1.7 579.3 1.4X -Native ORC Vectorized 12090 / 12230 1.3 768.7 1.1X -Native ORC Vectorized (Pushdown) 9242 / 9372 1.7 587.6 1.4X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 14785 / 14802 1.1 940.0 1.0X +Parquet Vectorized (Pushdown) 9971 / 9977 1.6 633.9 1.5X +Native ORC Vectorized 11082 / 11107 1.4 704.6 1.3X +Native ORC Vectorized (Pushdown) 8061 / 8073 2.0 512.5 1.8X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 90% int rows (value < 14155776): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 16453 / 16678 1.0 1046.1 1.0X -Parquet Vectorized (Pushdown) 15997 / 16262 1.0 1017.0 1.0X -Native ORC Vectorized 16652 / 17070 0.9 1058.7 1.0X -Native ORC Vectorized (Pushdown) 15843 / 16112 1.0 1007.2 1.0X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 18174 / 18214 0.9 1155.5 1.0X +Parquet Vectorized (Pushdown) 17387 / 17403 0.9 1105.5 1.0X +Native ORC Vectorized 14465 / 14492 1.1 919.7 1.3X +Native ORC Vectorized (Pushdown) 14024 / 14041 1.1 891.6 1.3X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select all int rows (value IS NOT NULL): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 17098 / 17254 0.9 1087.1 1.0X -Parquet Vectorized (Pushdown) 17302 / 17529 0.9 1100.1 1.0X -Native ORC Vectorized 16790 / 17098 0.9 1067.5 1.0X -Native ORC Vectorized (Pushdown) 17329 / 17914 0.9 1101.7 1.0X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 19004 / 19014 0.8 1208.2 1.0X +Parquet Vectorized (Pushdown) 19219 / 19232 0.8 1221.9 1.0X +Native ORC Vectorized 15266 / 15290 1.0 970.6 1.2X +Native ORC Vectorized (Pushdown) 15469 / 15482 1.0 983.5 1.2X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select all int rows (value > -1): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 17088 / 17392 0.9 1086.4 1.0X -Parquet Vectorized (Pushdown) 17609 / 17863 0.9 1119.5 1.0X -Native ORC Vectorized 18334 / 69831 0.9 1165.7 0.9X -Native ORC Vectorized (Pushdown) 17465 / 17629 0.9 1110.4 1.0X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 19036 / 19052 0.8 1210.3 1.0X +Parquet Vectorized (Pushdown) 19287 / 19306 0.8 1226.2 1.0X +Native ORC Vectorized 15311 / 15371 1.0 973.5 1.2X +Native ORC Vectorized (Pushdown) 15517 / 15590 1.0 986.5 1.2X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select all int rows (value != -1): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 16903 / 17233 0.9 1074.6 1.0X -Parquet Vectorized (Pushdown) 16945 / 17032 0.9 1077.3 1.0X -Native ORC Vectorized 16377 / 16762 1.0 1041.2 1.0X -Native ORC Vectorized (Pushdown) 16950 / 17212 0.9 1077.7 1.0X +Parquet Vectorized 19072 / 19102 0.8 1212.6 1.0X +Parquet Vectorized (Pushdown) 19288 / 19318 0.8 1226.3 1.0X +Native ORC Vectorized 15277 / 15293 1.0 971.3 1.2X +Native ORC Vectorized (Pushdown) 15479 / 15499 1.0 984.1 1.2X ================================================================================================ Pushdown for few distinct value case (use dictionary encoding) ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz - +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 0 distinct string row (value IS NULL): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 7245 / 7322 2.2 460.7 1.0X -Parquet Vectorized (Pushdown) 378 / 389 41.6 24.0 19.2X -Native ORC Vectorized 6720 / 6778 2.3 427.2 1.1X -Native ORC Vectorized (Pushdown) 1009 / 1032 15.6 64.2 7.2X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 10250 / 10274 1.5 651.7 1.0X +Parquet Vectorized (Pushdown) 571 / 576 27.5 36.3 17.9X +Native ORC Vectorized 8651 / 8660 1.8 550.0 1.2X +Native ORC Vectorized (Pushdown) 909 / 933 17.3 57.8 11.3X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 0 distinct string row ('100' < value < '100'): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 7627 / 7795 2.1 484.9 1.0X -Parquet Vectorized (Pushdown) 384 / 406 41.0 24.4 19.9X -Native ORC Vectorized 6724 / 7824 2.3 427.5 1.1X -Native ORC Vectorized (Pushdown) 968 / 986 16.3 61.5 7.9X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 10420 / 10426 1.5 662.5 1.0X +Parquet Vectorized (Pushdown) 574 / 579 27.4 36.5 18.2X +Native ORC Vectorized 8973 / 8982 1.8 570.5 1.2X +Native ORC Vectorized (Pushdown) 916 / 955 17.2 58.2 11.4X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 1 distinct string row (value = '100'): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 7157 / 7534 2.2 455.0 1.0X -Parquet Vectorized (Pushdown) 542 / 565 29.0 34.5 13.2X -Native ORC Vectorized 6716 / 7214 2.3 427.0 1.1X -Native ORC Vectorized (Pushdown) 1212 / 1288 13.0 77.0 5.9X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 10428 / 10441 1.5 663.0 1.0X +Parquet Vectorized (Pushdown) 789 / 809 19.9 50.2 13.2X +Native ORC Vectorized 9042 / 9055 1.7 574.9 
1.2X +Native ORC Vectorized (Pushdown) 1130 / 1145 13.9 71.8 9.2X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 1 distinct string row (value <=> '100'): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 7368 / 7552 2.1 468.4 1.0X -Parquet Vectorized (Pushdown) 544 / 556 28.9 34.6 13.5X -Native ORC Vectorized 6740 / 6867 2.3 428.5 1.1X -Native ORC Vectorized (Pushdown) 1230 / 1426 12.8 78.2 6.0X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 10402 / 10416 1.5 661.3 1.0X +Parquet Vectorized (Pushdown) 791 / 806 19.9 50.3 13.2X +Native ORC Vectorized 9042 / 9055 1.7 574.9 1.2X +Native ORC Vectorized (Pushdown) 1112 / 1145 14.1 70.7 9.4X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 1 distinct string row ('100' <= value <= '100'): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 7427 / 7734 2.1 472.2 1.0X -Parquet Vectorized (Pushdown) 556 / 568 28.3 35.4 13.3X -Native ORC Vectorized 6847 / 7059 2.3 435.3 1.1X -Native ORC Vectorized (Pushdown) 1226 / 1230 12.8 77.9 6.1X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 10548 / 10563 1.5 670.6 1.0X +Parquet Vectorized (Pushdown) 790 / 796 19.9 50.2 13.4X +Native ORC Vectorized 9144 / 9153 1.7 581.3 1.2X +Native ORC Vectorized (Pushdown) 1117 / 1148 14.1 71.0 9.4X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select all distinct string rows (value IS NOT NULL): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 16998 / 17311 0.9 1080.7 1.0X -Parquet Vectorized (Pushdown) 16977 / 17250 0.9 1079.4 1.0X -Native ORC Vectorized 18447 / 19852 0.9 1172.8 0.9X -Native ORC Vectorized (Pushdown) 16614 / 17102 0.9 1056.3 1.0X +Parquet Vectorized 20445 / 20469 0.8 1299.8 1.0X +Parquet Vectorized (Pushdown) 20686 / 20699 0.8 1315.2 1.0X +Native ORC Vectorized 18851 / 18953 0.8 1198.5 1.1X +Native ORC Vectorized (Pushdown) 19255 / 19268 0.8 1224.2 1.1X ================================================================================================ Pushdown benchmark for StringStartsWith ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz - +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz StringStartsWith filter: (value like '10%'): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 9705 / 10814 1.6 617.0 1.0X -Parquet Vectorized (Pushdown) 3086 / 3574 5.1 196.2 3.1X -Native ORC Vectorized 10094 / 10695 1.6 641.8 1.0X -Native ORC Vectorized (Pushdown) 9611 / 9999 1.6 611.0 1.0X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz 
+Parquet Vectorized 14265 / 15213 1.1 907.0 1.0X +Parquet Vectorized (Pushdown) 4228 / 4870 3.7 268.8 3.4X +Native ORC Vectorized 10116 / 10977 1.6 643.2 1.4X +Native ORC Vectorized (Pushdown) 10653 / 11376 1.5 677.3 1.3X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz StringStartsWith filter: (value like '1000%'): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 8016 / 8183 2.0 509.7 1.0X -Parquet Vectorized (Pushdown) 444 / 457 35.4 28.2 18.0X -Native ORC Vectorized 6970 / 7169 2.3 443.2 1.2X -Native ORC Vectorized (Pushdown) 7447 / 7503 2.1 473.5 1.1X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 11499 / 11539 1.4 731.1 1.0X +Parquet Vectorized (Pushdown) 669 / 672 23.5 42.5 17.2X +Native ORC Vectorized 7343 / 7363 2.1 466.8 1.6X +Native ORC Vectorized (Pushdown) 7559 / 7568 2.1 480.6 1.5X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz StringStartsWith filter: (value like '786432%'): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 7908 / 8046 2.0 502.8 1.0X -Parquet Vectorized (Pushdown) 408 / 429 38.6 25.9 19.4X -Native ORC Vectorized 7021 / 7100 2.2 446.4 1.1X -Native ORC Vectorized (Pushdown) 7310 / 7490 2.2 464.8 1.1X +Parquet Vectorized 11463 / 11468 1.4 728.8 1.0X +Parquet Vectorized (Pushdown) 647 / 651 24.3 41.1 17.7X +Native ORC Vectorized 7322 / 7338 2.1 465.5 1.6X +Native ORC Vectorized (Pushdown) 7533 / 7544 2.1 478.9 1.5X ================================================================================================ Pushdown benchmark for decimal ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz - +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 1 decimal(9, 2) row (value = 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 4546 / 4743 3.5 289.0 1.0X -Parquet Vectorized (Pushdown) 161 / 175 98.0 10.2 28.3X -Native ORC Vectorized 5721 / 5842 2.7 363.7 0.8X -Native ORC Vectorized (Pushdown) 1019 / 1070 15.4 64.8 4.5X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 5543 / 5564 2.8 352.4 1.0X +Parquet Vectorized (Pushdown) 168 / 174 93.7 10.7 33.0X +Native ORC Vectorized 4992 / 5052 3.2 317.4 1.1X +Native ORC Vectorized (Pushdown) 840 / 850 18.7 53.4 6.6X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 10% decimal(9, 2) rows (value < 1572864): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 6340 / 7236 2.5 403.1 1.0X -Parquet Vectorized (Pushdown) 3052 / 3164 5.2 194.1 2.1X -Native ORC Vectorized 8370 / 9214 1.9 532.1 0.8X -Native ORC Vectorized (Pushdown) 4137 / 4242 3.8 263.0 1.5X - -Java 
HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 7312 / 7358 2.2 464.9 1.0X +Parquet Vectorized (Pushdown) 3008 / 3078 5.2 191.2 2.4X +Native ORC Vectorized 6775 / 6798 2.3 430.7 1.1X +Native ORC Vectorized (Pushdown) 6819 / 6832 2.3 433.5 1.1X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 50% decimal(9, 2) rows (value < 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 12976 / 13249 1.2 825.0 1.0X -Parquet Vectorized (Pushdown) 12655 / 13570 1.2 804.6 1.0X -Native ORC Vectorized 15562 / 15950 1.0 989.4 0.8X -Native ORC Vectorized (Pushdown) 15042 / 15668 1.0 956.3 0.9X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 13232 / 13241 1.2 841.3 1.0X +Parquet Vectorized (Pushdown) 12555 / 12569 1.3 798.2 1.1X +Native ORC Vectorized 12597 / 12627 1.2 800.9 1.1X +Native ORC Vectorized (Pushdown) 12677 / 12711 1.2 806.0 1.0X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 90% decimal(9, 2) rows (value < 14155776): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 14303 / 14616 1.1 909.3 1.0X -Parquet Vectorized (Pushdown) 14380 / 14649 1.1 914.3 1.0X -Native ORC Vectorized 16964 / 17358 0.9 1078.5 0.8X -Native ORC Vectorized (Pushdown) 17255 / 17874 0.9 1097.0 0.8X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 14725 / 14729 1.1 936.2 1.0X +Parquet Vectorized (Pushdown) 14781 / 14800 1.1 939.7 1.0X +Native ORC Vectorized 15360 / 15453 1.0 976.5 1.0X +Native ORC Vectorized (Pushdown) 15444 / 15466 1.0 981.9 1.0X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 1 decimal(18, 2) row (value = 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 4701 / 6416 3.3 298.9 1.0X -Parquet Vectorized (Pushdown) 128 / 164 122.8 8.1 36.7X -Native ORC Vectorized 5698 / 7904 2.8 362.3 0.8X -Native ORC Vectorized (Pushdown) 913 / 942 17.2 58.0 5.2X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 5746 / 5763 2.7 365.3 1.0X +Parquet Vectorized (Pushdown) 166 / 169 94.8 10.6 34.6X +Native ORC Vectorized 5007 / 5023 3.1 318.3 1.1X +Native ORC Vectorized (Pushdown) 2629 / 2640 6.0 167.1 2.2X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 10% decimal(18, 2) rows (value < 1572864): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 5376 / 5461 2.9 341.8 1.0X -Parquet Vectorized (Pushdown) 1479 / 1543 10.6 94.0 3.6X -Native ORC Vectorized 6640 / 6748 2.4 422.2 0.8X -Native ORC Vectorized (Pushdown) 2438 / 2479 6.5 155.0 2.2X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) 
i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 6827 / 6864 2.3 434.0 1.0X +Parquet Vectorized (Pushdown) 1809 / 1827 8.7 115.0 3.8X +Native ORC Vectorized 6287 / 6296 2.5 399.7 1.1X +Native ORC Vectorized (Pushdown) 6364 / 6377 2.5 404.6 1.1X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 50% decimal(18, 2) rows (value < 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 9224 / 9356 1.7 586.5 1.0X -Parquet Vectorized (Pushdown) 7172 / 7415 2.2 456.0 1.3X -Native ORC Vectorized 11017 / 11408 1.4 700.4 0.8X -Native ORC Vectorized (Pushdown) 8771 / 10218 1.8 557.7 1.1X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 11315 / 11342 1.4 719.4 1.0X +Parquet Vectorized (Pushdown) 8431 / 8450 1.9 536.0 1.3X +Native ORC Vectorized 11591 / 11611 1.4 736.9 1.0X +Native ORC Vectorized (Pushdown) 11424 / 11475 1.4 726.3 1.0X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 90% decimal(18, 2) rows (value < 14155776): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 13933 / 15990 1.1 885.8 1.0X -Parquet Vectorized (Pushdown) 12683 / 12942 1.2 806.4 1.1X -Native ORC Vectorized 16344 / 20196 1.0 1039.1 0.9X -Native ORC Vectorized (Pushdown) 15162 / 16627 1.0 964.0 0.9X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 15703 / 15712 1.0 998.4 1.0X +Parquet Vectorized (Pushdown) 14982 / 15009 1.0 952.5 1.0X +Native ORC Vectorized 16887 / 16955 0.9 1073.7 0.9X +Native ORC Vectorized (Pushdown) 16518 / 16530 1.0 1050.2 1.0X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 1 decimal(38, 2) row (value = 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 7102 / 8282 2.2 451.5 1.0X -Parquet Vectorized (Pushdown) 124 / 150 126.4 7.9 57.1X -Native ORC Vectorized 5811 / 6883 2.7 369.5 1.2X -Native ORC Vectorized (Pushdown) 1121 / 1502 14.0 71.3 6.3X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 8101 / 8130 1.9 515.1 1.0X +Parquet Vectorized (Pushdown) 184 / 187 85.6 11.7 44.1X +Native ORC Vectorized 4998 / 5027 3.1 317.8 1.6X +Native ORC Vectorized (Pushdown) 165 / 168 95.6 10.5 49.2X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 10% decimal(38, 2) rows (value < 1572864): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 6894 / 7562 2.3 438.3 1.0X -Parquet Vectorized (Pushdown) 1863 / 1980 8.4 118.4 3.7X -Native ORC Vectorized 6812 / 6848 2.3 433.1 1.0X -Native ORC Vectorized (Pushdown) 2511 / 2598 6.3 159.7 2.7X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 9405 / 9447 1.7 597.9 1.0X +Parquet Vectorized 
(Pushdown) 2269 / 2275 6.9 144.2 4.1X +Native ORC Vectorized 6167 / 6203 2.6 392.1 1.5X +Native ORC Vectorized (Pushdown) 1783 / 1787 8.8 113.3 5.3X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 50% decimal(38, 2) rows (value < 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 11732 / 12183 1.3 745.9 1.0X -Parquet Vectorized (Pushdown) 8912 / 9945 1.8 566.6 1.3X -Native ORC Vectorized 11499 / 12387 1.4 731.1 1.0X -Native ORC Vectorized (Pushdown) 9328 / 9382 1.7 593.1 1.3X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 14700 / 14707 1.1 934.6 1.0X +Parquet Vectorized (Pushdown) 10699 / 10712 1.5 680.2 1.4X +Native ORC Vectorized 10687 / 10703 1.5 679.5 1.4X +Native ORC Vectorized (Pushdown) 8364 / 8415 1.9 531.8 1.8X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 90% decimal(38, 2) rows (value < 14155776): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 16272 / 16328 1.0 1034.6 1.0X -Parquet Vectorized (Pushdown) 15714 / 18100 1.0 999.1 1.0X -Native ORC Vectorized 16539 / 18897 1.0 1051.5 1.0X -Native ORC Vectorized (Pushdown) 16328 / 17306 1.0 1038.1 1.0X +Parquet Vectorized 19780 / 19894 0.8 1257.6 1.0X +Parquet Vectorized (Pushdown) 19003 / 19025 0.8 1208.1 1.0X +Native ORC Vectorized 15385 / 15404 1.0 978.2 1.3X +Native ORC Vectorized (Pushdown) 15032 / 15060 1.0 955.7 1.3X ================================================================================================ Pushdown benchmark for InSet -> InFilters ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz - +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz InSet -> InFilters (values count: 5, distribution: 10): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 7993 / 8104 2.0 508.2 1.0X -Parquet Vectorized (Pushdown) 507 / 532 31.0 32.2 15.8X -Native ORC Vectorized 6922 / 7163 2.3 440.1 1.2X -Native ORC Vectorized (Pushdown) 1017 / 1058 15.5 64.6 7.9X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 10521 / 10534 1.5 668.9 1.0X +Parquet Vectorized (Pushdown) 677 / 691 23.2 43.1 15.5X +Native ORC Vectorized 6768 / 6776 2.3 430.3 1.6X +Native ORC Vectorized (Pushdown) 501 / 512 31.4 31.8 21.0X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz InSet -> InFilters (values count: 5, distribution: 50): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 7855 / 7963 2.0 499.4 1.0X -Parquet Vectorized (Pushdown) 503 / 516 31.3 32.0 15.6X -Native ORC Vectorized 6825 / 6954 2.3 433.9 1.2X -Native ORC Vectorized (Pushdown) 1019 / 1044 15.4 64.8 7.7X - -Java 
HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 10531 / 10538 1.5 669.5 1.0X +Parquet Vectorized (Pushdown) 677 / 718 23.2 43.0 15.6X +Native ORC Vectorized 6765 / 6773 2.3 430.1 1.6X +Native ORC Vectorized (Pushdown) 499 / 507 31.5 31.7 21.1X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz InSet -> InFilters (values count: 5, distribution: 90): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 7858 / 7928 2.0 499.6 1.0X -Parquet Vectorized (Pushdown) 490 / 519 32.1 31.1 16.0X -Native ORC Vectorized 7079 / 7966 2.2 450.1 1.1X -Native ORC Vectorized (Pushdown) 1276 / 1673 12.3 81.1 6.2X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 10540 / 10553 1.5 670.1 1.0X +Parquet Vectorized (Pushdown) 678 / 710 23.2 43.1 15.5X +Native ORC Vectorized 6787 / 6794 2.3 431.5 1.6X +Native ORC Vectorized (Pushdown) 501 / 509 31.4 31.9 21.0X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz InSet -> InFilters (values count: 10, distribution: 10): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 8007 / 11155 2.0 509.0 1.0X -Parquet Vectorized (Pushdown) 519 / 540 30.3 33.0 15.4X -Native ORC Vectorized 6848 / 7072 2.3 435.4 1.2X -Native ORC Vectorized (Pushdown) 1026 / 1050 15.3 65.2 7.8X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 10551 / 10559 1.5 670.8 1.0X +Parquet Vectorized (Pushdown) 703 / 708 22.4 44.7 15.0X +Native ORC Vectorized 6791 / 6802 2.3 431.7 1.6X +Native ORC Vectorized (Pushdown) 519 / 526 30.3 33.0 20.3X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz InSet -> InFilters (values count: 10, distribution: 50): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 7876 / 7956 2.0 500.7 1.0X -Parquet Vectorized (Pushdown) 521 / 535 30.2 33.1 15.1X -Native ORC Vectorized 7051 / 7368 2.2 448.3 1.1X -Native ORC Vectorized (Pushdown) 1014 / 1035 15.5 64.5 7.8X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 10561 / 10565 1.5 671.4 1.0X +Parquet Vectorized (Pushdown) 711 / 716 22.1 45.2 14.9X +Native ORC Vectorized 6791 / 6806 2.3 431.8 1.6X +Native ORC Vectorized (Pushdown) 529 / 537 29.8 33.6 20.0X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz InSet -> InFilters (values count: 10, distribution: 90): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 7897 / 8229 2.0 502.1 1.0X -Parquet Vectorized (Pushdown) 513 / 530 30.7 32.6 15.4X -Native ORC Vectorized 6730 / 6990 2.3 427.9 1.2X -Native ORC Vectorized (Pushdown) 1003 / 1036 15.7 63.8 7.9X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) 
i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 10572 / 10590 1.5 672.1 1.0X +Parquet Vectorized (Pushdown) 713 / 716 22.1 45.3 14.8X +Native ORC Vectorized 6808 / 6815 2.3 432.9 1.6X +Native ORC Vectorized (Pushdown) 530 / 541 29.7 33.7 19.9X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz InSet -> InFilters (values count: 50, distribution: 10): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 7967 / 8175 2.0 506.5 1.0X -Parquet Vectorized (Pushdown) 8155 / 8434 1.9 518.5 1.0X -Native ORC Vectorized 7002 / 7107 2.2 445.2 1.1X -Native ORC Vectorized (Pushdown) 1092 / 1139 14.4 69.4 7.3X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 10871 / 10882 1.4 691.2 1.0X +Parquet Vectorized (Pushdown) 11104 / 11110 1.4 706.0 1.0X +Native ORC Vectorized 7088 / 7104 2.2 450.7 1.5X +Native ORC Vectorized (Pushdown) 665 / 677 23.6 42.3 16.3X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz InSet -> InFilters (values count: 50, distribution: 50): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 8032 / 8122 2.0 510.7 1.0X -Parquet Vectorized (Pushdown) 8141 / 8908 1.9 517.6 1.0X -Native ORC Vectorized 7140 / 7387 2.2 454.0 1.1X -Native ORC Vectorized (Pushdown) 1156 / 1220 13.6 73.5 6.9X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 10861 / 10867 1.4 690.5 1.0X +Parquet Vectorized (Pushdown) 11094 / 11099 1.4 705.3 1.0X +Native ORC Vectorized 7075 / 7092 2.2 449.8 1.5X +Native ORC Vectorized (Pushdown) 718 / 733 21.9 45.6 15.1X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz InSet -> InFilters (values count: 50, distribution: 90): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 8088 / 8350 1.9 514.2 1.0X -Parquet Vectorized (Pushdown) 8629 / 8702 1.8 548.6 0.9X -Native ORC Vectorized 7480 / 7886 2.1 475.6 1.1X -Native ORC Vectorized (Pushdown) 1106 / 1145 14.2 70.3 7.3X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 10868 / 10887 1.4 691.0 1.0X +Parquet Vectorized (Pushdown) 11100 / 11106 1.4 705.7 1.0X +Native ORC Vectorized 7087 / 7093 2.2 450.6 1.5X +Native ORC Vectorized (Pushdown) 712 / 731 22.1 45.3 15.3X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz InSet -> InFilters (values count: 100, distribution: 10): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 8028 / 8165 2.0 510.4 1.0X -Parquet Vectorized (Pushdown) 8349 / 8674 1.9 530.8 1.0X -Native ORC Vectorized 7107 / 7354 2.2 451.8 1.1X -Native ORC Vectorized (Pushdown) 1175 / 1207 13.4 74.7 6.8X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 10850 / 10888 1.4 689.8 1.0X 
+Parquet Vectorized (Pushdown) 11086 / 11105 1.4 704.9 1.0X +Native ORC Vectorized 7090 / 7101 2.2 450.8 1.5X +Native ORC Vectorized (Pushdown) 867 / 882 18.1 55.1 12.5X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz InSet -> InFilters (values count: 100, distribution: 50): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 8041 / 8195 2.0 511.2 1.0X -Parquet Vectorized (Pushdown) 8466 / 8604 1.9 538.2 0.9X -Native ORC Vectorized 7116 / 7286 2.2 452.4 1.1X -Native ORC Vectorized (Pushdown) 1197 / 1214 13.1 76.1 6.7X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 10816 / 10819 1.5 687.7 1.0X +Parquet Vectorized (Pushdown) 11052 / 11059 1.4 702.7 1.0X +Native ORC Vectorized 7037 / 7044 2.2 447.4 1.5X +Native ORC Vectorized (Pushdown) 919 / 931 17.1 58.4 11.8X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz InSet -> InFilters (values count: 100, distribution: 90): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 7998 / 8311 2.0 508.5 1.0X -Parquet Vectorized (Pushdown) 9366 / 11257 1.7 595.5 0.9X -Native ORC Vectorized 7856 / 9273 2.0 499.5 1.0X -Native ORC Vectorized (Pushdown) 1350 / 1747 11.7 85.8 5.9X +Parquet Vectorized 10807 / 10815 1.5 687.1 1.0X +Parquet Vectorized (Pushdown) 11047 / 11054 1.4 702.4 1.0X +Native ORC Vectorized 7042 / 7047 2.2 447.7 1.5X +Native ORC Vectorized (Pushdown) 950 / 961 16.6 60.4 11.4X ================================================================================================ Pushdown benchmark for tinyint ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz - +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 1 tinyint row (value = CAST(63 AS tinyint)): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 3461 / 3997 4.5 220.1 1.0X -Parquet Vectorized (Pushdown) 270 / 315 58.4 17.1 12.8X -Native ORC Vectorized 4107 / 5372 3.8 261.1 0.8X -Native ORC Vectorized (Pushdown) 778 / 1553 20.2 49.5 4.4X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 6034 / 6048 2.6 383.6 1.0X +Parquet Vectorized (Pushdown) 333 / 344 47.2 21.2 18.1X +Native ORC Vectorized 3240 / 3307 4.9 206.0 1.9X +Native ORC Vectorized (Pushdown) 330 / 341 47.6 21.0 18.3X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 10% tinyint rows (value < CAST(12 AS tinyint)): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 4771 / 6655 3.3 303.3 1.0X -Parquet Vectorized (Pushdown) 1322 / 1606 11.9 84.0 3.6X -Native ORC Vectorized 4437 / 4572 3.5 282.1 1.1X -Native ORC Vectorized (Pushdown) 1781 / 1976 8.8 113.2 2.7X - -Java HotSpot(TM) 
64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 6759 / 6800 2.3 429.7 1.0X +Parquet Vectorized (Pushdown) 1533 / 1537 10.3 97.5 4.4X +Native ORC Vectorized 3863 / 3874 4.1 245.6 1.7X +Native ORC Vectorized (Pushdown) 1235 / 1248 12.7 78.5 5.5X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 50% tinyint rows (value < CAST(63 AS tinyint)): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 7433 / 7752 2.1 472.6 1.0X -Parquet Vectorized (Pushdown) 5863 / 5913 2.7 372.8 1.3X -Native ORC Vectorized 7986 / 8084 2.0 507.7 0.9X -Native ORC Vectorized (Pushdown) 6522 / 6608 2.4 414.6 1.1X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 10247 / 10289 1.5 651.5 1.0X +Parquet Vectorized (Pushdown) 7430 / 7453 2.1 472.4 1.4X +Native ORC Vectorized 6995 / 7009 2.2 444.7 1.5X +Native ORC Vectorized (Pushdown) 5561 / 5571 2.8 353.6 1.8X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 90% tinyint rows (value < CAST(114 AS tinyint)): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 11190 / 11519 1.4 711.4 1.0X -Parquet Vectorized (Pushdown) 10861 / 11206 1.4 690.5 1.0X -Native ORC Vectorized 11622 / 12196 1.4 738.9 1.0X -Native ORC Vectorized (Pushdown) 11377 / 11654 1.4 723.3 1.0X +Parquet Vectorized 13949 / 13991 1.1 886.9 1.0X +Parquet Vectorized (Pushdown) 13486 / 13511 1.2 857.4 1.0X +Native ORC Vectorized 10149 / 10186 1.5 645.3 1.4X +Native ORC Vectorized (Pushdown) 9889 / 9905 1.6 628.7 1.4X ================================================================================================ Pushdown benchmark for Timestamp ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz - +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 1 timestamp stored as INT96 row (value = CAST(7864320 AS timestamp)): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 4784 / 4956 3.3 304.2 1.0X -Parquet Vectorized (Pushdown) 4838 / 4917 3.3 307.6 1.0X -Native ORC Vectorized 3923 / 4173 4.0 249.4 1.2X -Native ORC Vectorized (Pushdown) 894 / 943 17.6 56.8 5.4X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 6307 / 6310 2.5 401.0 1.0X +Parquet Vectorized (Pushdown) 6360 / 6397 2.5 404.3 1.0X +Native ORC Vectorized 2912 / 2917 5.4 185.1 2.2X +Native ORC Vectorized (Pushdown) 138 / 141 114.4 8.7 45.9X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 10% timestamp stored as INT96 rows (value < CAST(1572864 AS timestamp)): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 5686 / 5901 2.8 361.5 
1.0X -Parquet Vectorized (Pushdown) 5555 / 5895 2.8 353.2 1.0X -Native ORC Vectorized 4844 / 4957 3.2 308.0 1.2X -Native ORC Vectorized (Pushdown) 2141 / 2230 7.3 136.1 2.7X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 7225 / 7233 2.2 459.4 1.0X +Parquet Vectorized (Pushdown) 7250 / 7255 2.2 461.0 1.0X +Native ORC Vectorized 3772 / 3783 4.2 239.8 1.9X +Native ORC Vectorized (Pushdown) 1277 / 1282 12.3 81.2 5.7X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 50% timestamp stored as INT96 rows (value < CAST(7864320 AS timestamp)): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 9100 / 9421 1.7 578.6 1.0X -Parquet Vectorized (Pushdown) 9122 / 9496 1.7 580.0 1.0X -Native ORC Vectorized 8365 / 8874 1.9 531.9 1.1X -Native ORC Vectorized (Pushdown) 7128 / 7376 2.2 453.2 1.3X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 10952 / 10965 1.4 696.3 1.0X +Parquet Vectorized (Pushdown) 10985 / 10998 1.4 698.4 1.0X +Native ORC Vectorized 7178 / 7227 2.2 456.3 1.5X +Native ORC Vectorized (Pushdown) 5825 / 5830 2.7 370.3 1.9X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 90% timestamp stored as INT96 rows (value < CAST(14155776 AS timestamp)): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 12764 / 13120 1.2 811.5 1.0X -Parquet Vectorized (Pushdown) 12656 / 13003 1.2 804.7 1.0X -Native ORC Vectorized 13096 / 13233 1.2 832.6 1.0X -Native ORC Vectorized (Pushdown) 12710 / 15611 1.2 808.1 1.0X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 14560 / 14583 1.1 925.7 1.0X +Parquet Vectorized (Pushdown) 14608 / 14620 1.1 928.7 1.0X +Native ORC Vectorized 10601 / 10640 1.5 674.0 1.4X +Native ORC Vectorized (Pushdown) 10392 / 10406 1.5 660.7 1.4X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 1 timestamp stored as TIMESTAMP_MICROS row (value = CAST(7864320 AS timestamp)): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 4381 / 4796 3.6 278.5 1.0X -Parquet Vectorized (Pushdown) 122 / 137 129.3 7.7 36.0X -Native ORC Vectorized 3913 / 3988 4.0 248.8 1.1X -Native ORC Vectorized (Pushdown) 905 / 945 17.4 57.6 4.8X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 5653 / 5658 2.8 359.4 1.0X +Parquet Vectorized (Pushdown) 165 / 169 95.1 10.5 34.2X +Native ORC Vectorized 2918 / 2921 5.4 185.5 1.9X +Native ORC Vectorized (Pushdown) 137 / 145 114.9 8.7 41.3X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 10% timestamp stored as TIMESTAMP_MICROS rows (value < CAST(1572864 AS timestamp)): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ 
-Parquet Vectorized 5145 / 5184 3.1 327.1 1.0X -Parquet Vectorized (Pushdown) 1426 / 1519 11.0 90.7 3.6X -Native ORC Vectorized 4827 / 4901 3.3 306.9 1.1X -Native ORC Vectorized (Pushdown) 2133 / 2210 7.4 135.6 2.4X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 6540 / 6552 2.4 415.8 1.0X +Parquet Vectorized (Pushdown) 1610 / 1614 9.8 102.3 4.1X +Native ORC Vectorized 3775 / 3788 4.2 240.0 1.7X +Native ORC Vectorized (Pushdown) 1274 / 1277 12.3 81.0 5.1X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 50% timestamp stored as TIMESTAMP_MICROS rows (value < CAST(7864320 AS timestamp)): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 9234 / 9516 1.7 587.1 1.0X -Parquet Vectorized (Pushdown) 6752 / 7046 2.3 429.3 1.4X -Native ORC Vectorized 8418 / 8998 1.9 535.2 1.1X -Native ORC Vectorized (Pushdown) 7199 / 7314 2.2 457.7 1.3X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 10259 / 10278 1.5 652.3 1.0X +Parquet Vectorized (Pushdown) 7591 / 7601 2.1 482.6 1.4X +Native ORC Vectorized 7185 / 7194 2.2 456.8 1.4X +Native ORC Vectorized (Pushdown) 5828 / 5843 2.7 370.6 1.8X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 90% timestamp stored as TIMESTAMP_MICROS rows (value < CAST(14155776 AS timestamp)): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 12414 / 12458 1.3 789.2 1.0X -Parquet Vectorized (Pushdown) 12094 / 12249 1.3 768.9 1.0X -Native ORC Vectorized 12198 / 13755 1.3 775.5 1.0X -Native ORC Vectorized (Pushdown) 12205 / 12431 1.3 776.0 1.0X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 13850 / 13868 1.1 880.5 1.0X +Parquet Vectorized (Pushdown) 13433 / 13450 1.2 854.0 1.0X +Native ORC Vectorized 10635 / 10669 1.5 676.1 1.3X +Native ORC Vectorized (Pushdown) 10437 / 10448 1.5 663.6 1.3X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 1 timestamp stored as TIMESTAMP_MILLIS row (value = CAST(7864320 AS timestamp)): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 4369 / 4515 3.6 277.8 1.0X -Parquet Vectorized (Pushdown) 116 / 125 136.2 7.3 37.8X -Native ORC Vectorized 3965 / 4703 4.0 252.1 1.1X -Native ORC Vectorized (Pushdown) 892 / 1162 17.6 56.7 4.9X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 5884 / 5888 2.7 374.1 1.0X +Parquet Vectorized (Pushdown) 166 / 170 94.7 10.6 35.4X +Native ORC Vectorized 2913 / 2916 5.4 185.2 2.0X +Native ORC Vectorized (Pushdown) 136 / 144 115.4 8.7 43.2X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 10% timestamp stored as TIMESTAMP_MILLIS rows (value < CAST(1572864 AS timestamp)): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------ -Parquet Vectorized 5211 / 5409 3.0 331.3 1.0X -Parquet Vectorized (Pushdown) 1427 / 1438 11.0 90.7 3.7X -Native ORC Vectorized 4719 / 4883 3.3 300.1 1.1X -Native ORC Vectorized (Pushdown) 2191 / 2228 7.2 139.3 2.4X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 6763 / 6776 2.3 430.0 1.0X +Parquet Vectorized (Pushdown) 1634 / 1638 9.6 103.9 4.1X +Native ORC Vectorized 3777 / 3785 4.2 240.1 1.8X +Native ORC Vectorized (Pushdown) 1276 / 1279 12.3 81.2 5.3X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 50% timestamp stored as TIMESTAMP_MILLIS rows (value < CAST(7864320 AS timestamp)): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 8716 / 8953 1.8 554.2 1.0X -Parquet Vectorized (Pushdown) 6632 / 6968 2.4 421.7 1.3X -Native ORC Vectorized 8376 / 9118 1.9 532.5 1.0X -Native ORC Vectorized (Pushdown) 7218 / 7609 2.2 458.9 1.2X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 -Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz +Parquet Vectorized 10460 / 10469 1.5 665.0 1.0X +Parquet Vectorized (Pushdown) 7689 / 7698 2.0 488.9 1.4X +Native ORC Vectorized 7190 / 7197 2.2 457.1 1.5X +Native ORC Vectorized (Pushdown) 5820 / 5834 2.7 370.0 1.8X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 90% timestamp stored as TIMESTAMP_MILLIS rows (value < CAST(14155776 AS timestamp)): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 12264 / 12452 1.3 779.7 1.0X -Parquet Vectorized (Pushdown) 11766 / 11927 1.3 748.0 1.0X -Native ORC Vectorized 12101 / 12301 1.3 769.3 1.0X -Native ORC Vectorized (Pushdown) 11983 / 12651 1.3 761.9 1.0X +Parquet Vectorized 14033 / 14039 1.1 892.2 1.0X +Parquet Vectorized (Pushdown) 13608 / 13636 1.2 865.2 1.0X +Native ORC Vectorized 10635 / 10686 1.5 676.2 1.3X +Native ORC Vectorized (Pushdown) 10420 / 10442 1.5 662.5 1.3X ================================================================================================ Pushdown benchmark with many filters ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 1.8.0_181-b13 on Mac OS X 10.13.6 -Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz - +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 1 row with 1 filters: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 158 / 182 0.0 158442969.0 1.0X -Parquet Vectorized (Pushdown) 150 / 158 0.0 149718289.0 1.1X -Native ORC Vectorized 141 / 148 0.0 141259852.0 1.1X -Native ORC Vectorized (Pushdown) 142 / 147 0.0 142016472.0 1.1X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_181-b13 on Mac OS X 10.13.6 -Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz +Parquet Vectorized 319 / 323 0.0 318789986.0 1.0X +Parquet Vectorized (Pushdown) 323 / 347 0.0 322755287.0 1.0X +Native ORC Vectorized 316 / 336 0.0 315670745.0 1.0X +Native ORC Vectorized (Pushdown) 317 / 320 0.0 
317392594.0 1.0X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 1 row with 250 filters: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 1013 / 1026 0.0 1013194322.0 1.0X -Parquet Vectorized (Pushdown) 1326 / 1332 0.0 1326301956.0 0.8X -Native ORC Vectorized 1005 / 1010 0.0 1005266379.0 1.0X -Native ORC Vectorized (Pushdown) 1068 / 1071 0.0 1067964993.0 0.9X - -Java HotSpot(TM) 64-Bit Server VM 1.8.0_181-b13 on Mac OS X 10.13.6 -Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz +Parquet Vectorized 2192 / 2218 0.0 2191883823.0 1.0X +Parquet Vectorized (Pushdown) 2675 / 2687 0.0 2675439029.0 0.8X +Native ORC Vectorized 2158 / 2162 0.0 2157646071.0 1.0X +Native ORC Vectorized (Pushdown) 2309 / 2326 0.0 2309096612.0 0.9X +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Select 1 row with 500 filters: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Parquet Vectorized 3598 / 3614 0.0 3598001202.0 1.0X -Parquet Vectorized (Pushdown) 4282 / 4333 0.0 4281849770.0 0.8X -Native ORC Vectorized 3594 / 3619 0.0 3593551548.0 1.0X -Native ORC Vectorized (Pushdown) 3834 / 3840 0.0 3834240570.0 0.9X +Parquet Vectorized 6219 / 6248 0.0 6218727737.0 1.0X +Parquet Vectorized (Pushdown) 7376 / 7436 0.0 7375977710.0 0.8X +Native ORC Vectorized 6252 / 6279 0.0 6252473320.0 1.0X +Native ORC Vectorized (Pushdown) 6858 / 6876 0.0 6857854486.0 0.9X + + diff --git a/sql/core/benchmarks/MiscBenchmark-results.txt b/sql/core/benchmarks/MiscBenchmark-results.txt new file mode 100644 index 000000000000..85acd5789365 --- /dev/null +++ b/sql/core/benchmarks/MiscBenchmark-results.txt @@ -0,0 +1,120 @@ +================================================================================================ +filter & aggregate without group +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +range/filter/sum: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +range/filter/sum wholestage off 47752 / 48952 43.9 22.8 1.0X +range/filter/sum wholestage on 3123 / 3558 671.5 1.5 15.3X + + +================================================================================================ +range/limit/sum +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +range/limit/sum: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +range/limit/sum wholestage off 229 / 236 2288.9 0.4 1.0X +range/limit/sum wholestage on 257 / 267 2041.0 0.5 0.9X + + +================================================================================================ +sample +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +sample with replacement: 
Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +sample with replacement wholestage off 12908 / 13076 10.2 98.5 1.0X +sample with replacement wholestage on 7334 / 7346 17.9 56.0 1.8X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +sample without replacement: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +sample without replacement wholestage off 3082 / 3095 42.5 23.5 1.0X +sample without replacement wholestage on 1125 / 1211 116.5 8.6 2.7X + + +================================================================================================ +collect +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +collect: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +collect 1 million 291 / 311 3.6 277.3 1.0X +collect 2 millions 552 / 564 1.9 526.6 0.5X +collect 4 millions 1104 / 1108 0.9 1053.0 0.3X + + +================================================================================================ +collect limit +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +collect limit: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +collect limit 1 million 311 / 340 3.4 296.2 1.0X +collect limit 2 millions 581 / 614 1.8 554.4 0.5X + + +================================================================================================ +generate explode +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +generate explode array: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +generate explode array wholestage off 15211 / 15368 1.1 906.6 1.0X +generate explode array wholestage on 10761 / 10776 1.6 641.4 1.4X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +generate explode map: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +generate explode map wholestage off 22128 / 22578 0.8 1318.9 1.0X +generate explode map wholestage on 16421 / 16520 1.0 978.8 1.3X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +generate posexplode array: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +generate posexplode array wholestage off 17108 / 18019 1.0 1019.7 1.0X +generate posexplode array wholestage on 11715 / 11804 1.4 698.3 1.5X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 
2.50GHz +generate inline array: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +generate inline array wholestage off 16358 / 16418 1.0 975.0 1.0X +generate inline array wholestage on 11152 / 11472 1.5 664.7 1.5X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +generate big struct array: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +generate big struct array wholestage off 708 / 776 0.1 11803.5 1.0X +generate big struct array wholestage on 535 / 589 0.1 8913.9 1.3X + + +================================================================================================ +generate regular generator +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +generate stack: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +generate stack wholestage off 29082 / 29393 0.6 1733.4 1.0X +generate stack wholestage on 21066 / 21128 0.8 1255.6 1.4X + + diff --git a/sql/core/benchmarks/PrimitiveArrayBenchmark-results.txt b/sql/core/benchmarks/PrimitiveArrayBenchmark-results.txt new file mode 100644 index 000000000000..b06b5c092b61 --- /dev/null +++ b/sql/core/benchmarks/PrimitiveArrayBenchmark-results.txt @@ -0,0 +1,13 @@ +================================================================================================ +Write primitive arrays in dataset +================================================================================================ + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_131-b11 on Mac OS X 10.13.6 +Intel(R) Core(TM) i7-6820HQ CPU @ 2.70GHz + +Write an array in Dataset: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Int 437 / 529 19.2 52.1 1.0X +Double 638 / 670 13.1 76.1 0.7X + + diff --git a/sql/core/benchmarks/SortBenchmark-results.txt b/sql/core/benchmarks/SortBenchmark-results.txt new file mode 100644 index 000000000000..0d00a0c89d02 --- /dev/null +++ b/sql/core/benchmarks/SortBenchmark-results.txt @@ -0,0 +1,17 @@ +================================================================================================ +radix sort +================================================================================================ + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_162-b12 on Mac OS X 10.13.6 +Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz + +radix sort 25000000: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +reference TimSort key prefix array 11770 / 11960 2.1 470.8 1.0X +reference Arrays.sort 2106 / 2128 11.9 84.3 5.6X +radix sort one byte 93 / 100 269.7 3.7 126.9X +radix sort two bytes 171 / 179 146.0 6.9 68.7X +radix sort eight bytes 659 / 664 37.9 26.4 17.9X +radix sort key prefix array 1024 / 1053 24.4 41.0 11.5X + + diff --git a/sql/core/benchmarks/UnsafeArrayDataBenchmark-results.txt b/sql/core/benchmarks/UnsafeArrayDataBenchmark-results.txt new file mode 100644 index 000000000000..4ecc1f1fad4b --- /dev/null +++ 
b/sql/core/benchmarks/UnsafeArrayDataBenchmark-results.txt @@ -0,0 +1,33 @@ +================================================================================================ +Benchmark UnsafeArrayData +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Read UnsafeArrayData: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Int 233 / 234 718.6 1.4 1.0X +Double 244 / 244 687.0 1.5 1.0X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Write UnsafeArrayData: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Int 32 / 33 658.6 1.5 1.0X +Double 73 / 75 287.0 3.5 0.4X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Get primitive array from UnsafeArrayData: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Int 70 / 72 895.0 1.1 1.0X +Double 141 / 143 446.9 2.2 0.5X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Create UnsafeArrayData from primitive array: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Int 72 / 73 874.7 1.1 1.0X +Double 145 / 146 433.7 2.3 0.5X + + diff --git a/sql/core/pom.xml b/sql/core/pom.xml index ba17f5f33f2b..2f72ff6cfdbf 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../../pom.xml diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/BufferedRowIterator.java b/sql/core/src/main/java/org/apache/spark/sql/execution/BufferedRowIterator.java index 74c9c0599271..3d0511b7ba83 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/BufferedRowIterator.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/BufferedRowIterator.java @@ -73,16 +73,6 @@ public void append(InternalRow row) { currentRows.add(row); } - /** - * Returns whether this iterator should stop fetching next row from [[CodegenSupport#inputRDDs]]. - * - * If it returns true, the caller should exit the loop that [[InputAdapter]] generates. - * This interface is mainly used to limit the number of input rows. - */ - public boolean stopEarly() { - return false; - } - /** * Returns whether `processNext()` should stop processing next row from `input` or not. 
* diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java index 6fdadde62855..5e0cf7d370dd 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java @@ -23,7 +23,6 @@ import org.apache.spark.sql.types.*; import org.apache.spark.unsafe.Platform; -import org.apache.spark.unsafe.memory.OffHeapMemoryBlock; import org.apache.spark.unsafe.types.UTF8String; /** @@ -207,7 +206,7 @@ public byte[] getBytes(int rowId, int count) { @Override protected UTF8String getBytesAsUTF8String(int rowId, int count) { - return new UTF8String(new OffHeapMemoryBlock(data + rowId, count)); + return UTF8String.fromAddress(null, data + rowId, count); } // diff --git a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java index 1c9beda40435..5f58b031f6ae 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java @@ -25,7 +25,6 @@ import org.apache.spark.annotation.InterfaceStability; import org.apache.spark.sql.execution.arrow.ArrowUtils; import org.apache.spark.sql.types.*; -import org.apache.spark.unsafe.memory.OffHeapMemoryBlock; import org.apache.spark.unsafe.types.UTF8String; /** @@ -378,10 +377,9 @@ final UTF8String getUTF8String(int rowId) { if (stringResult.isSet == 0) { return null; } else { - return new UTF8String(new OffHeapMemoryBlock( + return UTF8String.fromAddress(null, stringResult.buffer.memoryAddress() + stringResult.start, - stringResult.end - stringResult.start - )); + stringResult.end - stringResult.start); } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index 0cfcc45fb3d3..72694463cedb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -202,7 +202,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { DataSourceOptions.PATHS_KEY -> objectMapper.writeValueAsString(paths.toArray) } Dataset.ofRows(sparkSession, DataSourceV2Relation.create( - ds, extraOptions.toMap ++ sessionOptions + pathsOption, + ds, sessionOptions ++ extraOptions.toMap + pathsOption, userSpecifiedSchema = userSpecifiedSchema)) } else { loadV1Source(paths: _*) @@ -505,7 +505,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { val actualSchema = StructType(schema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord)) - val linesWithoutHeader: RDD[String] = maybeFirstLine.map { firstLine => + val linesWithoutHeader = if (parsedOptions.headerFlag && maybeFirstLine.isDefined) { + val firstLine = maybeFirstLine.get val parser = new CsvParser(parsedOptions.asParserSettings) val columnNames = parser.parseLine(firstLine) CSVDataSource.checkHeaderColumnNames( @@ -515,7 +516,9 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { parsedOptions.enforceSchema, sparkSession.sessionState.conf.caseSensitiveAnalysis) filteredLines.rdd.mapPartitions(CSVUtils.filterHeaderLine(_, firstLine, parsedOptions)) - }.getOrElse(filteredLines.rdd) + } else { + 
filteredLines.rdd + } val parsed = linesWithoutHeader.mapPartitions { iter => val rawParser = new UnivocityParser(actualSchema, parsedOptions) @@ -571,6 +574,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { * whitespaces from values being read should be skipped. *
`nullValue` (default empty string): sets the string representation of a null value. Since
 * 2.0.1, this applies to all supported types including the string type.
+ * `emptyValue` (default empty string): sets the string representation of an empty value.
 * `nanValue` (default `NaN`): sets the string representation of a non-number value.
 * `positiveInf` (default `Inf`): sets the string representation of a positive infinity
 * value.
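As a usage illustration of the `emptyValue` CSV option documented in the hunk above (a matching entry is added for the writer in a later hunk), here is a minimal sketch; the active `SparkSession` value `spark` and the file paths are assumptions made only for this example:

{{{
// Minimal sketch of the CSV `emptyValue` option described above.
// `spark` (an active SparkSession) and the paths are assumed for illustration only.
val df = spark.read
  .option("header", "true")
  .option("emptyValue", "")       // string used to represent an empty value while reading
  .csv("/tmp/input.csv")

df.write
  .mode("overwrite")
  .option("header", "true")
  .option("emptyValue", "\"\"")   // string written out in place of empty string values
  .csv("/tmp/output")
}}}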
  • diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala index a41753098966..7c12432d33c3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala @@ -370,19 +370,66 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * @since 1.5.0 */ def sampleBy[T](col: String, fractions: Map[T, Double], seed: Long): DataFrame = { + sampleBy(Column(col), fractions, seed) + } + + /** + * Returns a stratified sample without replacement based on the fraction given on each stratum. + * @param col column that defines strata + * @param fractions sampling fraction for each stratum. If a stratum is not specified, we treat + * its fraction as zero. + * @param seed random seed + * @tparam T stratum type + * @return a new `DataFrame` that represents the stratified sample + * + * @since 1.5.0 + */ + def sampleBy[T](col: String, fractions: ju.Map[T, jl.Double], seed: Long): DataFrame = { + sampleBy(col, fractions.asScala.toMap.asInstanceOf[Map[T, Double]], seed) + } + + /** + * Returns a stratified sample without replacement based on the fraction given on each stratum. + * @param col column that defines strata + * @param fractions sampling fraction for each stratum. If a stratum is not specified, we treat + * its fraction as zero. + * @param seed random seed + * @tparam T stratum type + * @return a new `DataFrame` that represents the stratified sample + * + * The stratified sample can be performed over multiple columns: + * {{{ + * import org.apache.spark.sql.Row + * import org.apache.spark.sql.functions.struct + * + * val df = spark.createDataFrame(Seq(("Bob", 17), ("Alice", 10), ("Nico", 8), ("Bob", 17), + * ("Alice", 10))).toDF("name", "age") + * val fractions = Map(Row("Alice", 10) -> 0.3, Row("Nico", 8) -> 1.0) + * df.stat.sampleBy(struct($"name", $"age"), fractions, 36L).show() + * +-----+---+ + * | name|age| + * +-----+---+ + * | Nico| 8| + * |Alice| 10| + * +-----+---+ + * }}} + * + * @since 3.0.0 + */ + def sampleBy[T](col: Column, fractions: Map[T, Double], seed: Long): DataFrame = { require(fractions.values.forall(p => p >= 0.0 && p <= 1.0), s"Fractions must be in [0, 1], but got $fractions.") import org.apache.spark.sql.functions.{rand, udf} - val c = Column(col) val r = rand(seed) val f = udf { (stratum: Any, x: Double) => x < fractions.getOrElse(stratum.asInstanceOf[T], 0.0) } - df.filter(f(c, r)) + df.filter(f(col, r)) } /** - * Returns a stratified sample without replacement based on the fraction given on each stratum. + * (Java-specific) Returns a stratified sample without replacement based on the fraction given + * on each stratum. * @param col column that defines strata * @param fractions sampling fraction for each stratum. If a stratum is not specified, we treat * its fraction as zero. 
@@ -390,9 +437,9 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * @tparam T stratum type * @return a new `DataFrame` that represents the stratified sample * - * @since 1.5.0 + * @since 3.0.0 */ - def sampleBy[T](col: String, fractions: ju.Map[T, jl.Double], seed: Long): DataFrame = { + def sampleBy[T](col: Column, fractions: ju.Map[T, jl.Double], seed: Long): DataFrame = { sampleBy(col, fractions.asScala.toMap.asInstanceOf[Map[T, Double]], seed) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index eca2d5b97190..fd1c13204802 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -241,26 +241,21 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { val source = cls.newInstance().asInstanceOf[DataSourceV2] source match { case provider: BatchWriteSupportProvider => - val options = extraOptions ++ - DataSourceV2Utils.extractSessionConfigs(source, df.sparkSession.sessionState.conf) - - val relation = DataSourceV2Relation.create(source, options.toMap) - if (mode == SaveMode.Append) { + val sessionOptions = DataSourceV2Utils.extractSessionConfigs( + source, + df.sparkSession.sessionState.conf) + val options = sessionOptions ++ extraOptions + + // TODO: SPARK-24251 was reverted because it creates a readsupport at write path. + val writer = provider.createBatchWriteSupport( + UUID.randomUUID().toString, + df.logicalPlan.output.toStructType, + mode, + new DataSourceOptions(options.asJava)) + + if (writer.isPresent) { runCommand(df.sparkSession, "save") { - AppendData.byName(relation, df.logicalPlan) - } - - } else { - val writer = provider.createBatchWriteSupport( - UUID.randomUUID().toString, - df.logicalPlan.output.toStructType, - mode, - new DataSourceOptions(options.asJava)) - - if (writer.isPresent) { - runCommand(df.sparkSession, "save") { - WriteToDataSourceV2(writer.get, df.logicalPlan) - } + WriteToDataSourceV2(writer.get, df.logicalPlan) } } @@ -635,6 +630,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { * enclosed in quotes. Default is to only escape values containing a quote character. *
`header` (default `false`): writes the names of columns as the first line.
 * `nullValue` (default empty string): sets the string representation of a null value.
+ * `emptyValue` (default `""`): sets the string representation of an empty value.
 * `encoding` (by default it is not set): specifies encoding (charset) of saved csv
 * files. If it is not set, the UTF-8 charset will be used.
  • `compression` (default `null`): compression codec to use when saving to file. This can be diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index db439b1ee76f..fa14aa14ee96 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -306,16 +306,16 @@ class Dataset[T] private[sql]( // Compute the width of each column for (row <- rows) { for ((cell, i) <- row.zipWithIndex) { - colWidths(i) = math.max(colWidths(i), cell.length) + colWidths(i) = math.max(colWidths(i), Utils.stringHalfWidth(cell)) } } val paddedRows = rows.map { row => row.zipWithIndex.map { case (cell, i) => if (truncate > 0) { - StringUtils.leftPad(cell, colWidths(i)) + StringUtils.leftPad(cell, colWidths(i) - Utils.stringHalfWidth(cell) + cell.length) } else { - StringUtils.rightPad(cell, colWidths(i)) + StringUtils.rightPad(cell, colWidths(i) - Utils.stringHalfWidth(cell) + cell.length) } } } @@ -337,12 +337,10 @@ class Dataset[T] private[sql]( // Compute the width of field name and data columns val fieldNameColWidth = fieldNames.foldLeft(minimumColWidth) { case (curMax, fieldName) => - math.max(curMax, fieldName.length) + math.max(curMax, Utils.stringHalfWidth(fieldName)) } val dataColWidth = dataRows.foldLeft(minimumColWidth) { case (curMax, row) => - math.max(curMax, row.map(_.length).reduceLeftOption[Int] { case (cellMax, cell) => - math.max(cellMax, cell) - }.getOrElse(0)) + math.max(curMax, row.map(cell => Utils.stringHalfWidth(cell)).max) } dataRows.zipWithIndex.foreach { case (row, i) => @@ -351,8 +349,10 @@ class Dataset[T] private[sql]( s"-RECORD $i", fieldNameColWidth + dataColWidth + 5, "-") sb.append(rowHeader).append("\n") row.zipWithIndex.map { case (cell, j) => - val fieldName = StringUtils.rightPad(fieldNames(j), fieldNameColWidth) - val data = StringUtils.rightPad(cell, dataColWidth) + val fieldName = StringUtils.rightPad(fieldNames(j), + fieldNameColWidth - Utils.stringHalfWidth(fieldNames(j)) + fieldNames(j).length) + val data = StringUtils.rightPad(cell, + dataColWidth - Utils.stringHalfWidth(cell) + cell.length) s" $fieldName | $data " }.addString(sb, "", "\n", "\n") } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala index d700fb83b9b7..d4e75b5ebd40 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala @@ -330,6 +330,15 @@ class RelationalGroupedDataset protected[sql]( * df.groupBy("year").pivot("course").sum("earnings") * }}} * + * From Spark 3.0.0, values can be literal columns, for instance, struct. For pivoting by + * multiple columns, use the `struct` function to combine the columns and values: + * + * {{{ + * df.groupBy("year") + * .pivot("trainingCourse", Seq(struct(lit("java"), lit("Experts")))) + * .agg(sum($"earnings")) + * }}} + * * @param pivotColumn Name of the column to pivot. * @param values List of values that will be translated to columns in the output DataFrame. 
* @since 1.6.0 @@ -413,10 +422,14 @@ class RelationalGroupedDataset protected[sql]( def pivot(pivotColumn: Column, values: Seq[Any]): RelationalGroupedDataset = { groupType match { case RelationalGroupedDataset.GroupByType => + val valueExprs = values.map(_ match { + case c: Column => c.expr + case v => Literal.apply(v) + }) new RelationalGroupedDataset( df, groupingExprs, - RelationalGroupedDataset.PivotType(pivotColumn.expr, values.map(Literal.apply))) + RelationalGroupedDataset.PivotType(pivotColumn.expr, valueExprs)) case _: RelationalGroupedDataset.PivotType => throw new UnsupportedOperationException("repeated pivots are not supported") case _ => @@ -561,5 +574,5 @@ private[sql] object RelationalGroupedDataset { /** * To indicate it's the PIVOT */ - private[sql] case class PivotType(pivotCol: Expression, values: Seq[Literal]) extends GroupType + private[sql] case class PivotType(pivotCol: Expression, values: Seq[Expression]) extends GroupType } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index af6018472cb0..dfb12f272eb2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -1098,16 +1098,29 @@ object SQLContext { data: Iterator[_], beanClass: Class[_], attrs: Seq[AttributeReference]): Iterator[InternalRow] = { - val extractors = - JavaTypeInference.getJavaBeanReadableProperties(beanClass).map(_.getReadMethod) - val methodsToConverts = extractors.zip(attrs).map { case (e, attr) => - (e, CatalystTypeConverters.createToCatalystConverter(attr.dataType)) + def createStructConverter(cls: Class[_], fieldTypes: Seq[DataType]): Any => InternalRow = { + val methodConverters = + JavaTypeInference.getJavaBeanReadableProperties(cls).zip(fieldTypes) + .map { case (property, fieldType) => + val method = property.getReadMethod + method -> createConverter(method.getReturnType, fieldType) + } + value => + if (value == null) { + null + } else { + new GenericInternalRow( + methodConverters.map { case (method, converter) => + converter(method.invoke(value)) + }) + } } - data.map { element => - new GenericInternalRow( - methodsToConverts.map { case (e, convert) => convert(e.invoke(element)) } - ): InternalRow + def createConverter(cls: Class[_], dataType: DataType): Any => Any = dataType match { + case struct: StructType => createStructConverter(cls, struct.map(_.dataType)) + case _ => CatalystTypeConverters.createToCatalystConverter(dataType) } + val dataConverter = createStructConverter(beanClass, attrs.map(_.dataType)) + data.map(dataConverter) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala index c0830e77b5a8..482e2bfeb709 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala @@ -17,6 +17,12 @@ package org.apache.spark.sql.api.python +import java.io.InputStream +import java.nio.channels.Channels + +import org.apache.spark.api.java.JavaRDD +import org.apache.spark.api.python.PythonRDDServer +import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, SQLContext} import org.apache.spark.sql.catalyst.analysis.FunctionRegistry import org.apache.spark.sql.catalyst.expressions.ExpressionInfo @@ -33,19 +39,36 @@ private[sql] object PythonSQLUtils { } /** - * Python 
callable function to read a file in Arrow stream format and create a [[DataFrame]] + * Python callable function to read a file in Arrow stream format and create a [[RDD]] * using each serialized ArrowRecordBatch as a partition. - * - * @param sqlContext The active [[SQLContext]]. - * @param filename File to read the Arrow stream from. - * @param schemaString JSON Formatted Spark schema for Arrow batches. - * @return A new [[DataFrame]]. */ - def arrowReadStreamFromFile( - sqlContext: SQLContext, - filename: String, - schemaString: String): DataFrame = { - val jrdd = ArrowConverters.readArrowStreamFromFile(sqlContext, filename) - ArrowConverters.toDataFrame(jrdd, schemaString, sqlContext) + def readArrowStreamFromFile(sqlContext: SQLContext, filename: String): JavaRDD[Array[Byte]] = { + ArrowConverters.readArrowStreamFromFile(sqlContext, filename) + } + + /** + * Python callable function to read a file in Arrow stream format and create a [[DataFrame]] + * from an RDD. + */ + def toDataFrame( + arrowBatchRDD: JavaRDD[Array[Byte]], + schemaString: String, + sqlContext: SQLContext): DataFrame = { + ArrowConverters.toDataFrame(arrowBatchRDD, schemaString, sqlContext) } } + +/** + * Helper for making a dataframe from arrow data from data sent from python over a socket. This is + * used when encryption is enabled, and we don't want to write data to a file. + */ +private[sql] class ArrowRDDServer(sqlContext: SQLContext) extends PythonRDDServer { + + override protected def streamToRDD(input: InputStream): RDD[Array[Byte]] = { + // Create array to consume iterator so that we can safely close the inputStream + val batches = ArrowConverters.getBatchesFromStream(Channels.newChannel(input)).toArray + // Parallelize the record batches to create an RDD + JavaRDD.fromRDD(sqlContext.sparkContext.parallelize(batches, batches.length)) + } + +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala index 48abad907865..9f6b59336080 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala @@ -136,7 +136,7 @@ private[sql] trait ColumnarBatchScan extends CodegenSupport { |if ($batch == null) { | $nextBatchFuncName(); |} - |while ($batch != null) { + |while ($limitNotReachedCond $batch != null) { | int $numRows = $batch.numRows(); | int $localEnd = $numRows - $idx; | for (int $localIdx = 0; $localIdx < $localEnd; $localIdx++) { @@ -166,7 +166,7 @@ private[sql] trait ColumnarBatchScan extends CodegenSupport { } val inputRow = if (needsUnsafeRowConversion) null else row s""" - |while ($input.hasNext()) { + |while ($limitNotReachedCond $input.hasNext()) { | InternalRow $row = (InternalRow) $input.next(); | $numOutputRows.add(1); | ${consume(ctx, outputVars, inputRow).trim} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala index 36ed016773b6..738c0666bc3f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala @@ -284,6 +284,7 @@ case class FileSourceScanExec( "Batched" -> supportsBatch.toString, "PartitionFilters" -> seqToString(partitionFilters), "PushedFilters" -> seqToString(pushedDownFilters), + "DataFilters" -> seqToString(dataFilters), 
"Location" -> locationDesc) val withOptPartitionCount = relation.partitionSchemaOption.map { _ => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SortExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SortExec.scala index 0dc16ba5ce28..f1470e45f129 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SortExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SortExec.scala @@ -39,7 +39,7 @@ case class SortExec( global: Boolean, child: SparkPlan, testSpillFrequency: Int = 0) - extends UnaryExecNode with CodegenSupport { + extends UnaryExecNode with BlockingOperatorWithCodegen { override def output: Seq[Attribute] = child.output @@ -124,14 +124,6 @@ case class SortExec( // Name of sorter variable used in codegen. private var sorterVariable: String = _ - // The result rows come from the sort buffer, so this operator doesn't need to copy its result - // even if its child does. - override def needCopyResult: Boolean = false - - // Sort operator always consumes all the input rows before outputting any result, so we don't need - // a stop check before sorting. - override def needStopCheck: Boolean = false - override protected def doProduce(ctx: CodegenContext): String = { val needToSort = ctx.addMutableState(CodeGenerator.JAVA_BOOLEAN, "needToSort", v => s"$v = true;") @@ -172,7 +164,7 @@ case class SortExec( | $needToSort = false; | } | - | while ($sortedIterator.hasNext()) { + | while ($limitNotReachedCond $sortedIterator.hasNext()) { | UnsafeRow $outputRow = (UnsafeRow)$sortedIterator.next(); | ${consume(ctx, null, outputRow)} | if (shouldStop()) return; diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala index 1f97993e2045..9d9b020309d9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala @@ -250,7 +250,9 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ val codec = CompressionCodec.createCodec(SparkEnv.get.conf) val bos = new ByteArrayOutputStream() val out = new DataOutputStream(codec.compressedOutputStream(bos)) - while (iter.hasNext && (n < 0 || count < n)) { + // `iter.hasNext` may produce one row and buffer it, we should only call it when the limit is + // not hit. 
+ while ((n < 0 || count < n) && iter.hasNext) { val row = iter.next().asInstanceOf[UnsafeRow] out.writeInt(row.getSizeInBytes) row.writeToStream(out, buffer) @@ -380,7 +382,7 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ inputSchema: Seq[Attribute], useSubexprElimination: Boolean = false): MutableProjection = { log.debug(s"Creating MutableProj: $expressions, inputSchema: $inputSchema") - GenerateMutableProjection.generate(expressions, inputSchema, useSubexprElimination) + MutableProjection.create(expressions, inputSchema) } private def genInterpretedPredicate( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanInfo.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanInfo.scala index 2a2315896831..59ffd1638111 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanInfo.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanInfo.scala @@ -17,8 +17,6 @@ package org.apache.spark.sql.execution -import com.fasterxml.jackson.annotation.JsonIgnoreProperties - import org.apache.spark.annotation.DeveloperApi import org.apache.spark.sql.execution.exchange.ReusedExchangeExec import org.apache.spark.sql.execution.metric.SQLMetricInfo @@ -28,11 +26,11 @@ import org.apache.spark.sql.execution.metric.SQLMetricInfo * Stores information about a SQL SparkPlan. */ @DeveloperApi -@JsonIgnoreProperties(Array("metadata")) // The metadata field was removed in Spark 2.3. class SparkPlanInfo( val nodeName: String, val simpleString: String, val children: Seq[SparkPlanInfo], + val metadata: Map[String, String], val metrics: Seq[SQLMetricInfo]) { override def hashCode(): Int = { @@ -59,6 +57,12 @@ private[execution] object SparkPlanInfo { new SQLMetricInfo(metric.name.getOrElse(key), metric.id, metric.metricType) } - new SparkPlanInfo(plan.nodeName, plan.simpleString, children.map(fromSparkPlan), metrics) + // dump the file scan metadata (e.g file path) to event log + val metadata = plan match { + case fileScan: FileSourceScanExec => fileScan.metadata + case _ => Map[String, String]() + } + new SparkPlanInfo(plan.nodeName, plan.simpleString, children.map(fromSparkPlan), + metadata, metrics) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 89cb63784c0f..4ed14d3e077f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -102,15 +102,29 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { * {{{ * ANALYZE TABLE [db_name.]tablename COMPUTE STATISTICS FOR COLUMNS column1, column2; * }}} + * + * Example SQL for analyzing all columns of a table: + * {{{ + * ANALYZE TABLE [db_name.]tablename COMPUTE STATISTICS FOR ALL COLUMNS; + * }}} */ override def visitAnalyze(ctx: AnalyzeContext): LogicalPlan = withOrigin(ctx) { + def checkPartitionSpec(): Unit = { + if (ctx.partitionSpec != null) { + logWarning("Partition specification is ignored when collecting column statistics: " + + ctx.partitionSpec.getText) + } + } if (ctx.identifier != null && ctx.identifier.getText.toLowerCase(Locale.ROOT) != "noscan") { throw new ParseException(s"Expected `NOSCAN` instead of `${ctx.identifier.getText}`", ctx) } val table = visitTableIdentifier(ctx.tableIdentifier) - if (ctx.identifierSeq() == null) { + if (ctx.ALL() != null) { + checkPartitionSpec() + 
AnalyzeColumnCommand(table, None, allColumns = true) + } else if (ctx.identifierSeq() == null) { if (ctx.partitionSpec != null) { AnalyzePartitionCommand(table, visitPartitionSpec(ctx.partitionSpec), noscan = ctx.identifier != null) @@ -118,13 +132,9 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { AnalyzeTableCommand(table, noscan = ctx.identifier != null) } } else { - if (ctx.partitionSpec != null) { - logWarning("Partition specification is ignored when collecting column statistics: " + - ctx.partitionSpec.getText) - } - AnalyzeColumnCommand( - table, - visitIdentifierSeq(ctx.identifierSeq())) + checkPartitionSpec() + AnalyzeColumnCommand(table, + Option(visitIdentifierSeq(ctx.identifierSeq())), allColumns = false) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala index 1fc4de9e5601..f5aee627fe90 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala @@ -345,6 +345,61 @@ trait CodegenSupport extends SparkPlan { * don't require shouldStop() in the loop of producing rows. */ def needStopCheck: Boolean = parent.needStopCheck + + /** + * A sequence of checks which evaluate to true if the downstream Limit operators have not received + * enough records and reached the limit. If current node is a data producing node, it can leverage + * this information to stop producing data and complete the data flow earlier. Common data + * producing nodes are leaf nodes like Range and Scan, and blocking nodes like Sort and Aggregate. + * These checks should be put into the loop condition of the data producing loop. + */ + def limitNotReachedChecks: Seq[String] = parent.limitNotReachedChecks + + /** + * A helper method to generate the data producing loop condition according to the + * limit-not-reached checks. + */ + final def limitNotReachedCond: String = { + // InputAdapter is also a leaf node. + val isLeafNode = children.isEmpty || this.isInstanceOf[InputAdapter] + if (!isLeafNode && !this.isInstanceOf[BlockingOperatorWithCodegen]) { + val errMsg = "Only leaf nodes and blocking nodes need to call 'limitNotReachedCond' " + + "in its data producing loop." + if (Utils.isTesting) { + throw new IllegalStateException(errMsg) + } else { + logWarning(s"[BUG] $errMsg Please open a JIRA ticket to report it.") + } + } + if (parent.limitNotReachedChecks.isEmpty) { + "" + } else { + parent.limitNotReachedChecks.mkString("", " && ", " &&") + } + } +} + +/** + * A special kind of operators which support whole stage codegen. Blocking means these operators + * will consume all the inputs first, before producing output. Typical blocking operators are + * sort and aggregate. + */ +trait BlockingOperatorWithCodegen extends CodegenSupport { + + // Blocking operators usually have some kind of buffer to keep the data before producing them, so + // then don't to copy its result even if its child does. + override def needCopyResult: Boolean = false + + // Blocking operators always consume all the input first, so its upstream operators don't need a + // stop check. + override def needStopCheck: Boolean = false + + // Blocking operators need to consume all the inputs before producing any output. 
This means, + // Limit operator after this blocking operator will never reach its limit during the execution of + // this blocking operator's upstream operators. Here we override this method to return Nil, so + // that upstream operators will not generate useless conditions (which are always evaluated to + // false) for the Limit operators after this blocking operator. + override def limitNotReachedChecks: Seq[String] = Nil } @@ -381,7 +436,7 @@ case class InputAdapter(child: SparkPlan) extends UnaryExecNode with CodegenSupp forceInline = true) val row = ctx.freshName("row") s""" - | while ($input.hasNext() && !stopEarly()) { + | while ($limitNotReachedCond $input.hasNext()) { | InternalRow $row = (InternalRow) $input.next(); | ${consume(ctx, null, row).trim} | if (shouldStop()) return; @@ -677,6 +732,8 @@ case class WholeStageCodegenExec(child: SparkPlan)(val codegenStageId: Int) override def needStopCheck: Boolean = true + override def limitNotReachedChecks: Seq[String] = Nil + override protected def otherCopyArgs: Seq[AnyRef] = Seq(codegenStageId.asInstanceOf[Integer]) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala index 98adba50b297..6155ec9d30db 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala @@ -45,7 +45,7 @@ case class HashAggregateExec( initialInputBufferOffset: Int, resultExpressions: Seq[NamedExpression], child: SparkPlan) - extends UnaryExecNode with CodegenSupport { + extends UnaryExecNode with BlockingOperatorWithCodegen { private[this] val aggregateBufferAttributes = { aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes) @@ -151,14 +151,6 @@ case class HashAggregateExec( child.asInstanceOf[CodegenSupport].inputRDDs() } - // The result rows come from the aggregate buffer, or a single row(no grouping keys), so this - // operator doesn't need to copy its result even if its child does. - override def needCopyResult: Boolean = false - - // Aggregate operator always consumes all the input rows before outputting any result, so we - // don't need a stop check before aggregating. 
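// Aside (not part of the patch): a plain-Scala sketch of how `limitNotReachedChecks` feed the
// generated data-producing loop condition described above. Each downstream limit contributes a
// check such as "count_1 < 10"; a leaf producer ANDs the checks in front of its own condition,
// while a blocking operator (sort/aggregate) clears the list for its upstream. All names are
// illustrative; this is not the CodegenContext API.
object LimitCheckCompositionSketch {
  def limitNotReachedCond(parentChecks: Seq[String]): String =
    if (parentChecks.isEmpty) "" else parentChecks.mkString("", " && ", " && ")

  def main(args: Array[String]): Unit = {
    val checks = Seq("count_1 < 10", "count_2 < 100")
    // A scan-like leaf keeps producing only while no downstream limit has been reached.
    println(s"while (${limitNotReachedCond(checks)}input.hasNext()) { ... }")
    // Below a blocking operator the checks are Nil, so the loop condition is unchanged.
    println(s"while (${limitNotReachedCond(Nil)}input.hasNext()) { ... }")
  }
}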
- override def needStopCheck: Boolean = false - protected override def doProduce(ctx: CodegenContext): String = { if (groupingExpressions.isEmpty) { doProduceWithoutKeys(ctx) @@ -705,13 +697,16 @@ case class HashAggregateExec( def outputFromRegularHashMap: String = { s""" - |while ($iterTerm.next()) { + |while ($limitNotReachedCond $iterTerm.next()) { | UnsafeRow $keyTerm = (UnsafeRow) $iterTerm.getKey(); | UnsafeRow $bufferTerm = (UnsafeRow) $iterTerm.getValue(); | $outputFunc($keyTerm, $bufferTerm); - | | if (shouldStop()) return; |} + |$iterTerm.close(); + |if ($sorterTerm == null) { + | $hashMapTerm.free(); + |} """.stripMargin } @@ -728,11 +723,6 @@ case class HashAggregateExec( // output the result $outputFromFastHashMap $outputFromRegularHashMap - - $iterTerm.close(); - if ($sorterTerm == null) { - $hashMapTerm.free(); - } """ } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/RowBasedHashMapGenerator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/RowBasedHashMapGenerator.scala index 3d2443ca959a..56cf78d8b7fc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/RowBasedHashMapGenerator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/RowBasedHashMapGenerator.scala @@ -48,6 +48,12 @@ class RowBasedHashMapGenerator( val keySchema = ctx.addReferenceObj("keySchemaTerm", groupingKeySchema) val valueSchema = ctx.addReferenceObj("valueSchemaTerm", bufferSchema) + val numVarLenFields = groupingKeys.map(_.dataType).count { + case dt if UnsafeRow.isFixedLength(dt) => false + // TODO: consider large decimal and interval type + case _ => true + } + s""" | private org.apache.spark.sql.catalyst.expressions.RowBasedKeyValueBatch batch; | private int[] buckets; @@ -60,6 +66,7 @@ class RowBasedHashMapGenerator( | private long emptyVOff; | private int emptyVLen; | private boolean isBatchFull = false; + | private org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter agg_rowWriter; | | | public $generatedClassName( @@ -75,6 +82,9 @@ class RowBasedHashMapGenerator( | emptyVOff = Platform.BYTE_ARRAY_OFFSET; | emptyVLen = emptyBuffer.length; | + | agg_rowWriter = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter( + | ${groupingKeySchema.length}, ${numVarLenFields * 32}); + | | buckets = new int[numBuckets]; | java.util.Arrays.fill(buckets, -1); | } @@ -112,12 +122,6 @@ class RowBasedHashMapGenerator( * */ protected def generateFindOrInsert(): String = { - val numVarLenFields = groupingKeys.map(_.dataType).count { - case dt if UnsafeRow.isFixedLength(dt) => false - // TODO: consider large decimal and interval type - case _ => true - } - val createUnsafeRowForKey = groupingKeys.zipWithIndex.map { case (key: Buffer, ordinal: Int) => key.dataType match { case t: DecimalType => @@ -130,6 +134,12 @@ class RowBasedHashMapGenerator( } }.mkString(";\n") + val resetNullBits = if (groupingKeySchema.map(_.nullable).forall(_ == false)) { + "" + } else { + "agg_rowWriter.zeroOutNullBytes();" + } + s""" |public org.apache.spark.sql.catalyst.expressions.UnsafeRow findOrInsert(${ groupingKeySignature}) { @@ -140,12 +150,8 @@ class RowBasedHashMapGenerator( | // Return bucket index if it's either an empty slot or already contains the key | if (buckets[idx] == -1) { | if (numRows < capacity && !isBatchFull) { - | // creating the unsafe for new entry - | org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter agg_rowWriter - | = new 
org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter( - | ${groupingKeySchema.length}, ${numVarLenFields * 32}); - | agg_rowWriter.reset(); //TODO: investigate if reset or zeroout are actually needed - | agg_rowWriter.zeroOutNullBytes(); + | agg_rowWriter.reset(); + | $resetNullBits | ${createUnsafeRowForKey}; | org.apache.spark.sql.catalyst.expressions.UnsafeRow agg_result | = agg_rowWriter.getRow(); diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala index 72aa4adff4e6..100486fa9850 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala @@ -365,7 +365,7 @@ case class ScalaUDAF( val inputAttributes = childrenSchema.toAttributes log.debug( s"Creating MutableProj: $children, inputSchema: $inputAttributes.") - GenerateMutableProjection.generate(children, inputAttributes) + MutableProjection.create(children, inputAttributes) } private[this] lazy val inputToScalaConverters: Any => Any = diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala index 1a48bc8398a6..2bf6a58b5565 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.execution.arrow import java.io.{ByteArrayInputStream, ByteArrayOutputStream, FileInputStream, OutputStream} -import java.nio.channels.{Channels, SeekableByteChannel} +import java.nio.channels.{Channels, ReadableByteChannel} import scala.collection.JavaConverters._ @@ -31,6 +31,7 @@ import org.apache.arrow.vector.ipc.message.{ArrowRecordBatch, MessageSerializer} import org.apache.spark.TaskContext import org.apache.spark.api.java.JavaRDD import org.apache.spark.network.util.JavaUtils +import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, SQLContext} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.types._ @@ -189,7 +190,7 @@ private[sql] object ArrowConverters { } /** - * Create a DataFrame from a JavaRDD of serialized ArrowRecordBatches. + * Create a DataFrame from an RDD of serialized ArrowRecordBatches. */ private[sql] def toDataFrame( arrowBatchRDD: JavaRDD[Array[Byte]], @@ -221,7 +222,7 @@ private[sql] object ArrowConverters { /** * Read an Arrow stream input and return an iterator of serialized ArrowRecordBatches. 
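// Aside (not part of the patch): the change above drops the seek requirement, so unwanted
// message bodies are skipped through an InputStream view of the channel instead of calling
// position(). A minimal sketch with plain java.nio; `skipFully` is a hypothetical helper that
// guards against InputStream.skip skipping fewer bytes than requested.
import java.io.{ByteArrayInputStream, InputStream}
import java.nio.channels.{Channels, ReadableByteChannel}

object SkipOnReadableChannelSketch {
  def skipFully(in: InputStream, n: Long): Unit = {
    var remaining = n
    while (remaining > 0) {
      val skipped = in.skip(remaining)
      if (skipped > 0) {
        remaining -= skipped
      } else if (in.read() == -1) {
        return // reached EOF before skipping everything
      } else {
        remaining -= 1
      }
    }
  }

  def main(args: Array[String]): Unit = {
    val channel: ReadableByteChannel =
      Channels.newChannel(new ByteArrayInputStream(Array.fill[Byte](64)(1.toByte)))
    val stream = Channels.newInputStream(channel)
    skipFully(stream, 16)   // skip a 16-byte "message body" without seeking
    println(stream.read())  // the next byte after the skipped body
  }
}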
*/ - private[sql] def getBatchesFromStream(in: SeekableByteChannel): Iterator[Array[Byte]] = { + private[sql] def getBatchesFromStream(in: ReadableByteChannel): Iterator[Array[Byte]] = { // Iterate over the serialized Arrow RecordBatch messages from a stream new Iterator[Array[Byte]] { @@ -271,7 +272,7 @@ private[sql] object ArrowConverters { } else { if (bodyLength > 0) { // Skip message body if not a RecordBatch - in.position(in.position() + bodyLength) + Channels.newInputStream(in).skip(bodyLength) } // Proceed to next message diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowUtils.scala index 533097ac399e..b1e8fb39ac9d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowUtils.scala @@ -131,11 +131,8 @@ object ArrowUtils { } else { Nil } - val pandasColsByPosition = if (conf.pandasGroupedMapAssignColumnssByPosition) { - Seq(SQLConf.PANDAS_GROUPED_MAP_ASSIGN_COLUMNS_BY_POSITION.key -> "true") - } else { - Nil - } - Map(timeZoneConf ++ pandasColsByPosition: _*) + val pandasColsByName = Seq(SQLConf.PANDAS_GROUPED_MAP_ASSIGN_COLUMNS_BY_NAME.key -> + conf.pandasGroupedMapAssignColumnsByName.toString) + Map(timeZoneConf ++ pandasColsByName: _*) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala index 9434ceb7cd16..4cd2e788ade0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala @@ -68,8 +68,7 @@ case class ProjectExec(projectList: Seq[NamedExpression], child: SparkPlan) protected override def doExecute(): RDD[InternalRow] = { child.execute().mapPartitionsWithIndexInternal { (index, iter) => - val project = UnsafeProjection.create(projectList, child.output, - subexpressionEliminationEnabled) + val project = UnsafeProjection.create(projectList, child.output) project.initialize(index) iter.map(project) } @@ -379,7 +378,7 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range) val numOutput = metricTerm(ctx, "numOutputRows") val initTerm = ctx.addMutableState(CodeGenerator.JAVA_BOOLEAN, "initRange") - val number = ctx.addMutableState(CodeGenerator.JAVA_LONG, "number") + val nextIndex = ctx.addMutableState(CodeGenerator.JAVA_LONG, "nextIndex") val value = ctx.freshName("value") val ev = ExprCode.forNonNullValue(JavaCode.variable(value, LongType)) @@ -398,7 +397,7 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range) // within a batch, while the code in the outer loop is setting batch parameters and updating // the metrics. - // Once number == batchEnd, it's time to progress to the next batch. + // Once nextIndex == batchEnd, it's time to progress to the next batch. val batchEnd = ctx.addMutableState(CodeGenerator.JAVA_LONG, "batchEnd") // How many values should still be generated by this range operator. 
@@ -422,13 +421,13 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range) | | $BigInt st = index.multiply(numElement).divide(numSlice).multiply(step).add(start); | if (st.compareTo($BigInt.valueOf(Long.MAX_VALUE)) > 0) { - | $number = Long.MAX_VALUE; + | $nextIndex = Long.MAX_VALUE; | } else if (st.compareTo($BigInt.valueOf(Long.MIN_VALUE)) < 0) { - | $number = Long.MIN_VALUE; + | $nextIndex = Long.MIN_VALUE; | } else { - | $number = st.longValue(); + | $nextIndex = st.longValue(); | } - | $batchEnd = $number; + | $batchEnd = $nextIndex; | | $BigInt end = index.add($BigInt.ONE).multiply(numElement).divide(numSlice) | .multiply(step).add(start); @@ -441,7 +440,7 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range) | } | | $BigInt startToEnd = $BigInt.valueOf(partitionEnd).subtract( - | $BigInt.valueOf($number)); + | $BigInt.valueOf($nextIndex)); | $numElementsTodo = startToEnd.divide(step).longValue(); | if ($numElementsTodo < 0) { | $numElementsTodo = 0; @@ -453,12 +452,42 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range) val localIdx = ctx.freshName("localIdx") val localEnd = ctx.freshName("localEnd") - val range = ctx.freshName("range") val shouldStop = if (parent.needStopCheck) { - s"if (shouldStop()) { $number = $value + ${step}L; return; }" + s"if (shouldStop()) { $nextIndex = $value + ${step}L; return; }" } else { "// shouldStop check is eliminated" } + val loopCondition = if (limitNotReachedChecks.isEmpty) { + "true" + } else { + limitNotReachedChecks.mkString(" && ") + } + + // An overview of the Range processing. + // + // For each partition, the Range task needs to produce records from partition start(inclusive) + // to end(exclusive). For better performance, we separate the partition range into batches, and + // use 2 loops to produce data. The outer while loop is used to iterate batches, and the inner + // for loop is used to iterate records inside a batch. + // + // `nextIndex` tracks the index of the next record that is going to be consumed, initialized + // with partition start. `batchEnd` tracks the end index of the current batch, initialized + // with `nextIndex`. In the outer loop, we first check if `nextIndex == batchEnd`. If it's true, + // it means the current batch is fully consumed, and we will update `batchEnd` to process the + // next batch. If `batchEnd` reaches partition end, exit the outer loop. Finally we enter the + // inner loop. Note that, when we enter inner loop, `nextIndex` must be different from + // `batchEnd`, otherwise we already exit the outer loop. + // + // The inner loop iterates from 0 to `localEnd`, which is calculated by + // `(batchEnd - nextIndex) / step`. Since `batchEnd` is increased by `nextBatchTodo * step` in + // the outer loop, and initialized with `nextIndex`, so `batchEnd - nextIndex` is always + // divisible by `step`. The `nextIndex` is increased by `step` during each iteration, and ends + // up being equal to `batchEnd` when the inner loop finishes. + // + // The inner loop can be interrupted, if the query has produced at least one result row, so that + // we don't buffer too many result rows and waste memory. It's ok to interrupt the inner loop, + // because `nextIndex` will be updated before interrupting. 
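// Aside (not part of the patch): a plain-Scala rendering of the control flow described in the
// overview comment above, with the outer loop refilling batches and the inner loop emitting
// records. This sketches the shape of the generated code only; it is not the codegen itself.
object RangeLoopSketch {
  def produce(start: Long, end: Long, step: Long, batchSize: Long)(consume: Long => Unit): Unit = {
    var nextIndex = start
    var batchEnd = nextIndex
    var numElementsTodo = (end - start) / step
    while (true) {
      if (nextIndex == batchEnd) {
        // The current batch is fully consumed: set up the next batch, or stop if none is left.
        val nextBatchTodo = math.min(batchSize, numElementsTodo)
        numElementsTodo -= nextBatchTodo
        if (nextBatchTodo == 0) return
        batchEnd += nextBatchTodo * step
      }
      // Inner loop: (batchEnd - nextIndex) is always divisible by step by construction.
      val localEnd = ((batchEnd - nextIndex) / step).toInt
      var localIdx = 0
      while (localIdx < localEnd) {
        consume(localIdx.toLong * step + nextIndex)
        localIdx += 1
      }
      nextIndex = batchEnd
    }
  }

  def main(args: Array[String]): Unit =
    produce(start = 0L, end = 10L, step = 2L, batchSize = 3L)(println) // prints 0, 2, 4, 6, 8
}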
+ s""" | // initialize Range | if (!$initTerm) { @@ -466,33 +495,30 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range) | $initRangeFuncName(partitionIndex); | } | - | while (true) { - | long $range = $batchEnd - $number; - | if ($range != 0L) { - | int $localEnd = (int)($range / ${step}L); - | for (int $localIdx = 0; $localIdx < $localEnd; $localIdx++) { - | long $value = ((long)$localIdx * ${step}L) + $number; - | ${consume(ctx, Seq(ev))} - | $shouldStop + | while ($loopCondition) { + | if ($nextIndex == $batchEnd) { + | long $nextBatchTodo; + | if ($numElementsTodo > ${batchSize}L) { + | $nextBatchTodo = ${batchSize}L; + | $numElementsTodo -= ${batchSize}L; + | } else { + | $nextBatchTodo = $numElementsTodo; + | $numElementsTodo = 0; + | if ($nextBatchTodo == 0) break; | } - | $number = $batchEnd; + | $numOutput.add($nextBatchTodo); + | $inputMetrics.incRecordsRead($nextBatchTodo); + | $batchEnd += $nextBatchTodo * ${step}L; | } | - | $taskContext.killTaskIfInterrupted(); - | - | long $nextBatchTodo; - | if ($numElementsTodo > ${batchSize}L) { - | $nextBatchTodo = ${batchSize}L; - | $numElementsTodo -= ${batchSize}L; - | } else { - | $nextBatchTodo = $numElementsTodo; - | $numElementsTodo = 0; - | if ($nextBatchTodo == 0) break; + | int $localEnd = (int)(($batchEnd - $nextIndex) / ${step}L); + | for (int $localIdx = 0; $localIdx < $localEnd; $localIdx++) { + | long $value = ((long)$localIdx * ${step}L) + $nextIndex; + | ${consume(ctx, Seq(ev))} + | $shouldStop | } - | $numOutput.add($nextBatchTodo); - | $inputMetrics.incRecordsRead($nextBatchTodo); - | - | $batchEnd += $nextBatchTodo * ${step}L; + | $nextIndex = $batchEnd; + | $taskContext.killTaskIfInterrupted(); | } """.stripMargin } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala index 3fea6d7c7fbf..93447a52097c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala @@ -33,13 +33,17 @@ import org.apache.spark.sql.types._ /** * Analyzes the given columns of the given table to generate statistics, which will be used in - * query optimizations. + * query optimizations. Parameter `allColumns` may be specified to generate statistics of all the + * columns of a given table. */ case class AnalyzeColumnCommand( tableIdent: TableIdentifier, - columnNames: Seq[String]) extends RunnableCommand { + columnNames: Option[Seq[String]], + allColumns: Boolean) extends RunnableCommand { override def run(sparkSession: SparkSession): Seq[Row] = { + require((columnNames.isDefined ^ allColumns), "Parameter `columnNames` or `allColumns` are " + + "mutually exclusive. 
Only one of them should be specified.") val sessionState = sparkSession.sessionState val db = tableIdent.database.getOrElse(sessionState.catalog.getCurrentDatabase) val tableIdentWithDB = TableIdentifier(tableIdent.table, Some(db)) @@ -48,9 +52,12 @@ case class AnalyzeColumnCommand( throw new AnalysisException("ANALYZE TABLE is not supported on views.") } val sizeInBytes = CommandUtils.calculateTotalSize(sparkSession, tableMeta) + val relation = sparkSession.table(tableIdent).logicalPlan + val columnsToAnalyze = getColumnsToAnalyze(tableIdent, relation, columnNames, allColumns) - // Compute stats for each column - val (rowCount, newColStats) = computeColumnStats(sparkSession, tableIdentWithDB, columnNames) + // Compute stats for the computed list of columns. + val (rowCount, newColStats) = + computeColumnStats(sparkSession, relation, columnsToAnalyze) // We also update table-level stats in order to keep them consistent with column-level stats. val statistics = CatalogStatistics( @@ -64,31 +71,39 @@ case class AnalyzeColumnCommand( Seq.empty[Row] } - /** - * Compute stats for the given columns. - * @return (row count, map from column name to CatalogColumnStats) - */ - private def computeColumnStats( - sparkSession: SparkSession, + private def getColumnsToAnalyze( tableIdent: TableIdentifier, - columnNames: Seq[String]): (Long, Map[String, CatalogColumnStat]) = { - - val conf = sparkSession.sessionState.conf - val relation = sparkSession.table(tableIdent).logicalPlan - // Resolve the column names and dedup using AttributeSet - val attributesToAnalyze = columnNames.map { col => - val exprOption = relation.output.find(attr => conf.resolver(attr.name, col)) - exprOption.getOrElse(throw new AnalysisException(s"Column $col does not exist.")) + relation: LogicalPlan, + columnNames: Option[Seq[String]], + allColumns: Boolean = false): Seq[Attribute] = { + val columnsToAnalyze = if (allColumns) { + relation.output + } else { + columnNames.get.map { col => + val exprOption = relation.output.find(attr => conf.resolver(attr.name, col)) + exprOption.getOrElse(throw new AnalysisException(s"Column $col does not exist.")) + } } - // Make sure the column types are supported for stats gathering. - attributesToAnalyze.foreach { attr => + columnsToAnalyze.foreach { attr => if (!supportsType(attr.dataType)) { throw new AnalysisException( s"Column ${attr.name} in table $tableIdent is of type ${attr.dataType}, " + "and Spark does not support statistics collection on this column type.") } } + columnsToAnalyze + } + + /** + * Compute stats for the given columns. + * @return (row count, map from column name to CatalogColumnStats) + */ + private def computeColumnStats( + sparkSession: SparkSession, + relation: LogicalPlan, + columns: Seq[Attribute]): (Long, Map[String, CatalogColumnStat]) = { + val conf = sparkSession.sessionState.conf // Collect statistics per column. // If no histogram is required, we run a job to compute basic column stats such as @@ -99,20 +114,20 @@ case class AnalyzeColumnCommand( // 2. use the percentiles as value intervals of bins, e.g. [p(0), p(1/n)], // [p(1/n), p(2/n)], ..., [p((n-1)/n), p(1)], and then count ndv in each bin. // Basic column stats will be computed together in the second job. 
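// Aside (not part of the patch): the column-selection contract above, shown in isolation.
// Exactly one of `columnNames` / `allColumns` may be given, and requested names are resolved
// against the relation output with the session resolver (modelled here as a case-insensitive
// string comparison). Toy types only; not the AnalyzeColumnCommand implementation.
object ColumnSelectionSketch {
  def columnsToAnalyze(
      output: Seq[String],
      columnNames: Option[Seq[String]],
      allColumns: Boolean): Seq[String] = {
    require(columnNames.isDefined ^ allColumns,
      "Parameter `columnNames` or `allColumns` are mutually exclusive.")
    if (allColumns) {
      output
    } else {
      val resolver = (a: String, b: String) => a.equalsIgnoreCase(b)
      columnNames.get.map { col =>
        output.find(resolver(_, col))
          .getOrElse(throw new IllegalArgumentException(s"Column $col does not exist."))
      }
    }
  }

  def main(args: Array[String]): Unit = {
    val output = Seq("id", "price", "quantity")
    println(columnsToAnalyze(output, None, allColumns = true))                // all three columns
    println(columnsToAnalyze(output, Some(Seq("PRICE")), allColumns = false)) // List(price)
  }
}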
- val attributePercentiles = computePercentiles(attributesToAnalyze, sparkSession, relation) + val attributePercentiles = computePercentiles(columns, sparkSession, relation) // The first element in the result will be the overall row count, the following elements // will be structs containing all column stats. // The layout of each struct follows the layout of the ColumnStats. val expressions = Count(Literal(1)).toAggregateExpression() +: - attributesToAnalyze.map(statExprs(_, conf, attributePercentiles)) + columns.map(statExprs(_, conf, attributePercentiles)) val namedExpressions = expressions.map(e => Alias(e, e.toString)()) val statsRow = new QueryExecution(sparkSession, Aggregate(Nil, namedExpressions, relation)) .executedPlan.executeTake(1).head val rowCount = statsRow.getLong(0) - val columnStats = attributesToAnalyze.zipWithIndex.map { case (attr, i) => + val columnStats = columns.zipWithIndex.map { case (attr, i) => // according to `statExprs`, the stats struct always have 7 fields. (attr.name, rowToColumnStat(statsRow.getStruct(i + 1, 7), attr, rowCount, attributePercentiles.get(attr)).toCatalogColumnStat(attr.name, attr.dataType)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DataWritingCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DataWritingCommand.scala index 0a185b847206..a1bb5af1ab72 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DataWritingCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DataWritingCommand.scala @@ -26,7 +26,6 @@ import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.datasources.BasicWriteJobStatsTracker import org.apache.spark.sql.execution.datasources.FileFormatWriter import org.apache.spark.sql.execution.metric.SQLMetric -import org.apache.spark.sql.types.StructType import org.apache.spark.util.SerializableConfiguration /** @@ -75,18 +74,4 @@ object DataWritingCommand { attr.withName(outputName) } } - - /** - * Returns schema of logical plan with provided names. - * The length of provided names should be the same of the length of [[LogicalPlan.schema]]. 
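// Aside (not part of the patch): for reference, a sketch of what the removed
// `logicalPlanSchemaWithNames` helper did, using only the public StructType/StructField API.
// The literal schema below stands in for `query.schema`.
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

object SchemaWithNamesSketch {
  def schemaWithNames(schema: StructType, names: Seq[String]): StructType = {
    assert(schema.length == names.length,
      "The length of provided names doesn't match the length of the schema.")
    StructType(schema.zip(names).map { case (field, outputName) => field.copy(name = outputName) })
  }

  def main(args: Array[String]): Unit = {
    val schema = StructType(Seq(StructField("_c0", IntegerType), StructField("_c1", StringType)))
    println(schemaWithNames(schema, Seq("id", "name"))) // same types, fields renamed to id, name
  }
}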
- */ - def logicalPlanSchemaWithNames( - query: LogicalPlan, - names: Seq[String]): StructType = { - assert(query.schema.length == names.length, - "The length of provided names doesn't match the length of query schema.") - StructType(query.schema.zip(names).map { case (structField, outputName) => - structField.copy(name = outputName) - }) - } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala index 5172f32ec7b9..cd34dfafd132 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala @@ -242,6 +242,7 @@ case class CreateViewCommand( storage = CatalogStorageFormat.empty, schema = aliasPlan(session, analyzedPlan).schema, properties = newProperties, + viewOriginalText = originalText, viewText = originalText, comment = comment ) @@ -299,6 +300,7 @@ case class AlterViewAsCommand( val updatedViewMeta = viewMeta.copy( schema = analyzedPlan.schema, properties = newProperties, + viewOriginalText = Some(originalText), viewText = Some(originalText)) session.sessionState.catalog.alterTable(updatedViewMeta) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala index 7c6ab4bc922f..774fe38f5c2e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala @@ -183,15 +183,15 @@ object FileFormatWriter extends Logging { val commitMsgs = ret.map(_.commitMsg) committer.commitJob(job, commitMsgs) - logInfo(s"Job ${job.getJobID} committed.") + logInfo(s"Write Job ${description.uuid} committed.") processStats(description.statsTrackers, ret.map(_.summary.stats)) - logInfo(s"Finished processing stats for job ${job.getJobID}.") + logInfo(s"Finished processing stats for write job ${description.uuid}.") // return a set of all the partition paths that were updated during this job ret.map(_.summary.updatedPartitions).reduceOption(_ ++ _).getOrElse(Set.empty) } catch { case cause: Throwable => - logError(s"Aborting job ${job.getJobID}.", cause) + logError(s"Aborting job ${description.uuid}.", cause) committer.abortJob(job) throw new SparkException("Job aborted.", cause) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala index 99fc78ff3e49..345c9d82ca0e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala @@ -89,14 +89,6 @@ class FileScanRDD( inputMetrics.setBytesRead(existingBytesRead + getBytesReadCallback()) } - // If we can't get the bytes read from the FS stats, fall back to the file size, - // which may be inaccurate. - private def updateBytesReadWithFileSize(): Unit = { - if (currentFile != null) { - inputMetrics.incBytesRead(currentFile.length) - } - } - private[this] val files = split.asInstanceOf[FilePartition].files.toIterator private[this] var currentFile: PartitionedFile = null private[this] var currentIterator: Iterator[Object] = null @@ -139,7 +131,6 @@ class FileScanRDD( /** Advances to the next file. 
Returns true if a new non-empty iterator is available. */ private def nextIterator(): Boolean = { - updateBytesReadWithFileSize() if (files.hasNext) { currentFile = files.next() logInfo(s"Reading File $currentFile") @@ -208,7 +199,6 @@ class FileScanRDD( override def close(): Unit = { updateBytesRead() - updateBytesReadWithFileSize() InputFileBlockHolder.unset() } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala index dc5c2ff927e4..fe418e610da8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala @@ -315,7 +315,14 @@ object InMemoryFileIndex extends Logging { // which is very slow on some file system (RawLocalFileSystem, which is launch a // subprocess and parse the stdout). try { - val locations = fs.getFileBlockLocations(f, 0, f.getLen) + val locations = fs.getFileBlockLocations(f, 0, f.getLen).map { loc => + // Store BlockLocation objects to consume less memory + if (loc.getClass == classOf[BlockLocation]) { + loc + } else { + new BlockLocation(loc.getNames, loc.getHosts, loc.getOffset, loc.getLength) + } + } val lfs = new LocatedFileStatus(f.getLen, f.isDirectory, f.getReplication, f.getBlockSize, f.getModificationTime, 0, null, null, null, null, f.getPath, locations) if (f.isSymlink) { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala index 484942d35c85..d43fa3893df1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala @@ -95,7 +95,9 @@ case class InsertIntoHadoopFsRelationCommand( val parameters = CaseInsensitiveMap(options) val partitionOverwriteMode = parameters.get("partitionOverwriteMode") + // scalastyle:off caselocale .map(mode => PartitionOverwriteMode.withName(mode.toUpperCase)) + // scalastyle:on caselocale .getOrElse(sparkSession.sessionState.conf.partitionOverwriteMode) val enableDynamicOverwrite = partitionOverwriteMode == PartitionOverwriteMode.DYNAMIC // This config only makes sense when we are overwriting a partitioned dataset with dynamic diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala index 2b86054c0ffc..b93f418bcb5b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala @@ -86,7 +86,9 @@ abstract class CSVDataSource extends Serializable { if (options.headerFlag) { val duplicates = { val headerNames = row.filter(_ != null) + // scalastyle:off caselocale .map(name => if (caseSensitive) name else name.toLowerCase) + // scalastyle:on caselocale headerNames.diff(headerNames.distinct).distinct } @@ -95,7 +97,9 @@ abstract class CSVDataSource extends Serializable { // When there are empty strings or the values set in `nullValue`, put the // index as the suffix. 
s"_c$index" + // scalastyle:off caselocale } else if (!caseSensitive && duplicates.contains(value.toLowerCase)) { + // scalastyle:on caselocale // When there are case-insensitive duplicates, put the index as the suffix. s"$value$index" } else if (duplicates.contains(value)) { @@ -153,8 +157,10 @@ object CSVDataSource extends Logging { while (errorMessage.isEmpty && i < headerLen) { var (nameInSchema, nameInHeader) = (fieldNames(i), columnNames(i)) if (!caseSensitive) { + // scalastyle:off caselocale nameInSchema = nameInSchema.toLowerCase nameInHeader = nameInHeader.toLowerCase + // scalastyle:on caselocale } if (nameInHeader != nameInSchema) { errorMessage = Some( @@ -240,23 +246,25 @@ object TextInputCSVDataSource extends CSVDataSource { sparkSession: SparkSession, csv: Dataset[String], maybeFirstLine: Option[String], - parsedOptions: CSVOptions): StructType = maybeFirstLine match { - case Some(firstLine) => - val firstRow = new CsvParser(parsedOptions.asParserSettings).parseLine(firstLine) - val caseSensitive = sparkSession.sessionState.conf.caseSensitiveAnalysis - val header = makeSafeHeader(firstRow, caseSensitive, parsedOptions) - val sampled: Dataset[String] = CSVUtils.sample(csv, parsedOptions) - val tokenRDD = sampled.rdd.mapPartitions { iter => - val filteredLines = CSVUtils.filterCommentAndEmpty(iter, parsedOptions) - val linesWithoutHeader = - CSVUtils.filterHeaderLine(filteredLines, firstLine, parsedOptions) - val parser = new CsvParser(parsedOptions.asParserSettings) - linesWithoutHeader.map(parser.parseLine) - } - CSVInferSchema.infer(tokenRDD, header, parsedOptions) - case None => - // If the first line could not be read, just return the empty schema. - StructType(Nil) + parsedOptions: CSVOptions): StructType = { + val csvParser = new CsvParser(parsedOptions.asParserSettings) + maybeFirstLine.map(csvParser.parseLine(_)) match { + case Some(firstRow) if firstRow != null => + val caseSensitive = sparkSession.sessionState.conf.caseSensitiveAnalysis + val header = makeSafeHeader(firstRow, caseSensitive, parsedOptions) + val sampled: Dataset[String] = CSVUtils.sample(csv, parsedOptions) + val tokenRDD = sampled.rdd.mapPartitions { iter => + val filteredLines = CSVUtils.filterCommentAndEmpty(iter, parsedOptions) + val linesWithoutHeader = + CSVUtils.filterHeaderLine(filteredLines, maybeFirstLine.get, parsedOptions) + val parser = new CsvParser(parsedOptions.asParserSettings) + linesWithoutHeader.map(parser.parseLine) + } + CSVInferSchema.infer(tokenRDD, header, parsedOptions) + case _ => + // If the first line could not be read, just return the empty schema. 
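// Aside (not part of the patch): the header sanitisation shown above, in isolation. Null or
// empty header cells become `_c<index>`, and (case-insensitive) duplicates get the column index
// appended so the inferred schema has unique field names. Simplified: the real code also
// honours the configured nullValue and the scalastyle caselocale rules.
object SafeHeaderSketch {
  def makeSafeHeader(row: Array[String], caseSensitive: Boolean): Array[String] = {
    val headerNames = row.filter(_ != null)
      .map(name => if (caseSensitive) name else name.toLowerCase)
    val duplicates = headerNames.diff(headerNames.distinct).distinct

    row.zipWithIndex.map { case (value, index) =>
      if (value == null || value.isEmpty) {
        s"_c$index"
      } else if (!caseSensitive && duplicates.contains(value.toLowerCase)) {
        s"$value$index"
      } else if (duplicates.contains(value)) {
        s"$value$index"
      } else {
        value
      }
    }
  }

  def main(args: Array[String]): Unit = {
    val parsedFirstLine: Array[String] = Array("id", "Name", "name", null, "")
    // prints: id, Name1, name2, _c3, _c4
    println(makeSafeHeader(parsedFirstLine, caseSensitive = false).mkString(", "))
  }
}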
+ StructType(Nil) + } } private def createBaseDataset( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala index a585cbed2551..3596ff105fd7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala @@ -70,7 +70,7 @@ private[csv] object CSVInferSchema { def mergeRowTypes(first: Array[DataType], second: Array[DataType]): Array[DataType] = { first.zipAll(second, NullType, NullType).map { case (a, b) => - findTightestCommonType(a, b).getOrElse(NullType) + compatibleType(a, b).getOrElse(NullType) } } @@ -88,7 +88,7 @@ private[csv] object CSVInferSchema { case LongType => tryParseLong(field, options) case _: DecimalType => // DecimalTypes have different precisions and scales, so we try to find the common type. - findTightestCommonType(typeSoFar, tryParseDecimal(field, options)).getOrElse(StringType) + compatibleType(typeSoFar, tryParseDecimal(field, options)).getOrElse(StringType) case DoubleType => tryParseDouble(field, options) case TimestampType => tryParseTimestamp(field, options) case BooleanType => tryParseBoolean(field, options) @@ -172,35 +172,27 @@ private[csv] object CSVInferSchema { StringType } - private val numericPrecedence: IndexedSeq[DataType] = TypeCoercion.numericPrecedence + /** + * Returns the common data type given two input data types so that the return type + * is compatible with both input data types. + */ + private def compatibleType(t1: DataType, t2: DataType): Option[DataType] = { + TypeCoercion.findTightestCommonType(t1, t2).orElse(findCompatibleTypeForCSV(t1, t2)) + } /** - * Copied from internal Spark api - * [[org.apache.spark.sql.catalyst.analysis.TypeCoercion]] + * The following pattern matching represents additional type promotion rules that + * are CSV specific. */ - val findTightestCommonType: (DataType, DataType) => Option[DataType] = { - case (t1, t2) if t1 == t2 => Some(t1) - case (NullType, t1) => Some(t1) - case (t1, NullType) => Some(t1) + private val findCompatibleTypeForCSV: (DataType, DataType) => Option[DataType] = { case (StringType, t2) => Some(StringType) case (t1, StringType) => Some(StringType) - // Promote numeric types to the highest of the two and all numeric types to unlimited decimal - case (t1, t2) if Seq(t1, t2).forall(numericPrecedence.contains) => - val index = numericPrecedence.lastIndexWhere(t => t == t1 || t == t2) - Some(numericPrecedence(index)) - - // These two cases below deal with when `DecimalType` is larger than `IntegralType`. - case (t1: IntegralType, t2: DecimalType) if t2.isWiderThan(t1) => - Some(t2) - case (t1: DecimalType, t2: IntegralType) if t1.isWiderThan(t2) => - Some(t1) - // These two cases below deal with when `IntegralType` is larger than `DecimalType`. case (t1: IntegralType, t2: DecimalType) => - findTightestCommonType(DecimalType.forType(t1), t2) + compatibleType(DecimalType.forType(t1), t2) case (t1: DecimalType, t2: IntegralType) => - findTightestCommonType(t1, DecimalType.forType(t2)) + compatibleType(t1, DecimalType.forType(t2)) // Double support larger range than fixed decimal, DecimalType.Maximum should be enough // in most case, also have better precision. 
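// Aside (not part of the patch): the inference fallback pattern above on a reduced type set:
// try the tightest common type first and only then apply the CSV-specific promotions (strings
// absorb everything; integral vs. decimal widens to a decimal that can hold both). A simplified
// stand-in for TypeCoercion that ignores the 38-digit precision cap and decimal-vs-decimal
// merging handled by the real code.
import org.apache.spark.sql.types._

object CsvTypeWideningSketch {
  private val numericPrecedence: IndexedSeq[DataType] =
    IndexedSeq(ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType)

  private val integralAsDecimal: Map[DataType, DecimalType] = Map(
    ByteType -> DecimalType(3, 0), ShortType -> DecimalType(5, 0),
    IntegerType -> DecimalType(10, 0), LongType -> DecimalType(20, 0))

  private def tightestCommonType(t1: DataType, t2: DataType): Option[DataType] = (t1, t2) match {
    case (a, b) if a == b => Some(a)
    case (NullType, t) => Some(t)
    case (t, NullType) => Some(t)
    case (a, b) if Seq(a, b).forall(numericPrecedence.contains) =>
      Some(numericPrecedence(numericPrecedence.lastIndexWhere(t => t == a || t == b)))
    case _ => None
  }

  private def wider(a: DecimalType, b: DecimalType): DecimalType = {
    val scale = math.max(a.scale, b.scale)
    DecimalType(math.max(a.precision - a.scale, b.precision - b.scale) + scale, scale)
  }

  private def csvSpecific(t1: DataType, t2: DataType): Option[DataType] = (t1, t2) match {
    case (StringType, _) | (_, StringType) => Some(StringType)
    case (i, d: DecimalType) if integralAsDecimal.contains(i) => Some(wider(integralAsDecimal(i), d))
    case (d: DecimalType, i) if integralAsDecimal.contains(i) => Some(wider(integralAsDecimal(i), d))
    case _ => None
  }

  def compatibleType(t1: DataType, t2: DataType): DataType =
    tightestCommonType(t1, t2).orElse(csvSpecific(t1, t2)).getOrElse(StringType)

  def main(args: Array[String]): Unit = {
    println(compatibleType(IntegerType, DoubleType))       // DoubleType
    println(compatibleType(LongType, DecimalType(20, 0)))  // DecimalType(20,0)
    println(compatibleType(TimestampType, IntegerType))    // StringType (no numeric widening)
  }
}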
@@ -216,7 +208,6 @@ private[csv] object CSVInferSchema { } else { Some(DecimalType(range + scale, scale)) } - case _ => None } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala index fab8d62da0c1..492a21be6df3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala @@ -162,6 +162,21 @@ class CSVOptions( */ val enforceSchema = getBool("enforceSchema", default = true) + + /** + * String representation of an empty value in read and in write. + */ + val emptyValue = parameters.get("emptyValue") + /** + * The string is returned when CSV reader doesn't have any characters for input value, + * or an empty quoted string `""`. Default value is empty string. + */ + val emptyValueInRead = emptyValue.getOrElse("") + /** + * The value is used instead of an empty string in write. Default value is `""` + */ + val emptyValueInWrite = emptyValue.getOrElse("\"\"") + def asWriterSettings: CsvWriterSettings = { val writerSettings = new CsvWriterSettings() val format = writerSettings.getFormat @@ -173,7 +188,7 @@ class CSVOptions( writerSettings.setIgnoreLeadingWhitespaces(ignoreLeadingWhiteSpaceFlagInWrite) writerSettings.setIgnoreTrailingWhitespaces(ignoreTrailingWhiteSpaceFlagInWrite) writerSettings.setNullValue(nullValue) - writerSettings.setEmptyValue("\"\"") + writerSettings.setEmptyValue(emptyValueInWrite) writerSettings.setSkipEmptyLines(true) writerSettings.setQuoteAllFields(quoteAll) writerSettings.setQuoteEscapingEnabled(escapeQuotes) @@ -194,7 +209,7 @@ class CSVOptions( settings.setInputBufferSize(inputBufferSize) settings.setMaxColumns(maxColumns) settings.setNullValue(nullValue) - settings.setEmptyValue("") + settings.setEmptyValue(emptyValueInRead) settings.setMaxCharsPerColumn(maxCharsPerColumn) settings.setUnescapedQuoteHandling(UnescapedQuoteHandling.STOP_AT_DELIMITER) settings diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala index e15af425b264..9088d43905e2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala @@ -216,7 +216,12 @@ class UnivocityParser( } private def convert(tokens: Array[String]): InternalRow = { - if (tokens.length != parsedSchema.length) { + if (tokens == null) { + throw BadRecordException( + () => getCurrentInput, + () => None, + new RuntimeException("Malformed CSV record")) + } else if (tokens.length != parsedSchema.length) { // If the number of tokens doesn't match the schema, we should treat it as a malformed record. // However, we still have chance to parse some of the tokens, by adding extra null tokens in // the tail if the number is smaller, or by dropping extra tokens if the number is larger. 
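// Aside (not part of the patch): a usage sketch for the new `emptyValue` CSV option wired in
// above. On write it replaces the previously hard-coded quoted "" marker for empty strings; on
// read it is the value the parser returns for empty (quoted) input. The output path and sample
// data below are hypothetical.
import org.apache.spark.sql.SparkSession

object EmptyValueOptionExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("empty-value").getOrCreate()
    import spark.implicits._

    val df = Seq(("a", ""), ("b", "x")).toDF("key", "value")
    // Empty strings are written as the marker EMPTY instead of a quoted "".
    df.write.option("emptyValue", "EMPTY").csv("/tmp/empty-value-demo")
    // Symmetric read: cells the parser treats as empty come back as EMPTY.
    spark.read.option("emptyValue", "EMPTY").csv("/tmp/empty-value-demo").show()
    spark.stop()
  }
}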
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala index ac062fdc092e..95fb25bf5add 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.execution.datasources.orc +import java.util.Locale + import scala.collection.JavaConverters._ import org.apache.hadoop.conf.Configuration @@ -27,7 +29,7 @@ import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.catalyst.analysis.{caseInsensitiveResolution, caseSensitiveResolution} +import org.apache.spark.sql.catalyst.analysis.caseSensitiveResolution import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.types._ @@ -116,8 +118,29 @@ object OrcUtils extends Logging { } }) } else { - val resolver = if (isCaseSensitive) caseSensitiveResolution else caseInsensitiveResolution - Some(requiredSchema.fieldNames.map { name => orcFieldNames.indexWhere(resolver(_, name)) }) + if (isCaseSensitive) { + Some(requiredSchema.fieldNames.map { name => + orcFieldNames.indexWhere(caseSensitiveResolution(_, name)) + }) + } else { + // Do case-insensitive resolution only if in case-insensitive mode + val caseInsensitiveOrcFieldMap = + orcFieldNames.zipWithIndex.groupBy(_._1.toLowerCase(Locale.ROOT)) + Some(requiredSchema.fieldNames.map { requiredFieldName => + caseInsensitiveOrcFieldMap + .get(requiredFieldName.toLowerCase(Locale.ROOT)) + .map { matchedOrcFields => + if (matchedOrcFields.size > 1) { + // Need to fail if there is ambiguity, i.e. more than one field is matched. + val matchedOrcFieldsString = matchedOrcFields.map(_._1).mkString("[", ", ", "]") + throw new RuntimeException(s"""Found duplicate field(s) "$requiredFieldName": """ + + s"$matchedOrcFieldsString in case-insensitive mode") + } else { + matchedOrcFields.head._2 + } + }.getOrElse(-1) + }) + } } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala index 0c286defb940..21ab9c78e53d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala @@ -394,7 +394,22 @@ private[parquet] class ParquetFilters( */ def createFilter(schema: MessageType, predicate: sources.Filter): Option[FilterPredicate] = { val nameToParquetField = getFieldMap(schema) + createFilterHelper(nameToParquetField, predicate, canPartialPushDownConjuncts = true) + } + /** + * @param nameToParquetField a map from the field name to its field name and data type. + * This only includes the root fields whose types are primitive types. + * @param predicate the input filter predicates. Not all the predicates can be pushed down. + * @param canPartialPushDownConjuncts whether a subset of conjuncts of predicates can be pushed + * down safely. Pushing ONLY one side of AND down is safe to + * do at the top level or none of its ancestors is NOT and OR. + * @return the Parquet-native filter predicates that are eligible for pushdown. 
+ */ + private def createFilterHelper( + nameToParquetField: Map[String, ParquetField], + predicate: sources.Filter, + canPartialPushDownConjuncts: Boolean): Option[FilterPredicate] = { // Decimal type must make sure that filter value's scale matched the file. // If doesn't matched, which would cause data corruption. def isDecimalMatched(value: Any, decimalMeta: DecimalMetadata): Boolean = value match { @@ -488,26 +503,40 @@ private[parquet] class ParquetFilters( .map(_(nameToParquetField(name).fieldName, value)) case sources.And(lhs, rhs) => - // At here, it is not safe to just convert one side if we do not understand the - // other side. Here is an example used to explain the reason. + // At here, it is not safe to just convert one side and remove the other side + // if we do not understand what the parent filters are. + // + // Here is an example used to explain the reason. // Let's say we have NOT(a = 2 AND b in ('1')) and we do not understand how to // convert b in ('1'). If we only convert a = 2, we will end up with a filter // NOT(a = 2), which will generate wrong results. - // Pushing one side of AND down is only safe to do at the top level. - // You can see ParquetRelation's initializeLocalJobFunc method as an example. - for { - lhsFilter <- createFilter(schema, lhs) - rhsFilter <- createFilter(schema, rhs) - } yield FilterApi.and(lhsFilter, rhsFilter) + // + // Pushing one side of AND down is only safe to do at the top level or in the child + // AND before hitting NOT or OR conditions, and in this case, the unsupported predicate + // can be safely removed. + val lhsFilterOption = + createFilterHelper(nameToParquetField, lhs, canPartialPushDownConjuncts) + val rhsFilterOption = + createFilterHelper(nameToParquetField, rhs, canPartialPushDownConjuncts) + + (lhsFilterOption, rhsFilterOption) match { + case (Some(lhsFilter), Some(rhsFilter)) => Some(FilterApi.and(lhsFilter, rhsFilter)) + case (Some(lhsFilter), None) if canPartialPushDownConjuncts => Some(lhsFilter) + case (None, Some(rhsFilter)) if canPartialPushDownConjuncts => Some(rhsFilter) + case _ => None + } case sources.Or(lhs, rhs) => for { - lhsFilter <- createFilter(schema, lhs) - rhsFilter <- createFilter(schema, rhs) + lhsFilter <- + createFilterHelper(nameToParquetField, lhs, canPartialPushDownConjuncts = false) + rhsFilter <- + createFilterHelper(nameToParquetField, rhs, canPartialPushDownConjuncts = false) } yield FilterApi.or(lhsFilter, rhsFilter) case sources.Not(pred) => - createFilter(schema, pred).map(FilterApi.not) + createFilterHelper(nameToParquetField, pred, canPartialPushDownConjuncts = false) + .map(FilterApi.not) case sources.In(name, values) if canMakeFilterOn(name, values.head) && values.distinct.length <= pushDownInFilterThreshold => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruning.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruning.scala index 6a46b5f8edc5..91080b15727d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruning.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruning.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.execution.datasources.parquet -import org.apache.spark.sql.catalyst.expressions.{And, Attribute, AttributeReference, Expression, NamedExpression} +import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.PhysicalOperation 
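// Aside (not part of the patch): the partial-pushdown rule described in the comments above,
// reduced to a toy filter algebra. Under AND the convertible side may be kept on its own as
// long as no enclosing NOT/OR exists; under OR and NOT everything must convert, so the flag is
// switched off when recursing. All types and names here are illustrative, not the Parquet API.
object PartialPushdownSketch {
  sealed trait Filter
  case class Supported(sql: String) extends Filter
  case class Unsupported(sql: String) extends Filter
  case class And(left: Filter, right: Filter) extends Filter
  case class Or(left: Filter, right: Filter) extends Filter
  case class Not(child: Filter) extends Filter

  def convert(f: Filter, canPartialPushDown: Boolean = true): Option[String] = f match {
    case Supported(sql)   => Some(sql)
    case Unsupported(_)   => None
    case And(left, right) =>
      (convert(left, canPartialPushDown), convert(right, canPartialPushDown)) match {
        case (Some(l), Some(r)) => Some(s"($l AND $r)")
        case (Some(l), None) if canPartialPushDown => Some(l)
        case (None, Some(r)) if canPartialPushDown => Some(r)
        case _ => None
      }
    case Or(left, right) =>
      for {
        l <- convert(left, canPartialPushDown = false)
        r <- convert(right, canPartialPushDown = false)
      } yield s"($l OR $r)"
    case Not(child) =>
      convert(child, canPartialPushDown = false).map(p => s"(NOT $p)")
  }

  def main(args: Array[String]): Unit = {
    // Top-level AND: the unsupported conjunct is dropped and `a = 2` is still pushed down.
    println(convert(And(Supported("a = 2"), Unsupported("b in ('1')"))))      // Some(a = 2)
    // The same AND under NOT must not be split, or NOT(a = 2) would return wrong results.
    println(convert(Not(And(Supported("a = 2"), Unsupported("b in ('1')"))))) // None
  }
}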
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project} import org.apache.spark.sql.catalyst.rules.Rule @@ -110,7 +110,17 @@ private[sql] object ParquetSchemaPruning extends Rule[LogicalPlan] { val projectionRootFields = projects.flatMap(getRootFields) val filterRootFields = filters.flatMap(getRootFields) - (projectionRootFields ++ filterRootFields).distinct + // Kind of expressions don't need to access any fields of a root fields, e.g., `IsNotNull`. + // For them, if there are any nested fields accessed in the query, we don't need to add root + // field access of above expressions. + // For example, for a query `SELECT name.first FROM contacts WHERE name IS NOT NULL`, + // we don't need to read nested fields of `name` struct other than `first` field. + val (rootFields, optRootFields) = (projectionRootFields ++ filterRootFields) + .distinct.partition(_.contentAccessed) + + optRootFields.filter { opt => + !rootFields.exists(_.field.name == opt.field.name) + } ++ rootFields } /** @@ -156,7 +166,7 @@ private[sql] object ParquetSchemaPruning extends Rule[LogicalPlan] { // in the resulting schema may differ from their ordering in the logical relation's // original schema val mergedSchema = requestedRootFields - .map { case RootField(field, _) => StructType(Array(field)) } + .map { case root: RootField => StructType(Array(root.field)) } .reduceLeft(_ merge _) val dataSchemaFieldNames = fileDataSchema.fieldNames.toSet val mergedDataSchema = @@ -199,6 +209,15 @@ private[sql] object ParquetSchemaPruning extends Rule[LogicalPlan] { case att: Attribute => RootField(StructField(att.name, att.dataType, att.nullable), derivedFromAtt = true) :: Nil case SelectedField(field) => RootField(field, derivedFromAtt = false) :: Nil + // Root field accesses by `IsNotNull` and `IsNull` are special cases as the expressions + // don't actually use any nested fields. These root field accesses might be excluded later + // if there are any nested fields accesses in the query plan. + case IsNotNull(SelectedField(field)) => + RootField(field, derivedFromAtt = false, contentAccessed = false) :: Nil + case IsNull(SelectedField(field)) => + RootField(field, derivedFromAtt = false, contentAccessed = false) :: Nil + case IsNotNull(_: Attribute) | IsNull(_: Attribute) => + expr.children.flatMap(getRootFields).map(_.copy(contentAccessed = false)) case _ => expr.children.flatMap(getRootFields) } @@ -250,8 +269,11 @@ private[sql] object ParquetSchemaPruning extends Rule[LogicalPlan] { } /** - * A "root" schema field (aka top-level, no-parent) and whether it was derived from - * an attribute or had a proper child. + * This represents a "root" schema field (aka top-level, no-parent). `field` is the + * `StructField` for field name and datatype. `derivedFromAtt` indicates whether it + * was derived from an attribute or had a proper child. `contentAccessed` means whether + * it was accessed with its content by the expressions refer it. 
*/ - private case class RootField(field: StructField, derivedFromAtt: Boolean) + private case class RootField(field: StructField, derivedFromAtt: Boolean, + contentAccessed: Boolean = true) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala index 9576605b1a21..aba94885f941 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala @@ -231,11 +231,6 @@ object ShuffleExchangeExec { override def numPartitions: Int = 1 override def getPartition(key: Any): Int = 0 } - case l: LocalPartitioning => - new Partitioner { - override def numPartitions: Int = l.numPartitions - override def getPartition(key: Any): Int = key.asInstanceOf[Int] - } case _ => sys.error(s"Exchange not implemented for $newPartitioning") // TODO: Handle BroadcastPartitioning. } @@ -252,9 +247,6 @@ object ShuffleExchangeExec { val projection = UnsafeProjection.create(h.partitionIdExpression :: Nil, outputAttributes) row => projection(row).getInt(0) case RangePartitioning(_, _) | SinglePartition => identity - case _: LocalPartitioning => - val partitionId = TaskContext.get().partitionId() - _ => partitionId case _ => sys.error(s"Exchange not implemented for $newPartitioning") } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/limit.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/limit.scala index fb46970e38f3..9bfe1a79fc1e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/limit.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/limit.scala @@ -46,17 +46,23 @@ case class CollectLimitExec(limit: Int, child: SparkPlan) extends UnaryExecNode } } +object BaseLimitExec { + private val curId = new java.util.concurrent.atomic.AtomicInteger() + + def newLimitCountTerm(): String = { + val id = curId.getAndIncrement() + s"_limit_counter_$id" + } +} + /** - * Take the first `limit` elements of each child partition, but do not collect or shuffle them. + * Helper trait which defines methods that are shared by both + * [[LocalLimitExec]] and [[GlobalLimitExec]]. */ -case class LocalLimitExec(limit: Int, child: SparkPlan) extends UnaryExecNode with CodegenSupport { - +trait BaseLimitExec extends UnaryExecNode with CodegenSupport { + val limit: Int override def output: Seq[Attribute] = child.output - override def outputOrdering: Seq[SortOrder] = child.outputOrdering - - override def outputPartitioning: Partitioning = child.outputPartitioning - protected override def doExecute(): RDD[InternalRow] = child.execute().mapPartitions { iter => iter.take(limit) } @@ -69,120 +75,50 @@ case class LocalLimitExec(limit: Int, child: SparkPlan) extends UnaryExecNode wi // to the parent operator. 
override def usedInputs: AttributeSet = AttributeSet.empty + private lazy val countTerm = BaseLimitExec.newLimitCountTerm() + + override lazy val limitNotReachedChecks: Seq[String] = { + s"$countTerm < $limit" +: super.limitNotReachedChecks + } + protected override def doProduce(ctx: CodegenContext): String = { child.asInstanceOf[CodegenSupport].produce(ctx, this) } override def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { - val stopEarly = - ctx.addMutableState(CodeGenerator.JAVA_BOOLEAN, "stopEarly") // init as stopEarly = false - - ctx.addNewFunction("stopEarly", s""" - @Override - protected boolean stopEarly() { - return $stopEarly; - } - """, inlineToOuterClass = true) - val countTerm = ctx.addMutableState(CodeGenerator.JAVA_INT, "count") // init as count = 0 + // The counter name is already obtained by the upstream operators via `limitNotReachedChecks`. + // Here we have to inline it to not change its name. This is fine as we won't have many limit + // operators in one query. + ctx.addMutableState(CodeGenerator.JAVA_INT, countTerm, forceInline = true, useFreshName = false) s""" | if ($countTerm < $limit) { | $countTerm += 1; | ${consume(ctx, input)} - | } else { - | $stopEarly = true; | } """.stripMargin } } /** - * Take the `limit` elements of the child output. + * Take the first `limit` elements of each child partition, but do not collect or shuffle them. */ -case class GlobalLimitExec(limit: Int, child: SparkPlan) extends UnaryExecNode { +case class LocalLimitExec(limit: Int, child: SparkPlan) extends BaseLimitExec { - override def output: Seq[Attribute] = child.output + override def outputOrdering: Seq[SortOrder] = child.outputOrdering override def outputPartitioning: Partitioning = child.outputPartitioning +} - override def outputOrdering: Seq[SortOrder] = child.outputOrdering +/** + * Take the first `limit` elements of the child's single output partition. + */ +case class GlobalLimitExec(limit: Int, child: SparkPlan) extends BaseLimitExec { - private val serializer: Serializer = new UnsafeRowSerializer(child.output.size) + override def requiredChildDistribution: List[Distribution] = AllTuples :: Nil - protected override def doExecute(): RDD[InternalRow] = { - val childRDD = child.execute() - val partitioner = LocalPartitioning(childRDD) - val shuffleDependency = ShuffleExchangeExec.prepareShuffleDependency( - childRDD, child.output, partitioner, serializer) - val numberOfOutput: Seq[Long] = if (shuffleDependency.rdd.getNumPartitions != 0) { - // submitMapStage does not accept RDD with 0 partition. - // So, we will not submit this dependency. - val submittedStageFuture = sparkContext.submitMapStage(shuffleDependency) - submittedStageFuture.get().recordsByPartitionId.toSeq - } else { - Nil - } + override def outputPartitioning: Partitioning = child.outputPartitioning - // This is an optimization to evenly distribute limited rows across all partitions. - // When enabled, Spark goes to take rows at each partition repeatedly until reaching - // limit number. When disabled, Spark takes all rows at first partition, then rows - // at second partition ..., until reaching limit number. 
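// Aside (not part of the patch): the limit semantics after this refactoring, shown on plain
// collections. A local limit takes `limit` rows from every partition independently, while the
// global limit first requires all rows in a single partition (AllTuples) and then takes `limit`
// rows once. The flattened Seq below stands in for that single partition.
object LimitSemanticsSketch {
  type Partition = Seq[Int]

  def localLimit(partitions: Seq[Partition], limit: Int): Seq[Partition] =
    partitions.map(_.take(limit))

  def globalLimit(partitions: Seq[Partition], limit: Int): Partition =
    partitions.flatten.take(limit)

  def main(args: Array[String]): Unit = {
    val partitions = Seq(Seq(1, 2, 3), Seq(4, 5), Seq(6, 7, 8, 9))
    println(localLimit(partitions, 2))  // List(List(1, 2), List(4, 5), List(6, 7))
    println(globalLimit(partitions, 2)) // List(1, 2)
  }
}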
- val flatGlobalLimit = sqlContext.conf.limitFlatGlobalLimit - - val shuffled = new ShuffledRowRDD(shuffleDependency) - - val sumOfOutput = numberOfOutput.sum - if (sumOfOutput <= limit) { - shuffled - } else if (!flatGlobalLimit) { - var numRowTaken = 0 - val takeAmounts = numberOfOutput.map { num => - if (numRowTaken + num < limit) { - numRowTaken += num.toInt - num.toInt - } else { - val toTake = limit - numRowTaken - numRowTaken += toTake - toTake - } - } - val broadMap = sparkContext.broadcast(takeAmounts) - shuffled.mapPartitionsWithIndexInternal { case (index, iter) => - iter.take(broadMap.value(index).toInt) - } - } else { - // We try to evenly require the asked limit number of rows across all child rdd's partitions. - var rowsNeedToTake: Long = limit - val takeAmountByPartition: Array[Long] = Array.fill[Long](numberOfOutput.length)(0L) - val remainingRowsByPartition: Array[Long] = Array(numberOfOutput: _*) - - while (rowsNeedToTake > 0) { - val nonEmptyParts = remainingRowsByPartition.count(_ > 0) - // If the rows needed to take are less the number of non-empty partitions, take one row from - // each non-empty partitions until we reach `limit` rows. - // Otherwise, evenly divide the needed rows to each non-empty partitions. - val takePerPart = math.max(1, rowsNeedToTake / nonEmptyParts) - remainingRowsByPartition.zipWithIndex.foreach { case (num, index) => - // In case `rowsNeedToTake` < `nonEmptyParts`, we may run out of `rowsNeedToTake` during - // the traversal, so we need to add this check. - if (rowsNeedToTake > 0 && num > 0) { - if (num >= takePerPart) { - rowsNeedToTake -= takePerPart - takeAmountByPartition(index) += takePerPart - remainingRowsByPartition(index) -= takePerPart - } else { - rowsNeedToTake -= num - takeAmountByPartition(index) += num - remainingRowsByPartition(index) -= num - } - } - } - } - val broadMap = sparkContext.broadcast(takeAmountByPartition) - shuffled.mapPartitionsWithIndexInternal { case (index, iter) => - iter.take(broadMap.value(index).toInt) - } - } - } + override def outputOrdering: Seq[SortOrder] = child.outputOrdering } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/RowQueue.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/RowQueue.scala index e2fa6e7f504b..d2820ff335ec 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/RowQueue.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/RowQueue.scala @@ -21,9 +21,10 @@ import java.io._ import com.google.common.io.Closeables -import org.apache.spark.SparkException +import org.apache.spark.{SparkEnv, SparkException} import org.apache.spark.io.NioBufferedFileInputStream import org.apache.spark.memory.{MemoryConsumer, TaskMemoryManager} +import org.apache.spark.serializer.SerializerManager import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.memory.MemoryBlock @@ -108,9 +109,13 @@ private[python] abstract class InMemoryRowQueue(val page: MemoryBlock, numFields * A RowQueue that is backed by a file on disk. This queue will stop accepting new rows once any * reader has begun reading from the queue. 
*/ -private[python] case class DiskRowQueue(file: File, fields: Int) extends RowQueue { - private var out = new DataOutputStream( - new BufferedOutputStream(new FileOutputStream(file.toString))) +private[python] case class DiskRowQueue( + file: File, + fields: Int, + serMgr: SerializerManager) extends RowQueue { + + private var out = new DataOutputStream(serMgr.wrapForEncryption( + new BufferedOutputStream(new FileOutputStream(file.toString)))) private var unreadBytes = 0L private var in: DataInputStream = _ @@ -131,7 +136,8 @@ private[python] case class DiskRowQueue(file: File, fields: Int) extends RowQueu if (out != null) { out.close() out = null - in = new DataInputStream(new NioBufferedFileInputStream(file)) + in = new DataInputStream(serMgr.wrapForEncryption( + new NioBufferedFileInputStream(file))) } if (unreadBytes > 0) { @@ -166,7 +172,8 @@ private[python] case class DiskRowQueue(file: File, fields: Int) extends RowQueu private[python] case class HybridRowQueue( memManager: TaskMemoryManager, tempDir: File, - numFields: Int) + numFields: Int, + serMgr: SerializerManager) extends MemoryConsumer(memManager) with RowQueue { // Each buffer should have at least one row @@ -212,7 +219,7 @@ private[python] case class HybridRowQueue( } private def createDiskQueue(): RowQueue = { - DiskRowQueue(File.createTempFile("buffer", "", tempDir), numFields) + DiskRowQueue(File.createTempFile("buffer", "", tempDir), numFields, serMgr) } private def createNewQueue(required: Long): RowQueue = { @@ -279,3 +286,9 @@ private[python] case class HybridRowQueue( } } } + +private[python] object HybridRowQueue { + def apply(taskMemoryMgr: TaskMemoryManager, file: File, fields: Int): HybridRowQueue = { + HybridRowQueue(taskMemoryMgr, file, fields, SparkEnv.get.serializerManager) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamOptions.scala index d54ed44b43bf..1d57cb084df9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamOptions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamOptions.scala @@ -54,8 +54,7 @@ class FileStreamOptions(parameters: CaseInsensitiveMap[String]) extends Logging Utils.timeStringAsMs(parameters.getOrElse("maxFileAge", "7d")) /** Options as specified by the user, in a case-insensitive map, without "path" set. */ - val optionMapWithoutPath: Map[String, String] = - parameters.filterKeys(_ != "path") + val optionMapWithoutPath: Map[String, String] = parameters - "path" /** * Whether to scan latest files first. 
If it's true, when the source finds unprocessed files in a diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala index b1cafd67820c..2cac86599ef1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala @@ -511,6 +511,8 @@ class MicroBatchExecution( sparkSessionToRunBatch.sparkContext.setLocalProperty( MicroBatchExecution.BATCH_ID_KEY, currentBatchId.toString) + sparkSessionToRunBatch.sparkContext.setLocalProperty( + StreamExecution.IS_CONTINUOUS_PROCESSING, false.toString) reportTimeTaken("queryPlanning") { lastExecution = new IncrementalExecution( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ProgressReporter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ProgressReporter.scala index d4b50655c721..392229bcb5f5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ProgressReporter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ProgressReporter.scala @@ -240,9 +240,6 @@ trait ProgressReporter extends Logging { /** Extract number of input sources for each streaming source in plan */ private def extractSourceToNumInputRows(): Map[BaseStreamingSource, Long] = { - import java.util.IdentityHashMap - import scala.collection.JavaConverters._ - def sumRows(tuples: Seq[(BaseStreamingSource, Long)]): Map[BaseStreamingSource, Long] = { tuples.groupBy(_._1).mapValues(_.map(_._2).sum) // sum up rows for each source } @@ -255,40 +252,15 @@ trait ProgressReporter extends Logging { } if (onlyDataSourceV2Sources) { - // DataSourceV2ScanExec is the execution plan leaf that is responsible for reading data - // from a V2 source and has a direct reference to the V2 source that generated it. Each - // DataSourceV2ScanExec records the number of rows it has read using SQLMetrics. However, - // just collecting all DataSourceV2ScanExec nodes and getting the metric is not correct as - // a DataSourceV2ScanExec instance may be referred to in the execution plan from two (or - // even multiple times) points and considering it twice will lead to double counting. We - // can't dedup them using their hashcode either because two different instances of - // DataSourceV2ScanExec can have the same hashcode but account for separate sets of - // records read, and deduping them to consider only one of them would be undercounting the - // records read. Therefore the right way to do this is to consider the unique instances of - // DataSourceV2ScanExec (using their identity hash codes) and get metrics from them. - // Hence we calculate in the following way. - // - // 1. Collect all the unique DataSourceV2ScanExec instances using IdentityHashMap. - // - // 2. Extract the source and the number of rows read from the DataSourceV2ScanExec instanes. - // - // 3. Multiple DataSourceV2ScanExec instance may refer to the same source (can happen with - // self-unions or self-joins). Add up the number of rows for each unique source. - val uniqueStreamingExecLeavesMap = - new IdentityHashMap[DataSourceV2ScanExec, DataSourceV2ScanExec]() - - lastExecution.executedPlan.collectLeaves().foreach { + // It's possible that multiple DataSourceV2ScanExec instances may refer to the same source + // (can happen with self-unions or self-joins). 
This means the source is scanned multiple + // times in the query, we should count the numRows for each scan. + val sourceToInputRowsTuples = lastExecution.executedPlan.collect { case s: DataSourceV2ScanExec if s.readSupport.isInstanceOf[BaseStreamingSource] => - uniqueStreamingExecLeavesMap.put(s, s) - case _ => - } - - val sourceToInputRowsTuples = - uniqueStreamingExecLeavesMap.values.asScala.map { execLeaf => - val numRows = execLeaf.metrics.get("numOutputRows").map(_.value).getOrElse(0L) - val source = execLeaf.readSupport.asInstanceOf[BaseStreamingSource] + val numRows = s.metrics.get("numOutputRows").map(_.value).getOrElse(0L) + val source = s.readSupport.asInstanceOf[BaseStreamingSource] source -> numRows - }.toSeq + } logDebug("Source -> # input rows\n\t" + sourceToInputRowsTuples.mkString("\n\t")) sumRows(sourceToInputRowsTuples) } else { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala index a39bb715c991..631a6eb649ff 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala @@ -30,6 +30,7 @@ import scala.util.control.NonFatal import com.google.common.util.concurrent.UncheckedExecutionException import org.apache.hadoop.fs.Path +import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.internal.Logging import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan @@ -282,7 +283,7 @@ abstract class StreamExecution( // `stop()` is already called. Let `finally` finish the cleanup. } } catch { - case e if isInterruptedByStop(e) => + case e if isInterruptedByStop(e, sparkSession.sparkContext) => // interrupted by stop() updateStatusMessage("Stopped") case e: IOException if e.getMessage != null @@ -354,9 +355,9 @@ abstract class StreamExecution( } } - private def isInterruptedByStop(e: Throwable): Boolean = { + private def isInterruptedByStop(e: Throwable, sc: SparkContext): Boolean = { if (state.get == TERMINATED) { - StreamExecution.isInterruptionException(e) + StreamExecution.isInterruptionException(e, sc) } else { false } @@ -529,8 +530,9 @@ abstract class StreamExecution( object StreamExecution { val QUERY_ID_KEY = "sql.streaming.queryId" + val IS_CONTINUOUS_PROCESSING = "__is_continuous_processing" - def isInterruptionException(e: Throwable): Boolean = e match { + def isInterruptionException(e: Throwable, sc: SparkContext): Boolean = e match { // InterruptedIOException - thrown when an I/O operation is interrupted // ClosedByInterruptException - thrown when an I/O operation upon a channel is interrupted case _: InterruptedException | _: InterruptedIOException | _: ClosedByInterruptException => @@ -545,7 +547,18 @@ object StreamExecution { // ExecutionException, such as BiFunction.apply case e2 @ (_: UncheckedIOException | _: ExecutionException | _: UncheckedExecutionException) if e2.getCause != null => - isInterruptionException(e2.getCause) + isInterruptionException(e2.getCause, sc) + case se: SparkException => + val jobGroup = sc.getLocalProperty("spark.jobGroup.id") + if (jobGroup == null) return false + val errorMsg = se.getMessage + if (errorMsg.contains("cancelled") && errorMsg.contains(jobGroup) && se.getCause == null) { + true + } else if (se.getCause != null) { + isInterruptionException(se.getCause, sc) + } else { + false + } case _ => false } diff 
--git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/WatermarkTracker.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/WatermarkTracker.scala index 7b30db44a209..76ab1284633b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/WatermarkTracker.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/WatermarkTracker.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.execution.streaming +import java.util.Locale + import scala.collection.mutable import org.apache.spark.internal.Logging @@ -36,7 +38,7 @@ object MultipleWatermarkPolicy { val DEFAULT_POLICY_NAME = "min" def apply(policyName: String): MultipleWatermarkPolicy = { - policyName.toLowerCase match { + policyName.toLowerCase(Locale.ROOT) match { case DEFAULT_POLICY_NAME => MinWatermark case "max" => MaxWatermark case _ => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala index 4ddebb33b79d..f009c52449ad 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala @@ -209,6 +209,8 @@ class ContinuousExecution( scan.readSupport.asInstanceOf[ContinuousReadSupport] -> scan.scanConfig }.head + sparkSessionForQuery.sparkContext.setLocalProperty( + StreamExecution.IS_CONTINUOUS_PROCESSING, true.toString) sparkSessionForQuery.sparkContext.setLocalProperty( ContinuousExecution.START_EPOCH_KEY, currentBatchId.toString) // Add another random ID on top of the run ID, to distinguish epoch coordinators across @@ -263,8 +265,8 @@ class ContinuousExecution( sparkSessionForQuery, lastExecution)(lastExecution.toRdd) } } catch { - case t: Throwable - if StreamExecution.isInterruptionException(t) && state.get() == RECONFIGURING => + case t: Throwable if StreamExecution.isInterruptionException(t, sparkSession.sparkContext) && + state.get() == RECONFIGURING => logInfo(s"Query $id ignoring exception from reconfiguring: $t") // interrupted by reconfiguration - swallow exception so we can restart the query } finally { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/WriteToContinuousDataSourceExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/WriteToContinuousDataSourceExec.scala index c216b6138385..a797ac1879f4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/WriteToContinuousDataSourceExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/WriteToContinuousDataSourceExec.scala @@ -57,7 +57,7 @@ case class WriteToContinuousDataSourceExec(writeSupport: StreamingWriteSupport, case cause: Throwable => cause match { // Do not wrap interruption exceptions that will be handled by streaming specially. - case _ if StreamExecution.isInterruptionException(cause) => throw cause + case _ if StreamExecution.isInterruptionException(cause, sparkContext) => throw cause // Only wrap non fatal exceptions. 
case NonFatal(e) => throw new SparkException("Writing job aborted.", e) case _ => throw cause diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreRDD.scala index 3f11b8f79943..4a69a48fed75 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreRDD.scala @@ -23,6 +23,7 @@ import scala.reflect.ClassTag import org.apache.spark.{Partition, TaskContext} import org.apache.spark.rdd.RDD +import org.apache.spark.sql.execution.streaming.StreamExecution import org.apache.spark.sql.execution.streaming.continuous.EpochTracker import org.apache.spark.sql.internal.SessionState import org.apache.spark.sql.types.StructType @@ -74,9 +75,14 @@ class StateStoreRDD[T: ClassTag, U: ClassTag]( // If we're in continuous processing mode, we should get the store version for the current // epoch rather than the one at planning time. - val currentVersion = EpochTracker.getCurrentEpoch match { - case None => storeVersion - case Some(value) => value + val isContinuous = Option(ctxt.getLocalProperty(StreamExecution.IS_CONTINUOUS_PROCESSING)) + .map(_.toBoolean).getOrElse(false) + val currentVersion = if (isContinuous) { + val epoch = EpochTracker.getCurrentEpoch + assert(epoch.isDefined, "Current epoch must be defined for continuous processing streams.") + epoch.get + } else { + storeVersion } store = StateStore.get( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManager.scala index 352b3d3616fb..43f22803e768 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManager.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.execution.streaming.state +import java.util.Locale + import org.apache.hadoop.conf.Configuration import org.apache.spark.TaskContext @@ -263,7 +265,7 @@ class SymmetricHashJoinStateManager( def metrics: StateStoreMetrics = { val keyToNumValuesMetrics = keyToNumValues.metrics val keyWithIndexToValueMetrics = keyWithIndexToValue.metrics - def newDesc(desc: String): String = s"${joinSide.toString.toUpperCase}: $desc" + def newDesc(desc: String): String = s"${joinSide.toString.toUpperCase(Locale.ROOT)}: $desc" StateStoreMetrics( keyWithIndexToValueMetrics.numKeys, // represent each buffered row only once diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/subquery.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/subquery.scala index d11045fb6ac8..310ebcdf6768 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/subquery.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/subquery.scala @@ -90,49 +90,6 @@ case class ScalarSubquery( } } -/** - * A subquery that will check the value of `child` whether is in the result of a query or not. 
- */ -case class InSubquery( - child: Expression, - plan: SubqueryExec, - exprId: ExprId, - private var result: Array[Any] = null, - private var updated: Boolean = false) extends ExecSubqueryExpression { - - override def dataType: DataType = BooleanType - override def children: Seq[Expression] = child :: Nil - override def nullable: Boolean = child.nullable - override def toString: String = s"$child IN ${plan.name}" - override def withNewPlan(plan: SubqueryExec): InSubquery = copy(plan = plan) - - override def semanticEquals(other: Expression): Boolean = other match { - case in: InSubquery => child.semanticEquals(in.child) && plan.sameResult(in.plan) - case _ => false - } - - def updateResult(): Unit = { - val rows = plan.executeCollect() - result = rows.map(_.get(0, child.dataType)).asInstanceOf[Array[Any]] - updated = true - } - - override def eval(input: InternalRow): Any = { - require(updated, s"$this has not finished") - val v = child.eval(input) - if (v == null) { - null - } else { - result.contains(v) - } - } - - override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - require(updated, s"$this has not finished") - InSet(child, result.toSet).doGenCode(ctx, ev) - } -} - /** * Plans scalar subqueries from that are present in the given [[SparkPlan]]. */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/AllExecutionsPage.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/AllExecutionsPage.scala index a7a24ac3641b..1b2d8a821b36 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/AllExecutionsPage.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/AllExecutionsPage.scala @@ -55,24 +55,57 @@ private[ui] class AllExecutionsPage(parent: SQLTab) extends WebUIPage("") with L val _content = mutable.ListBuffer[Node]() if (running.nonEmpty) { + val runningPageTable = new RunningExecutionTable( + parent, currentTime, running.sortBy(_.submissionTime).reverse).toNodeSeq(request) + _content ++= - new RunningExecutionTable( - parent, s"Running Queries (${running.size})", currentTime, - running.sortBy(_.submissionTime).reverse).toNodeSeq(request) + +

+        <span class="collapse-aggregated-runningExecutions collapse-table"
+            onClick="collapseTable('collapse-aggregated-runningExecutions',
+              'aggregated-runningExecutions')">
+          <h4>
+            <span class="collapse-table-arrow arrow-open"></span>
+            <a>Running Queries ({running.size})</a>
+          </h4>
+        </span> ++
+        <div class="aggregated-runningExecutions collapsible-table">
+          {runningPageTable}
+        </div>
    } if (completed.nonEmpty) { + val completedPageTable = new CompletedExecutionTable( + parent, currentTime, completed.sortBy(_.submissionTime).reverse).toNodeSeq(request) + _content ++= - new CompletedExecutionTable( - parent, s"Completed Queries (${completed.size})", currentTime, - completed.sortBy(_.submissionTime).reverse).toNodeSeq(request) + +

+        <span class="collapse-aggregated-completedExecutions collapse-table"
+            onClick="collapseTable('collapse-aggregated-completedExecutions',
+              'aggregated-completedExecutions')">
+          <h4>
+            <span class="collapse-table-arrow arrow-open"></span>
+            <a>Completed Queries ({completed.size})</a>
+          </h4>
+        </span> ++
+        <div class="aggregated-completedExecutions collapsible-table">
+          {completedPageTable}
+        </div>
    } if (failed.nonEmpty) { + val failedPageTable = new FailedExecutionTable( + parent, currentTime, failed.sortBy(_.submissionTime).reverse).toNodeSeq(request) + _content ++= - new FailedExecutionTable( - parent, s"Failed Queries (${failed.size})", currentTime, - failed.sortBy(_.submissionTime).reverse).toNodeSeq(request) + +

+        <span class="collapse-aggregated-failedExecutions collapse-table"
+            onClick="collapseTable('collapse-aggregated-failedExecutions',
+              'aggregated-failedExecutions')">
+          <h4>
+            <span class="collapse-table-arrow arrow-open"></span>
+            <a>Failed Queries ({failed.size})</a>
+          </h4>
+        </span> ++
+        <div class="aggregated-failedExecutions collapsible-table">
+          {failedPageTable}
+        </div>
    } _content } @@ -118,7 +151,6 @@ private[ui] class AllExecutionsPage(parent: SQLTab) extends WebUIPage("") with L private[ui] abstract class ExecutionTable( parent: SQLTab, tableId: String, - tableName: String, currentTime: Long, executionUIDatas: Seq[SQLExecutionUIData], showRunningJobs: Boolean, @@ -206,11 +238,8 @@ private[ui] abstract class ExecutionTable( } def toNodeSeq(request: HttpServletRequest): Seq[Node] = { -
-    <div>
-      <h4>{tableName}</h4>
-      {UIUtils.listingTable[SQLExecutionUIData](
-        header, row(request, currentTime, _), executionUIDatas, id = Some(tableId))}
-    </div>
    + UIUtils.listingTable[SQLExecutionUIData]( + header, row(request, currentTime, _), executionUIDatas, id = Some(tableId)) } private def jobURL(request: HttpServletRequest, jobId: Long): String = @@ -223,13 +252,11 @@ private[ui] abstract class ExecutionTable( private[ui] class RunningExecutionTable( parent: SQLTab, - tableName: String, currentTime: Long, executionUIDatas: Seq[SQLExecutionUIData]) extends ExecutionTable( parent, "running-execution-table", - tableName, currentTime, executionUIDatas, showRunningJobs = true, @@ -242,13 +269,11 @@ private[ui] class RunningExecutionTable( private[ui] class CompletedExecutionTable( parent: SQLTab, - tableName: String, currentTime: Long, executionUIDatas: Seq[SQLExecutionUIData]) extends ExecutionTable( parent, "completed-execution-table", - tableName, currentTime, executionUIDatas, showRunningJobs = false, @@ -260,13 +285,11 @@ private[ui] class CompletedExecutionTable( private[ui] class FailedExecutionTable( parent: SQLTab, - tableName: String, currentTime: Long, executionUIDatas: Seq[SQLExecutionUIData]) extends ExecutionTable( parent, "failed-execution-table", - tableName, currentTime, executionUIDatas, showRunningJobs = false, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala index d254af400a7c..1199eeca959d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala @@ -81,9 +81,9 @@ class SQLAppStatusListener( // Record the accumulator IDs for the stages of this job, so that the code that keeps // track of the metrics knows which accumulators to look at. - val accumIds = exec.metrics.map(_.accumulatorId).sorted.toList + val accumIds = exec.metrics.map(_.accumulatorId).toSet event.stageIds.foreach { id => - stageMetrics.put(id, new LiveStageMetrics(id, 0, accumIds.toArray, new ConcurrentHashMap())) + stageMetrics.put(id, new LiveStageMetrics(id, 0, accumIds, new ConcurrentHashMap())) } exec.jobs = exec.jobs + (jobId -> JobExecutionStatus.RUNNING) @@ -382,7 +382,7 @@ private class LiveExecutionData(val executionId: Long) extends LiveEntity { private class LiveStageMetrics( val stageId: Int, var attemptId: Int, - val accumulatorIds: Array[Long], + val accumulatorIds: Set[Long], val taskMetrics: ConcurrentHashMap[Long, LiveTaskMetrics]) private class LiveTaskMetrics( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 10b67d7a1ca5..4247d3110f1e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -2546,15 +2546,39 @@ object functions { def soundex(e: Column): Column = withExpr { SoundEx(e.expr) } /** - * Splits str around pattern (pattern is a regular expression). + * Splits str around matches of the given regex. * - * @note Pattern is a string representation of the regular expression. + * @param str a string expression to split + * @param regex a string representing a regular expression. The regex string should be + * a Java regular expression. 
* * @group string_funcs * @since 1.5.0 */ - def split(str: Column, pattern: String): Column = withExpr { - StringSplit(str.expr, lit(pattern).expr) + def split(str: Column, regex: String): Column = withExpr { + StringSplit(str.expr, Literal(regex), Literal(-1)) + } + + /** + * Splits str around matches of the given regex. + * + * @param str a string expression to split + * @param regex a string representing a regular expression. The regex string should be + * a Java regular expression. + * @param limit an integer expression which controls the number of times the regex is applied. + *
+ *        <ul>
+ *          <li>limit greater than 0: The resulting array's length will not be more than limit,
+ *          and the resulting array's last entry will contain all input beyond the last
+ *          matched regex.</li>
+ *          <li>limit less than or equal to 0: `regex` will be applied as many times as
+ *          possible, and the resulting array can be of any size.</li>
+ *        </ul>
    + * + * @group string_funcs + * @since 3.0.0 + */ + def split(str: Column, regex: String, limit: Int): Column = withExpr { + StringSplit(str.expr, Literal(regex), Literal(limit)) } /** @@ -3611,6 +3635,21 @@ object functions { */ def schema_of_json(e: Column): Column = withExpr(new SchemaOfJson(e.expr)) + /** + * Parses a column containing a JSON string and infers its schema using options. + * + * @param e a string column containing JSON data. + * @param options options to control how the json is parsed. accepts the same options and the + * json data source. See [[DataFrameReader#json]]. + * @return a column with string literal containing schema in DDL format. + * + * @group collection_funcs + * @since 3.0.0 + */ + def schema_of_json(e: Column, options: java.util.Map[String, String]): Column = { + withExpr(SchemaOfJson(e.expr, options.asScala.toMap)) + } + /** * (Scala-specific) Converts a column containing a `StructType`, `ArrayType` or * a `MapType` into a JSON string with the specified schema. @@ -3619,6 +3658,8 @@ object functions { * @param e a column containing a struct, an array or a map. * @param options options to control how the struct column is converted into a json string. * accepts the same options and the json data source. + * Additionally the function supports the `pretty` option which enables + * pretty JSON generation. * * @group collection_funcs * @since 2.1.0 @@ -3635,6 +3676,8 @@ object functions { * @param e a column containing a struct, an array or a map. * @param options options to control how the struct column is converted into a json string. * accepts the same options and the json data source. + * Additionally the function supports the `pretty` option which enables + * pretty JSON generation. * * @group collection_funcs * @since 2.1.0 diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala index 39e9e1ad426b..4c7dcedafeea 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala @@ -26,6 +26,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, DataFrame, Dataset, SparkSession} import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.execution.datasources.DataSource +import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Utils import org.apache.spark.sql.execution.streaming.{StreamingRelation, StreamingRelationV2} import org.apache.spark.sql.sources.StreamSourceProvider import org.apache.spark.sql.sources.v2.{ContinuousReadSupportProvider, DataSourceOptions, MicroBatchReadSupportProvider} @@ -158,7 +159,6 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo } val ds = DataSource.lookupDataSource(source, sparkSession.sqlContext.conf).newInstance() - val options = new DataSourceOptions(extraOptions.asJava) // We need to generate the V1 data source so we can pass it to the V2 relation as a shim. // We can't be sure at this point whether we'll actually want to use V2, since we don't know the // writer or whether the query is continuous. 
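For illustration only (not part of the patch; the data, object name, and expected outputs are made up), a sketch exercising the functions added or extended above — split with a limit, schema_of_json with reader options, and to_json with the `pretty` option:

    import scala.collection.JavaConverters._

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.functions.{lit, schema_of_json, split, struct, to_json}

    object FunctionAdditionsSketch {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().master("local[*]").appName("functions-sketch").getOrCreate()
        import spark.implicits._

        val df = Seq(("aa1cc2ee3", 42L)).toDF("s", "id")

        // limit = 2: at most two entries; the last entry keeps everything after the first match.
        df.select(split($"s", "[1-9]+", 2).as("parts")).show(false)   // [aa, cc2ee3]

        // Infer a schema from a JSON literal, honoring JSON reader options.
        val opts = Map("primitivesAsString" -> "true").asJava
        df.select(schema_of_json(lit("""{"c1": 1}"""), opts)).show(false)   // struct<c1:string>

        // Pretty-printed JSON output via the `pretty` option.
        df.select(to_json(struct($"id"), Map("pretty" -> "true"))).show(false)

        spark.stop()
      }
    }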
@@ -173,13 +173,18 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo } ds match { case s: MicroBatchReadSupportProvider => + val sessionOptions = DataSourceV2Utils.extractSessionConfigs( + ds = s, conf = sparkSession.sessionState.conf) + val options = sessionOptions ++ extraOptions + val dataSourceOptions = new DataSourceOptions(options.asJava) var tempReadSupport: MicroBatchReadSupport = null val schema = try { val tmpCheckpointPath = Utils.createTempDir(namePrefix = s"tempCP").getCanonicalPath tempReadSupport = if (userSpecifiedSchema.isDefined) { - s.createMicroBatchReadSupport(userSpecifiedSchema.get, tmpCheckpointPath, options) + s.createMicroBatchReadSupport( + userSpecifiedSchema.get, tmpCheckpointPath, dataSourceOptions) } else { - s.createMicroBatchReadSupport(tmpCheckpointPath, options) + s.createMicroBatchReadSupport(tmpCheckpointPath, dataSourceOptions) } tempReadSupport.fullSchema() } finally { @@ -192,16 +197,21 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo Dataset.ofRows( sparkSession, StreamingRelationV2( - s, source, extraOptions.toMap, + s, source, options, schema.toAttributes, v1Relation)(sparkSession)) case s: ContinuousReadSupportProvider => + val sessionOptions = DataSourceV2Utils.extractSessionConfigs( + ds = s, conf = sparkSession.sessionState.conf) + val options = sessionOptions ++ extraOptions + val dataSourceOptions = new DataSourceOptions(options.asJava) var tempReadSupport: ContinuousReadSupport = null val schema = try { val tmpCheckpointPath = Utils.createTempDir(namePrefix = s"tempCP").getCanonicalPath tempReadSupport = if (userSpecifiedSchema.isDefined) { - s.createContinuousReadSupport(userSpecifiedSchema.get, tmpCheckpointPath, options) + s.createContinuousReadSupport( + userSpecifiedSchema.get, tmpCheckpointPath, dataSourceOptions) } else { - s.createContinuousReadSupport(tmpCheckpointPath, options) + s.createContinuousReadSupport(tmpCheckpointPath, dataSourceOptions) } tempReadSupport.fullSchema() } finally { @@ -214,7 +224,7 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo Dataset.ofRows( sparkSession, StreamingRelationV2( - s, source, extraOptions.toMap, + s, source, options, schema.toAttributes, v1Relation)(sparkSession)) case _ => // Code path for data source v1. @@ -327,6 +337,7 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo * whitespaces from values being read should be skipped.
 * <li>`nullValue` (default empty string): sets the string representation of a null value. Since
 * 2.0.1, this applies to all supported types including the string type.</li>
+ * <li>`emptyValue` (default empty string): sets the string representation of an empty value.</li>
 * <li>`nanValue` (default `NaN`): sets the string representation of a non-number" value.</li>
 * <li>`positiveInf` (default `Inf`): sets the string representation of a positive infinity
 * value.</li>
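For illustration only (not part of the patch; the path, schema, and marker string are made up), a sketch of the `emptyValue` read option documented above on a streaming CSV source:

    import org.apache.spark.sql.SparkSession

    object EmptyValueOptionSketch {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().master("local[*]").appName("empty-value-sketch").getOrCreate()

        val csvStream = spark.readStream
          .schema("name STRING, note STRING")
          .option("emptyValue", "(empty)")   // the string the CSV source treats as an empty value
          .csv("/tmp/streaming-input")       // illustrative input directory

        val query = csvStream.writeStream
          .format("console")
          .start()

        query.awaitTermination(5000L)
        query.stop()
        spark.stop()
      }
    }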
  • diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala index 7866e4f70f14..b23e86a78645 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala @@ -27,6 +27,7 @@ import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.streaming.InternalOutputModes import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.execution.datasources.DataSource +import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Utils import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.continuous.ContinuousTrigger import org.apache.spark.sql.execution.streaming.sources._ @@ -298,23 +299,28 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) { } else { val ds = DataSource.lookupDataSource(source, df.sparkSession.sessionState.conf) val disabledSources = df.sparkSession.sqlContext.conf.disabledV2StreamingWriters.split(",") + var options = extraOptions.toMap val sink = ds.newInstance() match { case w: StreamingWriteSupportProvider - if !disabledSources.contains(w.getClass.getCanonicalName) => w + if !disabledSources.contains(w.getClass.getCanonicalName) => + val sessionOptions = DataSourceV2Utils.extractSessionConfigs( + w, df.sparkSession.sessionState.conf) + options = sessionOptions ++ extraOptions + w case _ => val ds = DataSource( df.sparkSession, className = source, - options = extraOptions.toMap, + options = options, partitionColumns = normalizedParCols.getOrElse(Nil)) ds.createSink(outputMode) } df.sparkSession.sessionState.streamingQueryManager.startQuery( - extraOptions.get("queryName"), - extraOptions.get("checkpointLocation"), + options.get("queryName"), + options.get("checkpointLocation"), df, - extraOptions.toMap, + options, sink, outputMode, useTempCheckpointLocation = source == "console", @@ -374,7 +380,7 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) { * @since 2.4.0 */ @InterfaceStability.Evolving - def foreachBatch(function: VoidFunction2[Dataset[T], Long]): DataStreamWriter[T] = { + def foreachBatch(function: VoidFunction2[Dataset[T], java.lang.Long]): DataStreamWriter[T] = { foreachBatch((batchDs: Dataset[T], batchId: Long) => function.call(batchDs, batchId)) } diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java index 69a2904f5f3f..a05afa4f6ba3 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java @@ -134,6 +134,8 @@ public static class Bean implements Serializable { private Map c = ImmutableMap.of("hello", new int[] { 1, 2 }); private List d = Arrays.asList("floppy", "disk"); private BigInteger e = new BigInteger("1234567"); + private NestedBean f = new NestedBean(); + private NestedBean g = null; public double getA() { return a; @@ -152,6 +154,22 @@ public List getD() { } public BigInteger getE() { return e; } + + public NestedBean getF() { + return f; + } + + public NestedBean getG() { + return g; + } + + public static class NestedBean implements Serializable { + private int a = 1; + + public int getA() { + return a; + } + } } void validateDataFrameWithBeans(Bean bean, Dataset df) { @@ -171,7 +189,14 @@ void validateDataFrameWithBeans(Bean 
bean, Dataset df) { schema.apply("d")); Assert.assertEquals(new StructField("e", DataTypes.createDecimalType(38,0), true, Metadata.empty()), schema.apply("e")); - Row first = df.select("a", "b", "c", "d", "e").first(); + StructType nestedBeanType = + DataTypes.createStructType(Collections.singletonList(new StructField( + "a", IntegerType$.MODULE$, false, Metadata.empty()))); + Assert.assertEquals(new StructField("f", nestedBeanType, true, Metadata.empty()), + schema.apply("f")); + Assert.assertEquals(new StructField("g", nestedBeanType, true, Metadata.empty()), + schema.apply("g")); + Row first = df.select("a", "b", "c", "d", "e", "f", "g").first(); Assert.assertEquals(bean.getA(), first.getDouble(0), 0.0); // Now Java lists and maps are converted to Scala Seq's and Map's. Once we get a Seq below, // verify that it has the expected length, and contains expected elements. @@ -192,6 +217,9 @@ void validateDataFrameWithBeans(Bean bean, Dataset df) { } // Java.math.BigInteger is equivalent to Spark Decimal(38,0) Assert.assertEquals(new BigDecimal(bean.getE()), first.getDecimal(4)); + Row nested = first.getStruct(5); + Assert.assertEquals(bean.getF().getA(), nested.getInt(0)); + Assert.assertTrue(first.isNullAt(6)); } @Test @@ -290,6 +318,17 @@ public void testSampleBy() { Assert.assertTrue(2 <= actual.get(1).getLong(1) && actual.get(1).getLong(1) <= 13); } + @Test + public void testSampleByColumn() { + Dataset df = spark.range(0, 100, 1, 2).select(col("id").mod(3).as("key")); + Dataset sampled = df.stat().sampleBy(col("key"), ImmutableMap.of(0, 0.1, 1, 0.2), 0L); + List actual = sampled.groupBy("key").count().orderBy("key").collectAsList(); + Assert.assertEquals(0, actual.get(0).getLong(0)); + Assert.assertTrue(0 <= actual.get(0).getLong(1) && actual.get(0).getLong(1) <= 8); + Assert.assertEquals(1, actual.get(1).getLong(0)); + Assert.assertTrue(2 <= actual.get(1).getLong(1) && actual.get(1).getLong(1) <= 13); + } + @Test public void pivot() { Dataset df = spark.table("courseSales"); @@ -306,6 +345,22 @@ public void pivot() { Assert.assertEquals(30000.0, actual.get(1).getDouble(2), 0.01); } + @Test + public void pivotColumnValues() { + Dataset df = spark.table("courseSales"); + List actual = df.groupBy("year") + .pivot(col("course"), Arrays.asList(lit("dotNET"), lit("Java"))) + .agg(sum("earnings")).orderBy("year").collectAsList(); + + Assert.assertEquals(2012, actual.get(0).getInt(0)); + Assert.assertEquals(15000.0, actual.get(0).getDouble(1), 0.01); + Assert.assertEquals(20000.0, actual.get(0).getDouble(2), 0.01); + + Assert.assertEquals(2013, actual.get(1).getInt(0)); + Assert.assertEquals(48000.0, actual.get(1).getDouble(1), 0.01); + Assert.assertEquals(30000.0, actual.get(1).getDouble(2), 0.01); + } + private String getResource(String resource) { try { // The following "getResource" has different behaviors in SBT and Maven. diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/streaming/JavaDataStreamReaderWriterSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/streaming/JavaDataStreamReaderWriterSuite.java new file mode 100644 index 000000000000..48cdb2642d83 --- /dev/null +++ b/sql/core/src/test/java/test/org/apache/spark/sql/streaming/JavaDataStreamReaderWriterSuite.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package test.org.apache.spark.sql.streaming; + +import java.io.File; + +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import org.apache.spark.api.java.function.VoidFunction2; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.ForeachWriter; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.streaming.StreamingQuery; +import org.apache.spark.sql.test.TestSparkSession; +import org.apache.spark.util.Utils; + +public class JavaDataStreamReaderWriterSuite { + private SparkSession spark; + private String input; + + @Before + public void setUp() { + spark = new TestSparkSession(); + input = Utils.createTempDir(System.getProperty("java.io.tmpdir"), "input").toString(); + } + + @After + public void tearDown() { + try { + Utils.deleteRecursively(new File(input)); + } finally { + spark.stop(); + spark = null; + } + } + + @Test + public void testForeachBatchAPI() { + StreamingQuery query = spark + .readStream() + .textFile(input) + .writeStream() + .foreachBatch(new VoidFunction2, Long>() { + @Override + public void call(Dataset v1, Long v2) throws Exception {} + }) + .start(); + query.stop(); + } + + @Test + public void testForeachAPI() { + StreamingQuery query = spark + .readStream() + .textFile(input) + .writeStream() + .foreach(new ForeachWriter() { + @Override + public boolean open(long partitionId, long epochId) { + return true; + } + + @Override + public void process(String value) {} + + @Override + public void close(Throwable errorOrNull) {} + }) + .start(); + query.stop(); + } +} diff --git a/sql/core/src/test/resources/sql-tests/inputs/datetime.sql b/sql/core/src/test/resources/sql-tests/inputs/datetime.sql index 4950a4b7a4e5..547c2bef02b2 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/datetime.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/datetime.sql @@ -27,36 +27,3 @@ select current_date = current_date(), current_timestamp = current_timestamp(), a select a, b from ttf2 order by a, current_date; select weekday('2007-02-03'), weekday('2009-07-30'), weekday('2017-05-27'), weekday(null), weekday('1582-10-15 13:10:15'); - -select from_utc_timestamp('2015-07-24 00:00:00', 'PST'); - -select from_utc_timestamp('2015-01-24 00:00:00', 'PST'); - -select from_utc_timestamp(null, 'PST'); - -select from_utc_timestamp('2015-07-24 00:00:00', null); - -select from_utc_timestamp(null, null); - -select from_utc_timestamp(cast(0 as timestamp), 'PST'); - -select from_utc_timestamp(cast('2015-01-24' as date), 'PST'); - -select to_utc_timestamp('2015-07-24 00:00:00', 'PST'); - -select to_utc_timestamp('2015-01-24 00:00:00', 'PST'); - -select to_utc_timestamp(null, 'PST'); - -select to_utc_timestamp('2015-07-24 00:00:00', null); - -select to_utc_timestamp(null, null); - -select to_utc_timestamp(cast(0 as timestamp), 'PST'); - -select to_utc_timestamp(cast('2015-01-24' as date), 'PST'); - --- SPARK-23715: the input of to/from_utc_timestamp can 
not have timezone -select from_utc_timestamp('2000-10-10 00:00:00+00:00', 'PST'); - -select to_utc_timestamp('2000-10-10 00:00:00+00:00', 'PST'); diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-by-ordinal.sql b/sql/core/src/test/resources/sql-tests/inputs/group-by-ordinal.sql index 928f766b4add..3144833b608b 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/group-by-ordinal.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/group-by-ordinal.sql @@ -38,7 +38,9 @@ select a, b, sum(b) from data group by 3; select a, b, sum(b) + 2 from data group by 3; -- negative case: nondeterministic expression -select a, rand(0), sum(b) from data group by a, 2; +select a, rand(0), sum(b) +from +(select /*+ REPARTITION(1) */ a, b from data) group by a, 2; -- negative case: star select * from data group by a, b, 1; diff --git a/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql index 0f22c0eeed58..bdd1fe4074f3 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql @@ -56,3 +56,7 @@ select from_json('[{"a": 1}, 2]', 'array>'); select to_json(array('1', '2', '3')); select to_json(array(array(1, 2, 3), array(4))); +-- infer schema of json literal using options +select schema_of_json('{"c1":1}', map('primitivesAsString', 'true')); +select schema_of_json('{"c1":01, "c2":0.1}', map('allowNumericLeadingZeros', 'true', 'prefersDecimal', 'true')); + diff --git a/sql/core/src/test/resources/sql-tests/inputs/limit.sql b/sql/core/src/test/resources/sql-tests/inputs/limit.sql index e33cd819f281..b4c73cf33e53 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/limit.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/limit.sql @@ -1,5 +1,3 @@ --- Disable global limit parallel -set spark.sql.limit.flatGlobalLimit=false; -- limit on various data types SELECT * FROM testdata LIMIT 2; diff --git a/sql/core/src/test/resources/sql-tests/inputs/operator-div.sql b/sql/core/src/test/resources/sql-tests/inputs/operator-div.sql new file mode 100644 index 000000000000..6e1c1bded904 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/operator-div.sql @@ -0,0 +1,14 @@ +set spark.sql.legacy.integralDivide.returnBigint=true; + +select 5 div 2; +select 5 div 0; +select 5 div null; +select null div 5; + +set spark.sql.legacy.integralDivide.returnBigint=false; + +select 5 div 2; +select 5 div 0; +select 5 div null; +select null div 5; + diff --git a/sql/core/src/test/resources/sql-tests/inputs/operators.sql b/sql/core/src/test/resources/sql-tests/inputs/operators.sql index 15d981985c55..37f9cd44da7f 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/operators.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/operators.sql @@ -16,15 +16,11 @@ select + + 100; select - - max(key) from testdata; select + - key from testdata where key = 33; --- div +-- division select 5 / 2; select 5 / 0; select 5 / null; select null / 5; -select 5 div 2; -select 5 div 0; -select 5 div null; -select null div 5; -- other arithmetics select 1 + 2; diff --git a/sql/core/src/test/resources/sql-tests/inputs/pivot.sql b/sql/core/src/test/resources/sql-tests/inputs/pivot.sql index 1f607b334dc1..c2ecd97e2b02 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/pivot.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/pivot.sql @@ -287,3 +287,14 @@ PIVOT ( sum(earnings) FOR (course, m) IN (('dotNET', map('1', 1)), ('Java', map('2', 
2))) ); + +-- grouping columns output in the same order as input +-- correctly handle pivot columns with different cases +SELECT * FROM ( + SELECT course, earnings, "a" as a, "z" as z, "b" as b, "y" as y, "c" as c, "x" as x, "d" as d, "w" as w + FROM courseSales +) +PIVOT ( + sum(Earnings) + FOR Course IN ('dotNET', 'Java') +); diff --git a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql index 4113734e1707..2effb43183d7 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql @@ -46,4 +46,8 @@ FROM ( encode(string(id + 2), 'utf-8') col3, encode(string(id + 3), 'utf-8') col4 FROM range(10) -) +); + +-- split function +SELECT split('aa1cc2ee3', '[1-9]+'); +SELECT split('aa1cc2ee3', '[1-9]+', 2); diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-limit.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-limit.sql index a862e0985b20..a40ee082ba3b 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-limit.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-limit.sql @@ -1,9 +1,6 @@ -- A test suite for IN LIMIT in parent side, subquery, and both predicate subquery -- It includes correlated cases. --- Disable global limit optimization -set spark.sql.limit.flatGlobalLimit=false; - create temporary view t1 as select * from values ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:00:00.000', date '2014-04-04'), ("val1b", 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), @@ -100,4 +97,4 @@ WHERE t1d NOT IN (SELECT t2d LIMIT 1) GROUP BY t1b ORDER BY t1b NULLS last -LIMIT 1; +LIMIT 1; \ No newline at end of file diff --git a/sql/core/src/test/resources/sql-tests/inputs/udaf-regrfunctions.sql b/sql/core/src/test/resources/sql-tests/inputs/udaf-regrfunctions.sql deleted file mode 100644 index 92c7e26e3add..000000000000 --- a/sql/core/src/test/resources/sql-tests/inputs/udaf-regrfunctions.sql +++ /dev/null @@ -1,56 +0,0 @@ --- --- Licensed to the Apache Software Foundation (ASF) under one or more --- contributor license agreements. See the NOTICE file distributed with --- this work for additional information regarding copyright ownership. --- The ASF licenses this file to You under the Apache License, Version 2.0 --- (the "License"); you may not use this file except in compliance with --- the License. You may obtain a copy of the License at --- --- http://www.apache.org/licenses/LICENSE-2.0 --- --- Unless required by applicable law or agreed to in writing, software --- distributed under the License is distributed on an "AS IS" BASIS, --- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. --- See the License for the specific language governing permissions and --- limitations under the License. 
--- - -CREATE OR REPLACE TEMPORARY VIEW t1 AS SELECT * FROM VALUES - (101, 1, 1, 1), - (201, 2, 1, 1), - (301, 3, 1, 1), - (401, 4, 1, 11), - (501, 5, 1, null), - (601, 6, null, 1), - (701, 6, null, null), - (102, 1, 2, 2), - (202, 2, 1, 2), - (302, 3, 2, 1), - (402, 4, 2, 12), - (502, 5, 2, null), - (602, 6, null, 2), - (702, 6, null, null), - (103, 1, 3, 3), - (203, 2, 1, 3), - (303, 3, 3, 1), - (403, 4, 3, 13), - (503, 5, 3, null), - (603, 6, null, 3), - (703, 6, null, null), - (104, 1, 4, 4), - (204, 2, 1, 4), - (304, 3, 4, 1), - (404, 4, 4, 14), - (504, 5, 4, null), - (604, 6, null, 4), - (704, 6, null, null), - (800, 7, 1, 1) -as t1(id, px, y, x); - -select px, var_pop(x), var_pop(y), corr(y,x), covar_samp(y,x), covar_pop(y,x), regr_count(y,x), - regr_slope(y,x), regr_intercept(y,x), regr_r2(y,x), regr_sxx(y,x), regr_syy(y,x), regr_sxy(y,x), - regr_avgx(y,x), regr_avgy(y,x), regr_count(y,x) -from t1 group by px order by px; - - -select id, regr_count(y,x) over (partition by px) from t1 order by id; diff --git a/sql/core/src/test/resources/sql-tests/results/datetime.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime.sql.out index 9eede305dbdc..63aa00426ea3 100644 --- a/sql/core/src/test/resources/sql-tests/results/datetime.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/datetime.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 26 +-- Number of queries: 10 -- !query 0 @@ -89,131 +89,3 @@ select weekday('2007-02-03'), weekday('2009-07-30'), weekday('2017-05-27'), week struct -- !query 9 output 5 3 5 NULL 4 - - --- !query 10 -select from_utc_timestamp('2015-07-24 00:00:00', 'PST') --- !query 10 schema -struct --- !query 10 output -2015-07-23 17:00:00 - - --- !query 11 -select from_utc_timestamp('2015-01-24 00:00:00', 'PST') --- !query 11 schema -struct --- !query 11 output -2015-01-23 16:00:00 - - --- !query 12 -select from_utc_timestamp(null, 'PST') --- !query 12 schema -struct --- !query 12 output -NULL - - --- !query 13 -select from_utc_timestamp('2015-07-24 00:00:00', null) --- !query 13 schema -struct --- !query 13 output -NULL - - --- !query 14 -select from_utc_timestamp(null, null) --- !query 14 schema -struct --- !query 14 output -NULL - - --- !query 15 -select from_utc_timestamp(cast(0 as timestamp), 'PST') --- !query 15 schema -struct --- !query 15 output -1969-12-31 08:00:00 - - --- !query 16 -select from_utc_timestamp(cast('2015-01-24' as date), 'PST') --- !query 16 schema -struct --- !query 16 output -2015-01-23 16:00:00 - - --- !query 17 -select to_utc_timestamp('2015-07-24 00:00:00', 'PST') --- !query 17 schema -struct --- !query 17 output -2015-07-24 07:00:00 - - --- !query 18 -select to_utc_timestamp('2015-01-24 00:00:00', 'PST') --- !query 18 schema -struct --- !query 18 output -2015-01-24 08:00:00 - - --- !query 19 -select to_utc_timestamp(null, 'PST') --- !query 19 schema -struct --- !query 19 output -NULL - - --- !query 20 -select to_utc_timestamp('2015-07-24 00:00:00', null) --- !query 20 schema -struct --- !query 20 output -NULL - - --- !query 21 -select to_utc_timestamp(null, null) --- !query 21 schema -struct --- !query 21 output -NULL - - --- !query 22 -select to_utc_timestamp(cast(0 as timestamp), 'PST') --- !query 22 schema -struct --- !query 22 output -1970-01-01 00:00:00 - - --- !query 23 -select to_utc_timestamp(cast('2015-01-24' as date), 'PST') --- !query 23 schema -struct --- !query 23 output -2015-01-24 08:00:00 - - --- !query 24 -select from_utc_timestamp('2000-10-10 
00:00:00+00:00', 'PST') --- !query 24 schema -struct --- !query 24 output -NULL - - --- !query 25 -select to_utc_timestamp('2000-10-10 00:00:00+00:00', 'PST') --- !query 25 schema -struct --- !query 25 output -NULL diff --git a/sql/core/src/test/resources/sql-tests/results/describe.sql.out b/sql/core/src/test/resources/sql-tests/results/describe.sql.out index 79390cb42444..9c4b70d1b1ab 100644 --- a/sql/core/src/test/resources/sql-tests/results/describe.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/describe.sql.out @@ -474,6 +474,7 @@ Last Access [not included in comparison] Created By [not included in comparison] Type VIEW View Text SELECT * FROM t +View Original Text SELECT * FROM t View Default Database default View Query Output Columns [a, b, c, d] Table Properties [view.query.out.col.3=d, view.query.out.col.0=a, view.query.out.numCols=4, view.default.database=default, view.query.out.col.1=b, view.query.out.col.2=c] @@ -497,6 +498,7 @@ Last Access [not included in comparison] Created By [not included in comparison] Type VIEW View Text SELECT * FROM t +View Original Text SELECT * FROM t View Default Database default View Query Output Columns [a, b, c, d] Table Properties [view.query.out.col.3=d, view.query.out.col.0=a, view.query.out.numCols=4, view.default.database=default, view.query.out.col.1=b, view.query.out.col.2=c] diff --git a/sql/core/src/test/resources/sql-tests/results/group-by-ordinal.sql.out b/sql/core/src/test/resources/sql-tests/results/group-by-ordinal.sql.out index 9ecbe19078dd..cf5add6a71af 100644 --- a/sql/core/src/test/resources/sql-tests/results/group-by-ordinal.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/group-by-ordinal.sql.out @@ -135,7 +135,9 @@ aggregate functions are not allowed in GROUP BY, but found (sum(CAST(data.`b` AS -- !query 13 -select a, rand(0), sum(b) from data group by a, 2 +select a, rand(0), sum(b) +from +(select /*+ REPARTITION(1) */ a, b from data) group by a, 2 -- !query 13 schema struct -- !query 13 output diff --git a/sql/core/src/test/resources/sql-tests/results/higher-order-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/higher-order-functions.sql.out index 32d20d1b7341..1b7c6f4f7625 100644 --- a/sql/core/src/test/resources/sql-tests/results/higher-order-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/higher-order-functions.sql.out @@ -201,6 +201,7 @@ struct<> -- !query 20 output + -- !query 21 select transform_keys(ys, (k, v) -> k) as v from nested -- !query 21 schema diff --git a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out index e550b43e08c2..77e900040114 100644 --- a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 40 +-- Number of queries: 42 -- !query 0 @@ -370,3 +370,19 @@ select to_json(array(array(1, 2, 3), array(4))) struct -- !query 39 output [[1,2,3],[4]] + + +-- !query 40 +select schema_of_json('{"c1":1}', map('primitivesAsString', 'true')) +-- !query 40 schema +struct +-- !query 40 output +struct + + +-- !query 41 +select schema_of_json('{"c1":01, "c2":0.1}', map('allowNumericLeadingZeros', 'true', 'prefersDecimal', 'true')) +-- !query 41 schema +struct +-- !query 41 output +struct diff --git a/sql/core/src/test/resources/sql-tests/results/limit.sql.out 
b/sql/core/src/test/resources/sql-tests/results/limit.sql.out index 187f3bd6858f..02fe1de84f75 100644 --- a/sql/core/src/test/resources/sql-tests/results/limit.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/limit.sql.out @@ -1,134 +1,126 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 15 +-- Number of queries: 14 -- !query 0 -set spark.sql.limit.flatGlobalLimit=false --- !query 0 schema -struct --- !query 0 output -spark.sql.limit.flatGlobalLimit false - - --- !query 1 SELECT * FROM testdata LIMIT 2 --- !query 1 schema +-- !query 0 schema struct --- !query 1 output +-- !query 0 output 1 1 2 2 --- !query 2 +-- !query 1 SELECT * FROM arraydata LIMIT 2 --- !query 2 schema +-- !query 1 schema struct,nestedarraycol:array>> --- !query 2 output +-- !query 1 output [1,2,3] [[1,2,3]] [2,3,4] [[2,3,4]] --- !query 3 +-- !query 2 SELECT * FROM mapdata LIMIT 2 --- !query 3 schema +-- !query 2 schema struct> --- !query 3 output +-- !query 2 output {1:"a1",2:"b1",3:"c1",4:"d1",5:"e1"} {1:"a2",2:"b2",3:"c2",4:"d2"} --- !query 4 +-- !query 3 SELECT * FROM testdata LIMIT 2 + 1 --- !query 4 schema +-- !query 3 schema struct --- !query 4 output +-- !query 3 output 1 1 2 2 3 3 --- !query 5 +-- !query 4 SELECT * FROM testdata LIMIT CAST(1 AS int) --- !query 5 schema +-- !query 4 schema struct --- !query 5 output +-- !query 4 output 1 1 --- !query 6 +-- !query 5 SELECT * FROM testdata LIMIT -1 --- !query 6 schema +-- !query 5 schema struct<> --- !query 6 output +-- !query 5 output org.apache.spark.sql.AnalysisException The limit expression must be equal to or greater than 0, but got -1; --- !query 7 +-- !query 6 SELECT * FROM testData TABLESAMPLE (-1 ROWS) --- !query 7 schema +-- !query 6 schema struct<> --- !query 7 output +-- !query 6 output org.apache.spark.sql.AnalysisException The limit expression must be equal to or greater than 0, but got -1; --- !query 8 +-- !query 7 SELECT * FROM testdata LIMIT CAST(1 AS INT) --- !query 8 schema +-- !query 7 schema struct --- !query 8 output +-- !query 7 output 1 1 --- !query 9 +-- !query 8 SELECT * FROM testdata LIMIT CAST(NULL AS INT) --- !query 9 schema +-- !query 8 schema struct<> --- !query 9 output +-- !query 8 output org.apache.spark.sql.AnalysisException The evaluated limit expression must not be null, but got CAST(NULL AS INT); --- !query 10 +-- !query 9 SELECT * FROM testdata LIMIT key > 3 --- !query 10 schema +-- !query 9 schema struct<> --- !query 10 output +-- !query 9 output org.apache.spark.sql.AnalysisException The limit expression must evaluate to a constant value, but got (testdata.`key` > 3); --- !query 11 +-- !query 10 SELECT * FROM testdata LIMIT true --- !query 11 schema +-- !query 10 schema struct<> --- !query 11 output +-- !query 10 output org.apache.spark.sql.AnalysisException The limit expression must be integer type, but got boolean; --- !query 12 +-- !query 11 SELECT * FROM testdata LIMIT 'a' --- !query 12 schema +-- !query 11 schema struct<> --- !query 12 output +-- !query 11 output org.apache.spark.sql.AnalysisException The limit expression must be integer type, but got string; --- !query 13 +-- !query 12 SELECT * FROM (SELECT * FROM range(10) LIMIT 5) WHERE id > 3 --- !query 13 schema +-- !query 12 schema struct --- !query 13 output +-- !query 12 output 4 --- !query 14 +-- !query 13 SELECT * FROM testdata WHERE key < 3 LIMIT ALL --- !query 14 schema +-- !query 13 schema struct --- !query 14 output +-- !query 13 output 1 1 2 2 diff --git 
a/sql/core/src/test/resources/sql-tests/results/operator-div.sql.out b/sql/core/src/test/resources/sql-tests/results/operator-div.sql.out new file mode 100644 index 000000000000..088b4d1c231f --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/operator-div.sql.out @@ -0,0 +1,82 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 10 + + +-- !query 0 +set spark.sql.legacy.integralDivide.returnBigint=true +-- !query 0 schema +struct +-- !query 0 output +spark.sql.legacy.integralDivide.returnBigint true + + +-- !query 1 +select 5 div 2 +-- !query 1 schema +struct<(5 div 2):bigint> +-- !query 1 output +2 + + +-- !query 2 +select 5 div 0 +-- !query 2 schema +struct<(5 div 0):bigint> +-- !query 2 output +NULL + + +-- !query 3 +select 5 div null +-- !query 3 schema +struct<(5 div CAST(NULL AS INT)):bigint> +-- !query 3 output +NULL + + +-- !query 4 +select null div 5 +-- !query 4 schema +struct<(CAST(NULL AS INT) div 5):bigint> +-- !query 4 output +NULL + + +-- !query 5 +set spark.sql.legacy.integralDivide.returnBigint=false +-- !query 5 schema +struct +-- !query 5 output +spark.sql.legacy.integralDivide.returnBigint false + + +-- !query 6 +select 5 div 2 +-- !query 6 schema +struct<(5 div 2):int> +-- !query 6 output +2 + + +-- !query 7 +select 5 div 0 +-- !query 7 schema +struct<(5 div 0):int> +-- !query 7 output +NULL + + +-- !query 8 +select 5 div null +-- !query 8 schema +struct<(5 div CAST(NULL AS INT)):int> +-- !query 8 output +NULL + + +-- !query 9 +select null div 5 +-- !query 9 schema +struct<(CAST(NULL AS INT) div 5):int> +-- !query 9 output +NULL diff --git a/sql/core/src/test/resources/sql-tests/results/operators.sql.out b/sql/core/src/test/resources/sql-tests/results/operators.sql.out index 840655b7a644..fd1d0db9e3f7 100644 --- a/sql/core/src/test/resources/sql-tests/results/operators.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/operators.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 59 +-- Number of queries: 55 -- !query 0 @@ -155,332 +155,300 @@ NULL -- !query 19 -select 5 div 2 --- !query 19 schema -struct --- !query 19 output -2 - - --- !query 20 -select 5 div 0 --- !query 20 schema -struct --- !query 20 output -NULL - - --- !query 21 -select 5 div null --- !query 21 schema -struct --- !query 21 output -NULL - - --- !query 22 -select null div 5 --- !query 22 schema -struct --- !query 22 output -NULL - - --- !query 23 select 1 + 2 --- !query 23 schema +-- !query 19 schema struct<(1 + 2):int> --- !query 23 output +-- !query 19 output 3 --- !query 24 +-- !query 20 select 1 - 2 --- !query 24 schema +-- !query 20 schema struct<(1 - 2):int> --- !query 24 output +-- !query 20 output -1 --- !query 25 +-- !query 21 select 2 * 5 --- !query 25 schema +-- !query 21 schema struct<(2 * 5):int> --- !query 25 output +-- !query 21 output 10 --- !query 26 +-- !query 22 select 5 % 3 --- !query 26 schema +-- !query 22 schema struct<(5 % 3):int> --- !query 26 output +-- !query 22 output 2 --- !query 27 +-- !query 23 select pmod(-7, 3) --- !query 27 schema +-- !query 23 schema struct --- !query 27 output +-- !query 23 output 2 --- !query 28 +-- !query 24 explain select 'a' || 1 + 2 --- !query 28 schema +-- !query 24 schema struct --- !query 28 output +-- !query 24 output == Physical Plan == *Project [null AS (CAST(concat(a, CAST(1 AS STRING)) AS DOUBLE) + CAST(2 AS DOUBLE))#x] +- Scan OneRowRelation[] --- !query 29 +-- !query 25 explain select 1 - 2 || 'b' --- !query 29 schema +-- !query 25 schema 
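Editor's note: the new operator-div.sql.out golden file above toggles the legacy integral-divide flag. A hedged sketch of the same round trip from Scala (config key copied from the golden file, session assumed local):

import org.apache.spark.sql.SparkSession

object IntegralDivideSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("div").getOrCreate()

    // Legacy behaviour: `div` always yields BIGINT.
    spark.conf.set("spark.sql.legacy.integralDivide.returnBigint", "true")
    spark.sql("select 5 div 2").printSchema()   // expected: bigint result column

    // New default: the result keeps the operands' integral type (INT here).
    spark.conf.set("spark.sql.legacy.integralDivide.returnBigint", "false")
    spark.sql("select 5 div 2").printSchema()   // expected: int result column

    spark.stop()
  }
}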
struct --- !query 29 output +-- !query 25 output == Physical Plan == *Project [-1b AS concat(CAST((1 - 2) AS STRING), b)#x] +- Scan OneRowRelation[] --- !query 30 +-- !query 26 explain select 2 * 4 + 3 || 'b' --- !query 30 schema +-- !query 26 schema struct --- !query 30 output +-- !query 26 output == Physical Plan == *Project [11b AS concat(CAST(((2 * 4) + 3) AS STRING), b)#x] +- Scan OneRowRelation[] --- !query 31 +-- !query 27 explain select 3 + 1 || 'a' || 4 / 2 --- !query 31 schema +-- !query 27 schema struct --- !query 31 output +-- !query 27 output == Physical Plan == *Project [4a2.0 AS concat(concat(CAST((3 + 1) AS STRING), a), CAST((CAST(4 AS DOUBLE) / CAST(2 AS DOUBLE)) AS STRING))#x] +- Scan OneRowRelation[] --- !query 32 +-- !query 28 explain select 1 == 1 OR 'a' || 'b' == 'ab' --- !query 32 schema +-- !query 28 schema struct --- !query 32 output +-- !query 28 output == Physical Plan == *Project [true AS ((1 = 1) OR (concat(a, b) = ab))#x] +- Scan OneRowRelation[] --- !query 33 +-- !query 29 explain select 'a' || 'c' == 'ac' AND 2 == 3 --- !query 33 schema +-- !query 29 schema struct --- !query 33 output +-- !query 29 output == Physical Plan == *Project [false AS ((concat(a, c) = ac) AND (2 = 3))#x] +- Scan OneRowRelation[] --- !query 34 +-- !query 30 select cot(1) --- !query 34 schema +-- !query 30 schema struct --- !query 34 output +-- !query 30 output 0.6420926159343306 --- !query 35 +-- !query 31 select cot(null) --- !query 35 schema +-- !query 31 schema struct --- !query 35 output +-- !query 31 output NULL --- !query 36 +-- !query 32 select cot(0) --- !query 36 schema +-- !query 32 schema struct --- !query 36 output +-- !query 32 output Infinity --- !query 37 +-- !query 33 select cot(-1) --- !query 37 schema +-- !query 33 schema struct --- !query 37 output +-- !query 33 output -0.6420926159343306 --- !query 38 +-- !query 34 select ceiling(0) --- !query 38 schema +-- !query 34 schema struct --- !query 38 output +-- !query 34 output 0 --- !query 39 +-- !query 35 select ceiling(1) --- !query 39 schema +-- !query 35 schema struct --- !query 39 output +-- !query 35 output 1 --- !query 40 +-- !query 36 select ceil(1234567890123456) --- !query 40 schema +-- !query 36 schema struct --- !query 40 output +-- !query 36 output 1234567890123456 --- !query 41 +-- !query 37 select ceiling(1234567890123456) --- !query 41 schema +-- !query 37 schema struct --- !query 41 output +-- !query 37 output 1234567890123456 --- !query 42 +-- !query 38 select ceil(0.01) --- !query 42 schema +-- !query 38 schema struct --- !query 42 output +-- !query 38 output 1 --- !query 43 +-- !query 39 select ceiling(-0.10) --- !query 43 schema +-- !query 39 schema struct --- !query 43 output +-- !query 39 output 0 --- !query 44 +-- !query 40 select floor(0) --- !query 44 schema +-- !query 40 schema struct --- !query 44 output +-- !query 40 output 0 --- !query 45 +-- !query 41 select floor(1) --- !query 45 schema +-- !query 41 schema struct --- !query 45 output +-- !query 41 output 1 --- !query 46 +-- !query 42 select floor(1234567890123456) --- !query 46 schema +-- !query 42 schema struct --- !query 46 output +-- !query 42 output 1234567890123456 --- !query 47 +-- !query 43 select floor(0.01) --- !query 47 schema +-- !query 43 schema struct --- !query 47 output +-- !query 43 output 0 --- !query 48 +-- !query 44 select floor(-0.10) --- !query 48 schema +-- !query 44 schema struct --- !query 48 output +-- !query 44 output -1 --- !query 49 +-- !query 45 select 1 > 0.00001 --- !query 49 schema +-- !query 45 schema 
struct<(CAST(1 AS BIGINT) > 0):boolean> --- !query 49 output +-- !query 45 output true --- !query 50 +-- !query 46 select mod(7, 2), mod(7, 0), mod(0, 2), mod(7, null), mod(null, 2), mod(null, null) --- !query 50 schema +-- !query 46 schema struct<(7 % 2):int,(7 % 0):int,(0 % 2):int,(7 % CAST(NULL AS INT)):int,(CAST(NULL AS INT) % 2):int,(CAST(NULL AS DOUBLE) % CAST(NULL AS DOUBLE)):double> --- !query 50 output +-- !query 46 output 1 NULL 0 NULL NULL NULL --- !query 51 +-- !query 47 select BIT_LENGTH('abc') --- !query 51 schema +-- !query 47 schema struct --- !query 51 output +-- !query 47 output 24 --- !query 52 +-- !query 48 select CHAR_LENGTH('abc') --- !query 52 schema +-- !query 48 schema struct --- !query 52 output +-- !query 48 output 3 --- !query 53 +-- !query 49 select CHARACTER_LENGTH('abc') --- !query 53 schema +-- !query 49 schema struct --- !query 53 output +-- !query 49 output 3 --- !query 54 +-- !query 50 select OCTET_LENGTH('abc') --- !query 54 schema +-- !query 50 schema struct --- !query 54 output +-- !query 50 output 3 --- !query 55 +-- !query 51 select abs(-3.13), abs('-2.19') --- !query 55 schema +-- !query 51 schema struct --- !query 55 output +-- !query 51 output 3.13 2.19 --- !query 56 +-- !query 52 select positive('-1.11'), positive(-1.11), negative('-1.11'), negative(-1.11) --- !query 56 schema +-- !query 52 schema struct<(+ CAST(-1.11 AS DOUBLE)):double,(+ -1.11):decimal(3,2),(- CAST(-1.11 AS DOUBLE)):double,(- -1.11):decimal(3,2)> --- !query 56 output +-- !query 52 output -1.11 -1.11 1.11 1.11 --- !query 57 +-- !query 53 select pmod(-7, 2), pmod(0, 2), pmod(7, 0), pmod(7, null), pmod(null, 2), pmod(null, null) --- !query 57 schema +-- !query 53 schema struct --- !query 57 output +-- !query 53 output 1 0 NULL NULL NULL NULL --- !query 58 +-- !query 54 select pmod(cast(3.13 as decimal), cast(0 as decimal)), pmod(cast(2 as smallint), cast(0 as smallint)) --- !query 58 schema +-- !query 54 schema struct --- !query 58 output +-- !query 54 output NULL NULL diff --git a/sql/core/src/test/resources/sql-tests/results/pivot.sql.out b/sql/core/src/test/resources/sql-tests/results/pivot.sql.out index 2dd92930f92a..595ce1f8efcd 100644 --- a/sql/core/src/test/resources/sql-tests/results/pivot.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/pivot.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 31 +-- Number of queries: 32 -- !query 0 @@ -476,3 +476,18 @@ struct<> -- !query 30 output org.apache.spark.sql.AnalysisException Invalid pivot column 'named_struct(course, course#x, m, m#x)'. 
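Editor's note: the operators.sql.out cases above contrast % with pmod for negative operands. A brief sketch of the difference (assuming an active local session; the % result is not in the golden file and is stated from Spark's Java-style remainder semantics):

import org.apache.spark.sql.SparkSession

object PmodSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("pmod").getOrCreate()
    // % keeps the sign of the dividend, while pmod returns a non-negative modulus.
    spark.sql("select -7 % 3, pmod(-7, 3)").show()
    // expected: -1 for the plain remainder, 2 for pmod (the pmod value matches the golden file above)
    spark.stop()
  }
}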
Pivot columns must be comparable.; + + +-- !query 31 +SELECT * FROM ( + SELECT course, earnings, "a" as a, "z" as z, "b" as b, "y" as y, "c" as c, "x" as x, "d" as d, "w" as w + FROM courseSales +) +PIVOT ( + sum(Earnings) + FOR Course IN ('dotNET', 'Java') +) +-- !query 31 schema +struct +-- !query 31 output +a z b y c x d w 63000 50000 diff --git a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out index 7b3dc8438888..e8f2e0a81455 100644 --- a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 15 +-- Number of queries: 17 -- !query 0 @@ -161,3 +161,19 @@ struct == Physical Plan == *Project [concat(cast(id#xL as string), cast(encode(cast((id#xL + 2) as string), utf-8) as string), cast(encode(cast((id#xL + 3) as string), utf-8) as string)) AS col#x] +- *Range (0, 10, step=1, splits=2) + + +-- !query 15 +SELECT split('aa1cc2ee3', '[1-9]+') +-- !query 15 schema +struct> +-- !query 15 output +["aa","cc","ee",""] + + +-- !query 16 +SELECT split('aa1cc2ee3', '[1-9]+', 2) +-- !query 16 schema +struct> +-- !query 16 output +["aa","cc2ee3"] diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-basic.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-basic.sql.out index 088db55d6640..686fe4975379 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-basic.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-basic.sql.out @@ -41,15 +41,15 @@ select 1 from tab_a where (a1, b1) not in (select (a2, b2) from tab_b) struct<> -- !query 4 output org.apache.spark.sql.AnalysisException -Cannot analyze (named_struct('a1', tab_a.`a1`, 'b1', tab_a.`b1`) IN (listquery())). +cannot resolve '(named_struct('a1', tab_a.`a1`, 'b1', tab_a.`b1`) IN (listquery()))' due to data type mismatch: The number of columns in the left hand side of an IN subquery does not match the number of columns in the output of subquery. -#columns in left hand side: 2 -#columns in right hand side: 1 +#columns in left hand side: 2. +#columns in right hand side: 1. Left side columns: -[tab_a.`a1`, tab_a.`b1`] +[tab_a.`a1`, tab_a.`b1`]. 
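Editor's note: the two new string-functions.sql.out cases above add a limit argument to split. An equivalent DataFrame-API sketch, assuming the three-argument split overload added elsewhere in this patch is available:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.split

object SplitLimitSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("split").getOrCreate()
    import spark.implicits._

    val df = Seq("aa1cc2ee3").toDF("s")
    // No limit: the regex is applied as many times as possible; trailing empty strings are kept.
    df.select(split($"s", "[1-9]+")).show(false)      // expected: [aa, cc, ee, ]
    // limit = 2: at most two fields, the remainder is kept verbatim.
    df.select(split($"s", "[1-9]+", 2)).show(false)   // expected: [aa, cc2ee3]
    spark.stop()
  }
}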
Right side columns: -[`named_struct(a2, a2, b2, b2)`]; +[`named_struct(a2, a2, b2, b2)`].; -- !query 5 diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-limit.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-limit.sql.out index 9eb5b3383e73..71ca1f864947 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-limit.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-limit.sql.out @@ -1,16 +1,8 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 9 +-- Number of queries: 8 -- !query 0 -set spark.sql.limit.flatGlobalLimit=false --- !query 0 schema -struct --- !query 0 output -spark.sql.limit.flatGlobalLimit false - - --- !query 1 create temporary view t1 as select * from values ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:00:00.000', date '2014-04-04'), ("val1b", 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), @@ -25,13 +17,13 @@ create temporary view t1 as select * from values ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i) --- !query 1 schema +-- !query 0 schema struct<> --- !query 1 output +-- !query 0 output --- !query 2 +-- !query 1 create temporary view t2 as select * from values ("val2a", 6S, 12, 14L, float(15), 20D, 20E2, timestamp '2014-04-04 01:01:00.000', date '2014-04-04'), ("val1b", 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), @@ -47,13 +39,13 @@ create temporary view t2 as select * from values ("val1f", 19S, null, 19L, float(17), 25D, 26E2, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), ("val1b", null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', null) as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i) --- !query 2 schema +-- !query 1 schema struct<> --- !query 2 output +-- !query 1 output --- !query 3 +-- !query 2 create temporary view t3 as select * from values ("val3a", 6S, 12, 110L, float(15), 20D, 20E2, timestamp '2014-04-04 01:02:00.000', date '2014-04-04'), ("val3a", 6S, 12, 10L, float(15), 20D, 20E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), @@ -68,27 +60,27 @@ create temporary view t3 as select * from values ("val3b", 8S, null, 719L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), ("val3b", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i) --- !query 3 schema +-- !query 2 schema struct<> --- !query 3 output +-- !query 2 output --- !query 4 +-- !query 3 SELECT * FROM t1 WHERE t1a IN (SELECT t2a FROM t2 WHERE t1d = t2d) LIMIT 2 --- !query 4 schema +-- !query 3 schema struct --- !query 4 output +-- !query 3 output val1b 8 16 19 17.0 25.0 2600 2014-05-04 01:01:00 2014-05-04 val1c 8 16 19 17.0 25.0 2600 2014-05-04 01:02:00.001 2014-05-05 --- !query 5 +-- !query 4 SELECT * FROM t1 WHERE t1c IN (SELECT t2c @@ -96,16 +88,16 @@ WHERE t1c IN (SELECT t2c WHERE t2b >= 8 LIMIT 2) LIMIT 4 --- !query 5 schema +-- !query 4 schema struct --- !query 5 output +-- !query 4 output val1a 16 12 10 15.0 20.0 2000 2014-07-04 01:01:00 2014-07-04 val1a 16 12 21 15.0 20.0 2000 2014-06-04 01:02:00.001 2014-06-04 val1b 8 16 19 17.0 25.0 2600 2014-05-04 01:01:00 
2014-05-04 val1c 8 16 19 17.0 25.0 2600 2014-05-04 01:02:00.001 2014-05-05 --- !query 6 +-- !query 5 SELECT Count(DISTINCT( t1a )), t1b FROM t1 @@ -116,29 +108,29 @@ WHERE t1d IN (SELECT t2d GROUP BY t1b ORDER BY t1b DESC NULLS FIRST LIMIT 1 --- !query 6 schema +-- !query 5 schema struct --- !query 6 output +-- !query 5 output 1 NULL --- !query 7 +-- !query 6 SELECT * FROM t1 WHERE t1b NOT IN (SELECT t2b FROM t2 WHERE t2b > 6 LIMIT 2) --- !query 7 schema +-- !query 6 schema struct --- !query 7 output +-- !query 6 output val1a 16 12 10 15.0 20.0 2000 2014-07-04 01:01:00 2014-07-04 val1a 16 12 21 15.0 20.0 2000 2014-06-04 01:02:00.001 2014-06-04 val1a 6 8 10 15.0 20.0 2000 2014-04-04 01:00:00 2014-04-04 val1a 6 8 10 15.0 20.0 2000 2014-04-04 01:02:00.001 2014-04-04 --- !query 8 +-- !query 7 SELECT Count(DISTINCT( t1a )), t1b FROM t1 @@ -149,7 +141,7 @@ WHERE t1d NOT IN (SELECT t2d GROUP BY t1b ORDER BY t1b NULLS last LIMIT 1 --- !query 8 schema +-- !query 7 schema struct --- !query 8 output +-- !query 7 output 1 6 diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/negative-cases/subq-input-typecheck.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/negative-cases/subq-input-typecheck.sql.out index c52e5706deee..dcd30055bca1 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/negative-cases/subq-input-typecheck.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/negative-cases/subq-input-typecheck.sql.out @@ -92,15 +92,15 @@ t1a IN (SELECT t2a, t2b struct<> -- !query 7 output org.apache.spark.sql.AnalysisException -Cannot analyze (t1.`t1a` IN (listquery(t1.`t1a`))). +cannot resolve '(t1.`t1a` IN (listquery(t1.`t1a`)))' due to data type mismatch: The number of columns in the left hand side of an IN subquery does not match the number of columns in the output of subquery. -#columns in left hand side: 1 -#columns in right hand side: 2 +#columns in left hand side: 1. +#columns in right hand side: 2. Left side columns: -[t1.`t1a`] +[t1.`t1a`]. Right side columns: -[t2.`t2a`, t2.`t2b`]; +[t2.`t2a`, t2.`t2b`].; -- !query 8 @@ -113,15 +113,15 @@ WHERE struct<> -- !query 8 output org.apache.spark.sql.AnalysisException -Cannot analyze (named_struct('t1a', t1.`t1a`, 't1b', t1.`t1b`) IN (listquery(t1.`t1a`))). +cannot resolve '(named_struct('t1a', t1.`t1a`, 't1b', t1.`t1b`) IN (listquery(t1.`t1a`)))' due to data type mismatch: The number of columns in the left hand side of an IN subquery does not match the number of columns in the output of subquery. -#columns in left hand side: 2 -#columns in right hand side: 1 +#columns in left hand side: 2. +#columns in right hand side: 1. Left side columns: -[t1.`t1a`, t1.`t1b`] +[t1.`t1a`, t1.`t1b`]. 
Right side columns: -[t2.`t2a`]; +[t2.`t2a`].; -- !query 9 diff --git a/sql/core/src/test/resources/sql-tests/results/udaf-regrfunctions.sql.out b/sql/core/src/test/resources/sql-tests/results/udaf-regrfunctions.sql.out deleted file mode 100644 index d7d009a64bf8..000000000000 --- a/sql/core/src/test/resources/sql-tests/results/udaf-regrfunctions.sql.out +++ /dev/null @@ -1,93 +0,0 @@ --- Automatically generated by SQLQueryTestSuite --- Number of queries: 3 - - --- !query 0 -CREATE OR REPLACE TEMPORARY VIEW t1 AS SELECT * FROM VALUES - (101, 1, 1, 1), - (201, 2, 1, 1), - (301, 3, 1, 1), - (401, 4, 1, 11), - (501, 5, 1, null), - (601, 6, null, 1), - (701, 6, null, null), - (102, 1, 2, 2), - (202, 2, 1, 2), - (302, 3, 2, 1), - (402, 4, 2, 12), - (502, 5, 2, null), - (602, 6, null, 2), - (702, 6, null, null), - (103, 1, 3, 3), - (203, 2, 1, 3), - (303, 3, 3, 1), - (403, 4, 3, 13), - (503, 5, 3, null), - (603, 6, null, 3), - (703, 6, null, null), - (104, 1, 4, 4), - (204, 2, 1, 4), - (304, 3, 4, 1), - (404, 4, 4, 14), - (504, 5, 4, null), - (604, 6, null, 4), - (704, 6, null, null), - (800, 7, 1, 1) -as t1(id, px, y, x) --- !query 0 schema -struct<> --- !query 0 output - - - --- !query 1 -select px, var_pop(x), var_pop(y), corr(y,x), covar_samp(y,x), covar_pop(y,x), regr_count(y,x), - regr_slope(y,x), regr_intercept(y,x), regr_r2(y,x), regr_sxx(y,x), regr_syy(y,x), regr_sxy(y,x), - regr_avgx(y,x), regr_avgy(y,x), regr_count(y,x) -from t1 group by px order by px --- !query 1 schema -struct --- !query 1 output -1 1.25 1.25 1.0 1.6666666666666667 1.25 4 1.0 0.0 1.0 5.0 5.0 5.0 2.5 2.5 4 -2 1.25 0.0 NULL 0.0 0.0 4 0.0 1.0 1.0 5.0 0.0 0.0 2.5 1.0 4 -3 0.0 1.25 NULL 0.0 0.0 4 NULL NULL NULL 0.0 5.0 0.0 1.0 2.5 4 -4 1.25 1.25 1.0 1.6666666666666667 1.25 4 1.0 -10.0 1.0 5.0 5.0 5.0 12.5 2.5 4 -5 NULL 1.25 NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL NULL 0 -6 1.25 NULL NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL NULL 0 -7 0.0 0.0 NaN NaN 0.0 1 NULL NULL NULL 0.0 0.0 0.0 1.0 1.0 1 - - --- !query 2 -select id, regr_count(y,x) over (partition by px) from t1 order by id --- !query 2 schema -struct --- !query 2 output -101 4 -102 4 -103 4 -104 4 -201 4 -202 4 -203 4 -204 4 -301 4 -302 4 -303 4 -304 4 -401 4 -402 4 -403 4 -404 4 -501 0 -502 0 -503 0 -504 0 -601 0 -602 0 -603 0 -604 0 -701 0 -702 0 -703 0 -704 0 -800 1 diff --git a/sql/core/src/test/resources/test-data/cars-empty-value.csv b/sql/core/src/test/resources/test-data/cars-empty-value.csv new file mode 100644 index 000000000000..0f20a2f23ac0 --- /dev/null +++ b/sql/core/src/test/resources/test-data/cars-empty-value.csv @@ -0,0 +1,4 @@ +year,make,model,comment,blank +"2012","Tesla","S","","" +1997,Ford,E350,"Go get one now they are going fast", +2015,Chevy,Volt,,"" diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala index 2917c56dbeb5..f984a1b722e3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala @@ -505,7 +505,7 @@ class ColumnExpressionSuite extends QueryTest with SharedSQLContext { test("upper") { checkAnswer( lowerCaseData.select(upper('l)), - ('a' to 'd').map(c => Row(c.toString.toUpperCase)) + ('a' to 'd').map(c => Row(c.toString.toUpperCase(Locale.ROOT))) ) checkAnswer( @@ -526,7 +526,7 @@ class ColumnExpressionSuite extends QueryTest with SharedSQLContext { test("lower") { checkAnswer( 
upperCaseData.select(lower('L)), - ('A' to 'F').map(c => Row(c.toString.toLowerCase)) + ('A' to 'F').map(c => Row(c.toString.toLowerCase(Locale.ROOT))) ) checkAnswer( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala index 85b3ca11383f..d0106c44b7db 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala @@ -557,13 +557,11 @@ class DataFrameAggregateSuite extends QueryTest with SharedSQLContext { } test("SPARK-18004 limit + aggregates") { - withSQLConf(SQLConf.LIMIT_FLAT_GLOBAL_LIMIT.key -> "true") { - val df = Seq(("a", 1), ("b", 2), ("c", 1), ("d", 5)).toDF("id", "value") - val limit2Df = df.limit(2) - checkAnswer( - limit2Df.groupBy("id").count().select($"id"), - limit2Df.select($"id")) - } + val df = Seq(("a", 1), ("b", 2), ("c", 1), ("d", 5)).toDF("id", "value") + val limit2Df = df.limit(2) + checkAnswer( + limit2Df.groupBy("id").count().select($"id"), + limit2Df.select($"id")) } test("SPARK-17237 remove backticks in a pivot result schema") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala index 156e54300e38..60ebc5e6cc09 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala @@ -26,6 +26,7 @@ import scala.util.Random import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.catalyst.plans.logical.OneRowRelation import org.apache.spark.sql.catalyst.util.DateTimeTestUtils import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf @@ -85,14 +86,16 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext { } val df5 = Seq((Seq("a", null), Seq(1, 2))).toDF("k", "v") - intercept[RuntimeException] { + val msg1 = intercept[Exception] { df5.select(map_from_arrays($"k", $"v")).collect - } + }.getMessage + assert(msg1.contains("Cannot use null as map key!")) val df6 = Seq((Seq(1, 2), Seq("a"))).toDF("k", "v") - intercept[RuntimeException] { + val msg2 = intercept[Exception] { df6.select(map_from_arrays($"k", $"v")).collect - } + }.getMessage + assert(msg2.contains("The given two arrays should have the same length")) } test("struct with column name") { @@ -733,6 +736,56 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext { df.selectExpr("array_contains(array(1, null), array(1, null)[0])"), Seq(Row(true), Row(true)) ) + + checkAnswer( + OneRowRelation().selectExpr("array_contains(array(1), 1.23D)"), + Seq(Row(false)) + ) + + checkAnswer( + OneRowRelation().selectExpr("array_contains(array(1), 1.0D)"), + Seq(Row(true)) + ) + + checkAnswer( + OneRowRelation().selectExpr("array_contains(array(1.0D), 1)"), + Seq(Row(true)) + ) + + checkAnswer( + OneRowRelation().selectExpr("array_contains(array(1.23D), 1)"), + Seq(Row(false)) + ) + + checkAnswer( + OneRowRelation().selectExpr("array_contains(array(array(1)), array(1.0D))"), + Seq(Row(true)) + ) + + checkAnswer( + OneRowRelation().selectExpr("array_contains(array(array(1)), array(1.23D))"), + Seq(Row(false)) + ) + + val e1 = intercept[AnalysisException] { + 
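Editor's note: the reworked assertions above pin down the exact error messages from map_from_arrays. A standalone sketch of the two failure modes (null key, mismatched array lengths), with data shaped like the suite's and an illustrative object name:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.map_from_arrays

object MapFromArraysErrorsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("map_from_arrays").getOrCreate()
    import spark.implicits._

    // A null key is rejected at runtime.
    val nullKey = Seq((Seq("a", null), Seq(1, 2))).toDF("k", "v")
    try nullKey.select(map_from_arrays($"k", $"v")).collect()
    catch { case e: Exception => println(e.getMessage) }   // expected to mention "Cannot use null as map key"

    // Key and value arrays must have the same length.
    val badLength = Seq((Seq(1, 2), Seq("a"))).toDF("k", "v")
    try badLength.select(map_from_arrays($"k", $"v")).collect()
    catch { case e: Exception => println(e.getMessage) }   // expected to mention the length mismatch

    spark.stop()
  }
}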
OneRowRelation().selectExpr("array_contains(array(1), .01234567890123456790123456780)") + } + val errorMsg1 = + s""" + |Input to function array_contains should have been array followed by a + |value with same element type, but it's [array, decimal(29,29)]. + """.stripMargin.replace("\n", " ").trim() + assert(e1.message.contains(errorMsg1)) + + val e2 = intercept[AnalysisException] { + OneRowRelation().selectExpr("array_contains(array(1), 'foo')") + } + val errorMsg2 = + s""" + |Input to function array_contains should have been array followed by a + |value with same element type, but it's [array, string]. + """.stripMargin.replace("\n", " ").trim() + assert(e2.message.contains(errorMsg2)) } test("arrays_overlap function") { @@ -1044,18 +1097,63 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext { ) checkAnswer( - df.selectExpr("array_position(array(array(1), null)[0], 1)"), - Seq(Row(1L), Row(1L)) + OneRowRelation().selectExpr("array_position(array(1), 1.23D)"), + Seq(Row(0L)) ) + checkAnswer( - df.selectExpr("array_position(array(1, null), array(1, null)[0])"), - Seq(Row(1L), Row(1L)) + OneRowRelation().selectExpr("array_position(array(1), 1.0D)"), + Seq(Row(1L)) ) - val e = intercept[AnalysisException] { + checkAnswer( + OneRowRelation().selectExpr("array_position(array(1.D), 1)"), + Seq(Row(1L)) + ) + + checkAnswer( + OneRowRelation().selectExpr("array_position(array(1.23D), 1)"), + Seq(Row(0L)) + ) + + checkAnswer( + OneRowRelation().selectExpr("array_position(array(array(1)), array(1.0D))"), + Seq(Row(1L)) + ) + + checkAnswer( + OneRowRelation().selectExpr("array_position(array(array(1)), array(1.23D))"), + Seq(Row(0L)) + ) + + checkAnswer( + OneRowRelation().selectExpr("array_position(array(array(1), null)[0], 1)"), + Seq(Row(1L)) + ) + checkAnswer( + OneRowRelation().selectExpr("array_position(array(1, null), array(1, null)[0])"), + Seq(Row(1L)) + ) + + val e1 = intercept[AnalysisException] { Seq(("a string element", "a")).toDF().selectExpr("array_position(_1, _2)") } - assert(e.message.contains("argument 1 requires array type, however, '`_1`' is of string type")) + val errorMsg1 = + s""" + |Input to function array_position should have been array followed by a + |value with same element type, but it's [string, string]. + """.stripMargin.replace("\n", " ").trim() + assert(e1.message.contains(errorMsg1)) + + val e2 = intercept[AnalysisException] { + OneRowRelation().selectExpr("array_position(array(1), '1')") + } + val errorMsg2 = + s""" + |Input to function array_position should have been array followed by a + |value with same element type, but it's [array, string]. + """.stripMargin.replace("\n", " ").trim() + assert(e2.message.contains(errorMsg2)) } test("element_at function") { @@ -1113,11 +1211,80 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext { Seq(Row("3"), Row(""), Row(null)) ) - val e = intercept[AnalysisException] { + val e1 = intercept[AnalysisException] { Seq(("a string element", 1)).toDF().selectExpr("element_at(_1, _2)") } - assert(e.message.contains( - "argument 1 requires (array or map) type, however, '`_1`' is of string type")) + val errorMsg1 = + s""" + |The first argument to function element_at should have been array or map type, but + |its string type. 
+ """.stripMargin.replace("\n", " ").trim() + assert(e1.message.contains(errorMsg1)) + + checkAnswer( + OneRowRelation().selectExpr("element_at(array(2, 1), 2S)"), + Seq(Row(1)) + ) + + checkAnswer( + OneRowRelation().selectExpr("element_at(array('a', 'b'), 1Y)"), + Seq(Row("a")) + ) + + checkAnswer( + OneRowRelation().selectExpr("element_at(array(1, 2, 3), 3)"), + Seq(Row(3)) + ) + + val e2 = intercept[AnalysisException] { + OneRowRelation().selectExpr("element_at(array('a', 'b'), 1L)") + } + val errorMsg2 = + s""" + |Input to function element_at should have been array followed by a int, but it's + |[array, bigint]. + """.stripMargin.replace("\n", " ").trim() + assert(e2.message.contains(errorMsg2)) + + checkAnswer( + OneRowRelation().selectExpr("element_at(map(1, 'a', 2, 'b'), 2Y)"), + Seq(Row("b")) + ) + + checkAnswer( + OneRowRelation().selectExpr("element_at(map(1, 'a', 2, 'b'), 1S)"), + Seq(Row("a")) + ) + + checkAnswer( + OneRowRelation().selectExpr("element_at(map(1, 'a', 2, 'b'), 2)"), + Seq(Row("b")) + ) + + checkAnswer( + OneRowRelation().selectExpr("element_at(map(1, 'a', 2, 'b'), 2L)"), + Seq(Row("b")) + ) + + checkAnswer( + OneRowRelation().selectExpr("element_at(map(1, 'a', 2, 'b'), 1.0D)"), + Seq(Row("a")) + ) + + checkAnswer( + OneRowRelation().selectExpr("element_at(map(1, 'a', 2, 'b'), 1.23D)"), + Seq(Row(null)) + ) + + val e3 = intercept[AnalysisException] { + OneRowRelation().selectExpr("element_at(map(1, 'a', 2, 'b'), '1')") + } + val errorMsg3 = + s""" + |Input to function element_at should have been map followed by a value of same + |key type, but it's [map, string]. + """.stripMargin.replace("\n", " ").trim() + assert(e3.message.contains(errorMsg3)) } test("array_union functions") { @@ -1476,6 +1643,34 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext { Row(null)) ) + checkAnswer( + OneRowRelation().selectExpr("array_remove(array(1, 2), 1.23D)"), + Seq( + Row(Seq(1.0, 2.0)) + ) + ) + + checkAnswer( + OneRowRelation().selectExpr("array_remove(array(1, 2), 1.0D)"), + Seq( + Row(Seq(2.0)) + ) + ) + + checkAnswer( + OneRowRelation().selectExpr("array_remove(array(1.0D, 2.0D), 2)"), + Seq( + Row(Seq(1.0)) + ) + ) + + checkAnswer( + OneRowRelation().selectExpr("array_remove(array(1.1D, 1.2D), 1)"), + Seq( + Row(Seq(1.1, 1.2)) + ) + ) + checkAnswer( df.selectExpr("array_remove(a, 2)", "array_remove(b, \"a\")", "array_remove(c, \"\")"), @@ -1485,10 +1680,26 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext { Row(null, null, null)) ) - val e = intercept[AnalysisException] { + val e1 = intercept[AnalysisException] { Seq(("a string element", "a")).toDF().selectExpr("array_remove(_1, _2)") } - assert(e.message.contains("argument 1 requires array type, however, '`_1`' is of string type")) + val errorMsg1 = + s""" + |Input to function array_remove should have been array followed by a + |value with same element type, but it's [string, string]. + """.stripMargin.replace("\n", " ").trim() + assert(e1.message.contains(errorMsg1)) + + val e2 = intercept[AnalysisException] { + OneRowRelation().selectExpr("array_remove(array(1, 2), '1')") + } + + val errorMsg2 = + s""" + |Input to function array_remove should have been array followed by a + |value with same element type, but it's [array, string]. 
+ """.stripMargin.replace("\n", " ").trim() + assert(e2.message.contains(errorMsg2)) } test("array_distinct functions") { @@ -2377,7 +2588,7 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext { assert(ex2.getMessage.contains( "The number of lambda function arguments '3' does not match")) - val ex3 = intercept[RuntimeException] { + val ex3 = intercept[Exception] { dfExample1.selectExpr("transform_keys(i, (k, v) -> v)").show() } assert(ex3.getMessage.contains("Cannot use null as map key!")) @@ -2675,8 +2886,6 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext { val funcsMustHaveAtLeastOneArg = ("coalesce", (df: DataFrame) => df.select(coalesce())) :: ("coalesce", (df: DataFrame) => df.selectExpr("coalesce()")) :: - ("named_struct", (df: DataFrame) => df.select(struct())) :: - ("named_struct", (df: DataFrame) => df.selectExpr("named_struct()")) :: ("hash", (df: DataFrame) => df.select(hash())) :: ("hash", (df: DataFrame) => df.selectExpr("hash()")) :: Nil funcsMustHaveAtLeastOneArg.foreach { case (name, func) => @@ -2697,7 +2906,7 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext { test("SPARK-24734: Fix containsNull of Concat for array type") { val df = Seq((Seq(1), Seq[Integer](null), Seq("a", "b"))).toDF("k1", "k2", "v") - val ex = intercept[RuntimeException] { + val ex = intercept[Exception] { df.select(map_from_arrays(concat($"k1", $"k2"), $"v")).show() } assert(ex.getMessage.contains("Cannot use null as map key")) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFramePivotSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFramePivotSuite.scala index b972b9ef93e5..b52ca58c07d2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFramePivotSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFramePivotSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql +import java.util.Locale + import org.apache.spark.sql.catalyst.expressions.aggregate.PivotFirst import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf @@ -272,7 +274,7 @@ class DataFramePivotSuite extends QueryTest with SharedSQLContext { val expected = Row(2012, 15000.0, 20000.0) :: Row(2013, 48000.0, 30000.0) :: Nil val df = trainingSales .groupBy($"sales.year") - .pivot(lower($"sales.course"), Seq("dotNet", "Java").map(_.toLowerCase)) + .pivot(lower($"sales.course"), Seq("dotNet", "Java").map(_.toLowerCase(Locale.ROOT))) .agg(sum($"sales.earnings")) checkAnswer(df, expected) @@ -308,4 +310,27 @@ class DataFramePivotSuite extends QueryTest with SharedSQLContext { assert(exception.getMessage.contains("aggregate functions are not allowed")) } + + test("pivoting column list with values") { + val expected = Row(2012, 10000.0, null) :: Row(2013, 48000.0, 30000.0) :: Nil + val df = trainingSales + .groupBy($"sales.year") + .pivot(struct(lower($"sales.course"), $"training"), Seq( + struct(lit("dotnet"), lit("Experts")), + struct(lit("java"), lit("Dummies"))) + ).agg(sum($"sales.earnings")) + + checkAnswer(df, expected) + } + + test("pivoting column list") { + val exception = intercept[RuntimeException] { + trainingSales + .groupBy($"sales.year") + .pivot(struct(lower($"sales.course"), $"training")) + .agg(sum($"sales.earnings")) + .collect() + } + assert(exception.getMessage.contains("Unsupported literal type")) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala index 
8eae35325fae..589873b9c3ea 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala @@ -23,7 +23,7 @@ import org.scalatest.Matchers._ import org.apache.spark.internal.Logging import org.apache.spark.sql.execution.stat.StatFunctions -import org.apache.spark.sql.functions.col +import org.apache.spark.sql.functions.{col, lit, struct} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types.{DoubleType, StructField, StructType} @@ -374,6 +374,24 @@ class DataFrameStatSuite extends QueryTest with SharedSQLContext { Seq(Row(0, 6), Row(1, 11))) } + test("sampleBy one column") { + val df = spark.range(0, 100).select((col("id") % 3).as("key")) + val sampled = df.stat.sampleBy($"key", Map(0 -> 0.1, 1 -> 0.2), 0L) + checkAnswer( + sampled.groupBy("key").count().orderBy("key"), + Seq(Row(0, 6), Row(1, 11))) + } + + test("sampleBy multiple columns") { + val df = spark.range(0, 100) + .select(lit("Foo").as("name"), (col("id") % 3).as("key")) + val sampled = df.stat.sampleBy( + struct($"name", $"key"), Map(Row("Foo", 0) -> 0.1, Row("Foo", 1) -> 0.2), 0L) + checkAnswer( + sampled.groupBy("key").count().orderBy("key"), + Seq(Row(0, 6), Row(1, 11))) + } + // This test case only verifies that `DataFrame.countMinSketch()` methods do return // `CountMinSketch`es that meet required specs. Test cases for `CountMinSketch` can be found in // `CountMinSketchSuite` in project spark-sketch. diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index d43fcf3c6f5d..c0b277f76ae6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -40,6 +40,7 @@ import org.apache.spark.sql.test.{ExamplePoint, ExamplePointUDT, SharedSQLContex import org.apache.spark.sql.test.SQLTestData.{NullInts, NullStrings, TestData2} import org.apache.spark.sql.types._ import org.apache.spark.util.Utils +import org.apache.spark.util.random.XORShiftRandom class DataFrameSuite extends QueryTest with SharedSQLContext { import testImplicits._ @@ -1729,10 +1730,8 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { } test("SPARK-9083: sort with non-deterministic expressions") { - import org.apache.spark.util.random.XORShiftRandom - val seed = 33 - val df = (1 to 100).map(Tuple1.apply).toDF("i") + val df = (1 to 100).map(Tuple1.apply).toDF("i").repartition(1) val random = new XORShiftRandom(seed) val expected = (1 to 100).map(_ -> random.nextDouble()).sortBy(_._2).map(_._1) val actual = df.sort(rand(seed)).collect().map(_.getInt(0)) @@ -2409,18 +2408,6 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { Seq(Row(7, 1, 1), Row(7, 1, 2), Row(7, 2, 1), Row(7, 2, 2), Row(7, 3, 1), Row(7, 3, 2))) } - test("SPARK-22226: splitExpressions should not generate codes beyond 64KB") { - val colNumber = 10000 - val input = spark.range(2).rdd.map(_ => Row(1 to colNumber: _*)) - val df = sqlContext.createDataFrame(input, StructType( - (1 to colNumber).map(colIndex => StructField(s"_$colIndex", IntegerType, false)))) - val newCols = (1 to colNumber).flatMap { colIndex => - Seq(expr(s"if(1000 < _$colIndex, 1000, _$colIndex)"), - expr(s"sqrt(_$colIndex)")) - } - df.select(newCols: _*).collect() - } - test("SPARK-22271: mean overflows and returns null for some decimal variables") { val d = 
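Editor's note: the added DataFrameStatSuite tests above extend sampleBy to a struct of columns keyed by Rows. A sketch following the same shape (the Column-based sampleBy overload is assumed to exist, as in the test above):

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.functions.{col, lit, struct}

object SampleByMultipleColumnsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("sampleBy").getOrCreate()

    val df = spark.range(0, 100).select(lit("Foo").as("name"), (col("id") % 3).as("key"))
    // Strata are identified by the full (name, key) tuple, so the fraction map is keyed by Rows.
    val sampled = df.stat.sampleBy(
      struct(col("name"), col("key")), Map(Row("Foo", 0) -> 0.1, Row("Foo", 1) -> 0.2), 0L)
    sampled.groupBy("key").count().orderBy("key").show()

    spark.stop()
  }
}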
0.034567890 val df = Seq(d, d, d, d, d, d, d, d, d, d).toDF("DecimalCol") @@ -2553,4 +2540,31 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { } } + test("SPARK-25368 Incorrect predicate pushdown returns wrong result") { + def check(newCol: Column, filter: Column, result: Seq[Row]): Unit = { + val df1 = spark.createDataFrame(Seq( + (1, 1) + )).toDF("a", "b").withColumn("c", newCol) + + val df2 = df1.union(df1).withColumn("d", spark_partition_id).filter(filter) + checkAnswer(df2, result) + } + + check(lit(null).cast("int"), $"c".isNull, Seq(Row(1, 1, null, 0), Row(1, 1, null, 1))) + check(lit(null).cast("int"), $"c".isNotNull, Seq()) + check(lit(2).cast("int"), $"c".isNull, Seq()) + check(lit(2).cast("int"), $"c".isNotNull, Seq(Row(1, 1, 2, 0), Row(1, 1, 2, 1))) + check(lit(2).cast("int"), $"c" === 2, Seq(Row(1, 1, 2, 0), Row(1, 1, 2, 1))) + check(lit(2).cast("int"), $"c" =!= 2, Seq()) + } + + test("SPARK-25402 Null handling in BooleanSimplification") { + val schema = StructType.fromDDL("a boolean, b int") + val rows = Seq(Row(null, 1)) + + val rdd = sparkContext.parallelize(rows) + val df = spark.createDataFrame(rdd, schema) + + checkAnswer(df.where("(NOT a) OR a"), Seq.empty) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala index 97a843978f0b..78277d7dcf75 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala @@ -30,6 +30,7 @@ import org.apache.spark.sql.types._ * Window function testing for DataFrame API. */ class DataFrameWindowFunctionsSuite extends QueryTest with SharedSQLContext { + import testImplicits._ test("reuse window partitionBy") { @@ -72,9 +73,9 @@ class DataFrameWindowFunctionsSuite extends QueryTest with SharedSQLContext { cume_dist().over(Window.partitionBy("value").orderBy("key")), percent_rank().over(Window.partitionBy("value").orderBy("key"))), Row(1, 1, 1, 1.0d, 1, 1, 1, 1, 1, 1, 1.0d, 0.0d) :: - Row(1, 1, 1, 1.0d, 1, 1, 1, 1, 1, 1, 1.0d / 3.0d, 0.0d) :: - Row(2, 2, 1, 5.0d / 3.0d, 3, 5, 1, 2, 2, 2, 1.0d, 0.5d) :: - Row(2, 2, 1, 5.0d / 3.0d, 3, 5, 2, 3, 2, 2, 1.0d, 0.5d) :: Nil) + Row(1, 1, 1, 1.0d, 1, 1, 1, 1, 1, 1, 1.0d / 3.0d, 0.0d) :: + Row(2, 2, 1, 5.0d / 3.0d, 3, 5, 1, 2, 2, 2, 1.0d, 0.5d) :: + Row(2, 2, 1, 5.0d / 3.0d, 3, 5, 2, 3, 2, 2, 1.0d, 0.5d) :: Nil) } test("window function should fail if order by clause is not specified") { @@ -162,12 +163,12 @@ class DataFrameWindowFunctionsSuite extends QueryTest with SharedSQLContext { Seq( Row("a", -50.0, 50.0, 50.0, 7.0710678118654755, 7.0710678118654755), Row("b", -50.0, 50.0, 50.0, 7.0710678118654755, 7.0710678118654755), - Row("c", 0.0, 0.0, 0.0, 0.0, 0.0 ), - Row("d", 0.0, 0.0, 0.0, 0.0, 0.0 ), - Row("e", 24.0, 12.0, 12.0, 3.4641016151377544, 3.4641016151377544 ), - Row("f", 24.0, 12.0, 12.0, 3.4641016151377544, 3.4641016151377544 ), - Row("g", 24.0, 12.0, 12.0, 3.4641016151377544, 3.4641016151377544 ), - Row("h", 24.0, 12.0, 12.0, 3.4641016151377544, 3.4641016151377544 ), + Row("c", 0.0, 0.0, 0.0, 0.0, 0.0), + Row("d", 0.0, 0.0, 0.0, 0.0, 0.0), + Row("e", 24.0, 12.0, 12.0, 3.4641016151377544, 3.4641016151377544), + Row("f", 24.0, 12.0, 12.0, 3.4641016151377544, 3.4641016151377544), + Row("g", 24.0, 12.0, 12.0, 3.4641016151377544, 3.4641016151377544), + Row("h", 24.0, 12.0, 12.0, 3.4641016151377544, 3.4641016151377544), Row("i", 
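Editor's note: the SPARK-25402 regression test above guards three-valued logic in BooleanSimplification: when a is NULL, (NOT a) OR a evaluates to NULL rather than true, so the row must not survive the filter. A compact sketch of the same check (object name illustrative):

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.StructType

object BooleanSimplificationNullsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("bool-simplification").getOrCreate()

    val schema = StructType.fromDDL("a boolean, b int")
    val df = spark.createDataFrame(spark.sparkContext.parallelize(Seq(Row(null, 1))), schema)

    // Under SQL's three-valued logic, NULL OR (NOT NULL) is NULL, so the filter keeps nothing;
    // rewriting the predicate to literal true would incorrectly return the row.
    println(df.where("(NOT a) OR a").count())   // expected: 0
    spark.stop()
  }
}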
Double.NaN, Double.NaN, Double.NaN, Double.NaN, Double.NaN))) } @@ -326,7 +327,7 @@ class DataFrameWindowFunctionsSuite extends QueryTest with SharedSQLContext { var_samp($"value").over(window), approx_count_distinct($"value").over(window)), Seq.fill(4)(Row("a", 1.0d / 4.0d, 1.0d / 3.0d, 2)) - ++ Seq.fill(3)(Row("b", 2.0d / 3.0d, 1.0d, 3))) + ++ Seq.fill(3)(Row("b", 2.0d / 3.0d, 1.0d, 3))) } test("window function with aggregates") { @@ -624,7 +625,7 @@ class DataFrameWindowFunctionsSuite extends QueryTest with SharedSQLContext { test("SPARK-24575: Window functions inside WHERE and HAVING clauses") { def checkAnalysisError(df: => DataFrame): Unit = { - val thrownException = the [AnalysisException] thrownBy { + val thrownException = the[AnalysisException] thrownBy { df.queryExecution.analyzed } assert(thrownException.message.contains("window functions inside WHERE and HAVING clauses")) @@ -658,4 +659,26 @@ class DataFrameWindowFunctionsSuite extends QueryTest with SharedSQLContext { |GROUP BY a |HAVING SUM(b) = 5 AND RANK() OVER(ORDER BY a) = 1""".stripMargin)) } + + test("window functions in multiple selects") { + val df = Seq( + ("S1", "P1", 100), + ("S1", "P1", 700), + ("S2", "P1", 200), + ("S2", "P2", 300) + ).toDF("sno", "pno", "qty") + + val w1 = Window.partitionBy("sno") + val w2 = Window.partitionBy("sno", "pno") + + checkAnswer( + df.select($"sno", $"pno", $"qty", sum($"qty").over(w2).alias("sum_qty_2")) + .select($"sno", $"pno", $"qty", col("sum_qty_2"), sum("qty").over(w1).alias("sum_qty_1")), + Seq( + Row("S1", "P1", 100, 800, 800), + Row("S1", "P1", 700, 800, 800), + Row("S2", "P1", 200, 200, 500), + Row("S2", "P2", 300, 300, 500))) + + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala index 1a0672b8876d..e3df449b41f0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala @@ -17,17 +17,24 @@ package org.apache.spark.sql -import org.apache.spark.{SparkConf, SparkContext} +import org.apache.spark.benchmark.Benchmark +import org.apache.spark.sql.execution.benchmark.SqlBasedBenchmark import org.apache.spark.sql.expressions.Aggregator import org.apache.spark.sql.expressions.scalalang.typed import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.StringType -import org.apache.spark.util.Benchmark /** * Benchmark for Dataset typed operations comparing with DataFrame and RDD versions. + * To run this benchmark: + * {{{ + * 1. without sbt: bin/spark-submit --class + * 2. build/sbt "sql/test:runMain " + * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain " + * Results will be written to "benchmarks/DatasetBenchmark-results.txt". 
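Editor's note: the new "window functions in multiple selects" test above layers two different window specifications across chained selects. A sketch of the same pattern with the suite's data:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{col, sum}

object WindowsAcrossSelectsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("windows").getOrCreate()
    import spark.implicits._

    val df = Seq(("S1", "P1", 100), ("S1", "P1", 700), ("S2", "P1", 200), ("S2", "P2", 300))
      .toDF("sno", "pno", "qty")

    val bySupplier = Window.partitionBy("sno")                 // coarser partitioning
    val bySupplierAndPart = Window.partitionBy("sno", "pno")   // finer partitioning

    // The first select computes the finer windowed sum; the second adds the coarser one on top.
    df.select($"sno", $"pno", $"qty", sum($"qty").over(bySupplierAndPart).as("sum_qty_2"))
      .select($"sno", $"pno", $"qty", col("sum_qty_2"), sum($"qty").over(bySupplier).as("sum_qty_1"))
      .show()

    spark.stop()
  }
}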
+ * }}} */ -object DatasetBenchmark { +object DatasetBenchmark extends SqlBasedBenchmark { case class Data(l: Long, s: String) @@ -39,7 +46,7 @@ object DatasetBenchmark { val df = ds.toDF("l") val func = (l: Long) => l + 1 - val benchmark = new Benchmark("back-to-back map long", numRows) + val benchmark = new Benchmark("back-to-back map long", numRows, output = output) benchmark.addCase("RDD") { iter => var res = rdd @@ -78,7 +85,7 @@ object DatasetBenchmark { import spark.implicits._ val df = spark.range(1, numRows).select($"id".as("l"), $"id".cast(StringType).as("s")) - val benchmark = new Benchmark("back-to-back map", numRows) + val benchmark = new Benchmark("back-to-back map", numRows, output = output) val func = (d: Data) => Data(d.l + 1, d.s) val rdd = spark.sparkContext.range(1, numRows).map(l => Data(l, l.toString)) @@ -123,7 +130,7 @@ object DatasetBenchmark { val df = ds.toDF("l") val func = (l: Long) => l % 2L == 0L - val benchmark = new Benchmark("back-to-back filter Long", numRows) + val benchmark = new Benchmark("back-to-back filter Long", numRows, output = output) benchmark.addCase("RDD") { iter => var res = rdd @@ -162,7 +169,7 @@ object DatasetBenchmark { import spark.implicits._ val df = spark.range(1, numRows).select($"id".as("l"), $"id".cast(StringType).as("s")) - val benchmark = new Benchmark("back-to-back filter", numRows) + val benchmark = new Benchmark("back-to-back filter", numRows, output = output) val func = (d: Data, i: Int) => d.l % (100L + i) == 0L val funcs = 0.until(numChains).map { i => (d: Data) => func(d, i) @@ -220,7 +227,7 @@ object DatasetBenchmark { import spark.implicits._ val df = spark.range(1, numRows).select($"id".as("l"), $"id".cast(StringType).as("s")) - val benchmark = new Benchmark("aggregate", numRows) + val benchmark = new Benchmark("aggregate", numRows, output = output) val rdd = spark.sparkContext.range(1, numRows).map(l => Data(l, l.toString)) benchmark.addCase("RDD sum") { iter => @@ -242,75 +249,22 @@ object DatasetBenchmark { benchmark } - def main(args: Array[String]): Unit = { - val spark = SparkSession.builder + override def getSparkSession: SparkSession = { + SparkSession.builder .master("local[*]") .appName("Dataset benchmark") .getOrCreate() + } + override def runBenchmarkSuite(): Unit = { val numRows = 100000000 val numChains = 10 - - val benchmark0 = backToBackMapLong(spark, numRows, numChains) - val benchmark1 = backToBackMap(spark, numRows, numChains) - val benchmark2 = backToBackFilterLong(spark, numRows, numChains) - val benchmark3 = backToBackFilter(spark, numRows, numChains) - val benchmark4 = aggregate(spark, numRows) - - /* - OpenJDK 64-Bit Server VM 1.8.0_111-8u111-b14-2ubuntu0.16.04.2-b14 on Linux 4.4.0-47-generic - Intel(R) Xeon(R) CPU E5-2667 v3 @ 3.20GHz - back-to-back map long: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - RDD 1883 / 1892 53.1 18.8 1.0X - DataFrame 502 / 642 199.1 5.0 3.7X - Dataset 657 / 784 152.2 6.6 2.9X - */ - benchmark0.run() - - /* - OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 3.10.0-327.18.2.el7.x86_64 - Intel Xeon E3-12xx v2 (Ivy Bridge) - back-to-back map: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - RDD 3448 / 3646 29.0 34.5 1.0X - DataFrame 2647 / 3116 37.8 26.5 1.3X - Dataset 4781 / 5155 20.9 47.8 0.7X - */ - benchmark1.run() - - /* - OpenJDK 64-Bit Server VM 
1.8.0_121-8u121-b13-0ubuntu1.16.04.2-b13 on Linux 4.4.0-47-generic - Intel(R) Xeon(R) CPU E5-2667 v3 @ 3.20GHz - back-to-back filter Long: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - RDD 846 / 1120 118.1 8.5 1.0X - DataFrame 270 / 329 370.9 2.7 3.1X - Dataset 545 / 789 183.5 5.4 1.6X - */ - benchmark2.run() - - /* - OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 3.10.0-327.18.2.el7.x86_64 - Intel Xeon E3-12xx v2 (Ivy Bridge) - back-to-back filter: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - RDD 1346 / 1618 74.3 13.5 1.0X - DataFrame 59 / 72 1695.4 0.6 22.8X - Dataset 2777 / 2805 36.0 27.8 0.5X - */ - benchmark3.run() - - /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.12.1 - Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz - aggregate: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - RDD sum 1913 / 1942 52.3 19.1 1.0X - DataFrame sum 46 / 61 2157.7 0.5 41.3X - Dataset sum using Aggregator 4656 / 4758 21.5 46.6 0.4X - Dataset complex Aggregator 6636 / 7039 15.1 66.4 0.3X - */ - benchmark4.run() + runBenchmark("Dataset Benchmark") { + backToBackMapLong(spark, numRows, numChains).run() + backToBackMap(spark, numRows, numChains).run() + backToBackFilterLong(spark, numRows, numChains).run() + backToBackFilter(spark, numRows, numChains).run() + aggregate(spark, numRows).run() + } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetCacheSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetCacheSuite.scala index 5c6a021d5b76..fef6ddd0b93c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetCacheSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetCacheSuite.scala @@ -127,8 +127,8 @@ class DatasetCacheSuite extends QueryTest with SharedSQLContext with TimeLimits } test("cache UDF result correctly") { - val expensiveUDF = udf({x: Int => Thread.sleep(5000); x}) - val df = spark.range(0, 10).toDF("a").withColumn("b", expensiveUDF($"a")) + val expensiveUDF = udf({x: Int => Thread.sleep(2000); x}) + val df = spark.range(0, 2).toDF("a").repartition(1).withColumn("b", expensiveUDF($"a")) val df2 = df.agg(sum(df("b"))) df.cache() @@ -136,7 +136,7 @@ class DatasetCacheSuite extends QueryTest with SharedSQLContext with TimeLimits assertCached(df2) // udf has been evaluated during caching, and thus should not be re-evaluated here - failAfter(3 seconds) { + failAfter(2 seconds) { df2.collect() } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index cf24eba12801..4e593ff046a5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -611,7 +611,7 @@ class DatasetSuite extends QueryTest with SharedSQLContext { ).toDF("id", "stringData") val sampleDF = df.sample(false, 0.7, 50) // After sampling, sampleDF doesn't contain id=1. - assert(!sampleDF.select("id").collect.contains(1)) + assert(!sampleDF.select("id").as[Int].collect.contains(1)) // simpleUdf should not encounter id=1. 
checkAnswer(sampleDF.select(simpleUdf($"id")), List.fill(sampleDF.count.toInt)(Row(1))) } @@ -969,6 +969,55 @@ class DatasetSuite extends QueryTest with SharedSQLContext { checkShowString(ds, expected) } + test("SPARK-25108 Fix the show method to display the full width character alignment problem") { + // scalastyle:off nonascii + val df = Seq( + (0, null, 1), + (0, "", 1), + (0, "ab c", 1), + (0, "1098", 1), + (0, "mø", 1), + (0, "γύρ", 1), + (0, "pê", 1), + (0, "ー", 1), + (0, "测", 1), + (0, "か", 1), + (0, "걸", 1), + (0, "à", 1), + (0, "焼", 1), + (0, "羍む", 1), + (0, "뺭ᾘ", 1), + (0, "\u0967\u0968\u0969", 1) + ).toDF("b", "a", "c") + // scalastyle:on nonascii + val ds = df.as[ClassData] + val expected = + // scalastyle:off nonascii + """+---+----+---+ + || b| a| c| + |+---+----+---+ + || 0|null| 1| + || 0| | 1| + || 0|ab c| 1| + || 0|1098| 1| + || 0| mø| 1| + || 0| γύρ| 1| + || 0| pê| 1| + || 0| ー| 1| + || 0| 测| 1| + || 0| か| 1| + || 0| 걸| 1| + || 0| à| 1| + || 0| 焼| 1| + || 0|羍む| 1| + || 0| 뺭ᾘ| 1| + || 0| १२३| 1| + |+---+----+---+ + |""".stripMargin + // scalastyle:on nonascii + checkShowString(ds, expected) + } + test( "SPARK-15112: EmbedDeserializerInFilter should not optimize plan fragment that changes schema" ) { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala index 3af80b36ec42..c4ec7150c407 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala @@ -23,7 +23,6 @@ import java.util.Locale import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.functions._ -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.unsafe.types.CalendarInterval @@ -730,12 +729,4 @@ class DateFunctionsSuite extends QueryTest with SharedSQLContext { Row(Timestamp.valueOf("2015-07-24 07:00:00")), Row(Timestamp.valueOf("2015-07-24 22:00:00")))) } - - test("SPARK-23715: to/from_utc_timestamp can retain the previous behavior") { - withSQLConf(SQLConf.REJECT_TIMEZONE_IN_STRING.key -> "false") { - checkAnswer( - sql("SELECT from_utc_timestamp('2000-10-10 00:00:00+00:00', 'GMT+1')"), - Row(Timestamp.valueOf("2000-10-09 18:00:00"))) - } - } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala index 4aa6afd69620..94f163708832 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala @@ -20,10 +20,13 @@ package org.apache.spark.sql import java.io.{File, FileNotFoundException} import java.util.Locale +import scala.collection.mutable + import org.apache.hadoop.fs.Path import org.scalatest.BeforeAndAfterAll import org.apache.spark.SparkException +import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd} import org.apache.spark.sql.TestingUDT.{IntervalData, IntervalUDT, NullData, NullUDT} import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf @@ -431,46 +434,68 @@ class FileBasedDataSourceSuite extends QueryTest with SharedSQLContext with Befo } } - test(s"SPARK-25132: case-insensitive field resolution when reading from Parquet") { - withTempDir { dir => - val format = "parquet" - val tableDir = dir.getCanonicalPath + s"/$format" - val tableName = 
s"spark_25132_${format}" - withTable(tableName) { - val end = 5 - val data = spark.range(end).selectExpr("id as A", "id * 2 as b", "id * 3 as B") - withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { - data.write.format(format).mode("overwrite").save(tableDir) - } - sql(s"CREATE TABLE $tableName (a LONG, b LONG) USING $format LOCATION '$tableDir'") - - withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { - checkAnswer(sql(s"select a from $tableName"), data.select("A")) - checkAnswer(sql(s"select A from $tableName"), data.select("A")) - - // RuntimeException is triggered at executor side, which is then wrapped as - // SparkException at driver side - val e1 = intercept[SparkException] { - sql(s"select b from $tableName").collect() + Seq("parquet", "orc").foreach { format => + test(s"Spark native readers should respect spark.sql.caseSensitive - ${format}") { + withTempDir { dir => + val tableName = s"spark_25132_${format}_native" + val tableDir = dir.getCanonicalPath + s"/$tableName" + withTable(tableName) { + val end = 5 + val data = spark.range(end).selectExpr("id as A", "id * 2 as b", "id * 3 as B") + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { + data.write.format(format).mode("overwrite").save(tableDir) } - assert( - e1.getCause.isInstanceOf[RuntimeException] && - e1.getCause.getMessage.contains( - """Found duplicate field(s) "b": [b, B] in case-insensitive mode""")) - val e2 = intercept[SparkException] { - sql(s"select B from $tableName").collect() + sql(s"CREATE TABLE $tableName (a LONG, b LONG) USING $format LOCATION '$tableDir'") + + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { + checkAnswer(sql(s"select a from $tableName"), data.select("A")) + checkAnswer(sql(s"select A from $tableName"), data.select("A")) + + // RuntimeException is triggered at executor side, which is then wrapped as + // SparkException at driver side + val e1 = intercept[SparkException] { + sql(s"select b from $tableName").collect() + } + assert( + e1.getCause.isInstanceOf[RuntimeException] && + e1.getCause.getMessage.contains( + """Found duplicate field(s) "b": [b, B] in case-insensitive mode""")) + val e2 = intercept[SparkException] { + sql(s"select B from $tableName").collect() + } + assert( + e2.getCause.isInstanceOf[RuntimeException] && + e2.getCause.getMessage.contains( + """Found duplicate field(s) "b": [b, B] in case-insensitive mode""")) + } + + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { + checkAnswer(sql(s"select a from $tableName"), (0 until end).map(_ => Row(null))) + checkAnswer(sql(s"select b from $tableName"), data.select("b")) } - assert( - e2.getCause.isInstanceOf[RuntimeException] && - e2.getCause.getMessage.contains( - """Found duplicate field(s) "b": [b, B] in case-insensitive mode""")) } + } + } + } - withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { - checkAnswer(sql(s"select a from $tableName"), (0 until end).map(_ => Row(null))) - checkAnswer(sql(s"select b from $tableName"), data.select("b")) + test("SPARK-25237 compute correct input metrics in FileScanRDD") { + withTempPath { p => + val path = p.getAbsolutePath + spark.range(1000).repartition(1).write.csv(path) + val bytesReads = new mutable.ArrayBuffer[Long]() + val bytesReadListener = new SparkListener() { + override def onTaskEnd(taskEnd: SparkListenerTaskEnd) { + bytesReads += taskEnd.taskMetrics.inputMetrics.bytesRead } } + sparkContext.addSparkListener(bytesReadListener) + try { + spark.read.csv(path).limit(1).collect() + sparkContext.listenerBus.waitUntilEmpty(1000L) + assert(bytesReads.sum === 7860) + } 
finally { + sparkContext.removeSparkListener(bytesReadListener) + } } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala index 44767dfc9249..aa2162c9d2cd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql +import java.util.Locale + import scala.collection.JavaConverters._ import scala.collection.mutable.ListBuffer import scala.language.existentials @@ -831,7 +833,7 @@ class JoinSuite extends QueryTest with SharedSQLContext { case _ => } val joinPairs = physicalJoins.zip(executedJoins) - val numOfJoins = sqlString.split(" ").count(_.toUpperCase == "JOIN") + val numOfJoins = sqlString.split(" ").count(_.toUpperCase(Locale.ROOT) == "JOIN") assert(joinPairs.size == numOfJoins) joinPairs.foreach { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala index fe4bf15fa392..5cbf10129a4d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql +import collection.JavaConverters._ + import org.apache.spark.sql.functions._ import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ @@ -402,6 +404,12 @@ class JsonFunctionsSuite extends QueryTest with SharedSQLContext { assert(out.schema == expected) } + test("infers schemas using options") { + val df = spark.range(1) + .select(schema_of_json(lit("{a:1}"), Map("allowUnquotedFieldNames" -> "true").asJava)) + checkAnswer(df, Seq(Row("struct"))) + } + test("from_json - array of primitive types") { val df = Seq("[1, 2, 3]").toDF("a") val schema = new ArrayType(IntegerType, false) @@ -518,4 +526,25 @@ class JsonFunctionsSuite extends QueryTest with SharedSQLContext { jsonDF.select(to_json(from_json($"a", schema))), Seq(Row(json))) } + + test("pretty print - roundtrip from_json -> to_json") { + val json = """[{"book":{"publisher":[{"country":"NL","year":[1981,1986,1999]}]}}]""" + val jsonDF = Seq(json).toDF("root") + val expected = + """[ { + | "book" : { + | "publisher" : [ { + | "country" : "NL", + | "year" : [ 1981, 1986, 1999 ] + | } ] + | } + |} ]""".stripMargin + + checkAnswer( + jsonDF.select( + to_json( + from_json($"root", schema_of_json(lit(json))), + Map("pretty" -> "true"))), + Seq(Row(expected))) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 01dc28d70184..631ab1b7ece7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -524,15 +524,6 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { sortTest() } - test("limit for skew dataframe") { - // Create a skew dataframe. - val df = testData.repartition(100).union(testData).limit(50) - // Because `rdd` of dataframe will add a `DeserializeToObject` on top of `GlobalLimit`, - // the `GlobalLimit` will not be replaced with `CollectLimit`. So we can test if `GlobalLimit` - // work on skew partitions. 
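A usage sketch of the JSON function APIs exercised by the new JsonFunctionsSuite cases above (illustrative only, not part of the patch; assumes `spark` is an active SparkSession, and the exact inferred schema string is version-dependent):

    import scala.collection.JavaConverters._
    import org.apache.spark.sql.functions.{from_json, lit, schema_of_json, to_json}
    import spark.implicits._

    // schema_of_json now also accepts JSON parser options (here: unquoted field names).
    spark.range(1)
      .select(schema_of_json(lit("{a:1}"), Map("allowUnquotedFieldNames" -> "true").asJava))
      .show(false)

    // to_json(..., Map("pretty" -> "true")) pretty-prints the value round-tripped through from_json.
    val json = """[{"book":{"publisher":[{"country":"NL","year":[1981,1986,1999]}]}}]"""
    Seq(json).toDF("root")
      .select(to_json(from_json($"root", schema_of_json(lit(json))), Map("pretty" -> "true")))
      .show(false)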
- assert(df.rdd.count() == 50L) - } - test("CTE feature") { checkAnswer( sql("with q1 as (select * from testData limit 10) select * from q1"), @@ -1944,7 +1935,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { // TODO: support subexpression elimination in whole stage codegen withSQLConf("spark.sql.codegen.wholeStage" -> "false") { // select from a table to prevent constant folding. - val df = sql("SELECT a, b from testData2 order by a, b limit 1") + val df = sql("SELECT a, b from testData2 limit 1") checkAnswer(df, Row(1, 1)) checkAnswer(df.selectExpr("a + 1", "a + 1"), Row(2, 2)) @@ -2858,6 +2849,13 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { val result = ds.flatMap(_.bar).distinct result.rdd.isEmpty } + + test("SPARK-25454: decimal division with negative scale") { + // TODO: completely fix this issue even when LITERAL_PRECISE_PRECISION is true. + withSQLConf(SQLConf.LITERAL_PICK_MINIMUM_PRECISION.key -> "false") { + checkAnswer(sql("select 26393499451 / (1e6 * 1000)"), Row(BigDecimal("26.3934994510000"))) + } + } } case class Foo(bar: Option[String]) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SessionStateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SessionStateSuite.scala index 7d1366092d1e..e1b5eba53f06 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SessionStateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SessionStateSuite.scala @@ -41,13 +41,16 @@ class SessionStateSuite extends SparkFunSuite { } override def afterAll(): Unit = { - if (activeSession != null) { - activeSession.stop() - activeSession = null - SparkSession.clearActiveSession() - SparkSession.clearDefaultSession() + try { + if (activeSession != null) { + activeSession.stop() + activeSession = null + SparkSession.clearActiveSession() + SparkSession.clearDefaultSession() + } + } finally { + super.afterAll() } - super.afterAll() } test("fork new session and inherit RuntimeConfig options") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala index 3d76b9ac33e5..bb19fde2b2b5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala @@ -329,16 +329,52 @@ class StringFunctionsSuite extends QueryTest with SharedSQLContext { Row(" ")) } - test("string split function") { - val df = Seq(("aa2bb3cc", "[1-9]+")).toDF("a", "b") + test("string split function with no limit") { + val df = Seq(("aa2bb3cc4", "[1-9]+")).toDF("a", "b") checkAnswer( df.select(split($"a", "[1-9]+")), - Row(Seq("aa", "bb", "cc"))) + Row(Seq("aa", "bb", "cc", ""))) checkAnswer( df.selectExpr("split(a, '[1-9]+')"), - Row(Seq("aa", "bb", "cc"))) + Row(Seq("aa", "bb", "cc", ""))) + } + + test("string split function with limit explicitly set to 0") { + val df = Seq(("aa2bb3cc4", "[1-9]+")).toDF("a", "b") + + checkAnswer( + df.select(split($"a", "[1-9]+", 0)), + Row(Seq("aa", "bb", "cc", ""))) + + checkAnswer( + df.selectExpr("split(a, '[1-9]+', 0)"), + Row(Seq("aa", "bb", "cc", ""))) + } + + test("string split function with positive limit") { + val df = Seq(("aa2bb3cc4", "[1-9]+")).toDF("a", "b") + + checkAnswer( + df.select(split($"a", "[1-9]+", 2)), + Row(Seq("aa", "bb3cc4"))) + + checkAnswer( + df.selectExpr("split(a, '[1-9]+', 2)"), + Row(Seq("aa", "bb3cc4"))) + } + + test("string split function with negative limit") { + val df = Seq(("aa2bb3cc4", 
"[1-9]+")).toDF("a", "b") + + checkAnswer( + df.select(split($"a", "[1-9]+", -2)), + Row(Seq("aa", "bb", "cc", ""))) + + checkAnswer( + df.selectExpr("split(a, '[1-9]+', -2)"), + Row(Seq("aa", "bb", "cc", ""))) } test("string / binary length function") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TPCHQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/TPCHQuerySuite.scala index e3e700529bba..b32d95d0b286 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/TPCHQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/TPCHQuerySuite.scala @@ -69,7 +69,7 @@ class TPCHQuerySuite extends BenchmarkQueryTest { sql( """ |CREATE TABLE `customer` (`c_custkey` BIGINT, `c_name` STRING, `c_address` STRING, - |`c_nationkey` STRING, `c_phone` STRING, `c_acctbal` DECIMAL(10,0), + |`c_nationkey` BIGINT, `c_phone` STRING, `c_acctbal` DECIMAL(10,0), |`c_mktsegment` STRING, `c_comment` STRING) |USING parquet """.stripMargin) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/DataSourceScanExecRedactionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/DataSourceScanExecRedactionSuite.scala index c8d045a32d73..11a1c9a1f9b9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/DataSourceScanExecRedactionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/DataSourceScanExecRedactionSuite.scala @@ -83,4 +83,20 @@ class DataSourceScanExecRedactionSuite extends QueryTest with SharedSQLContext { } } + test("FileSourceScanExec metadata") { + withTempPath { path => + val dir = path.getCanonicalPath + spark.range(0, 10).write.parquet(dir) + val df = spark.read.parquet(dir) + + assert(isIncluded(df.queryExecution, "Format")) + assert(isIncluded(df.queryExecution, "ReadSchema")) + assert(isIncluded(df.queryExecution, "Batched")) + assert(isIncluded(df.queryExecution, "PartitionFilters")) + assert(isIncluded(df.queryExecution, "PushedFilters")) + assert(isIncluded(df.queryExecution, "DataFilters")) + assert(isIncluded(df.queryExecution, "Location")) + } + } + } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ExchangeCoordinatorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ExchangeCoordinatorSuite.scala index 41de731d41f8..6ad025f37e44 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ExchangeCoordinatorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ExchangeCoordinatorSuite.scala @@ -31,6 +31,7 @@ class ExchangeCoordinatorSuite extends SparkFunSuite with BeforeAndAfterAll { private var originalInstantiatedSparkSession: Option[SparkSession] = _ override protected def beforeAll(): Unit = { + super.beforeAll() originalActiveSparkSession = SparkSession.getActiveSession originalInstantiatedSparkSession = SparkSession.getDefaultSession @@ -39,9 +40,13 @@ class ExchangeCoordinatorSuite extends SparkFunSuite with BeforeAndAfterAll { } override protected def afterAll(): Unit = { - // Set these states back. - originalActiveSparkSession.foreach(ctx => SparkSession.setActiveSession(ctx)) - originalInstantiatedSparkSession.foreach(ctx => SparkSession.setDefaultSession(ctx)) + try { + // Set these states back. 
+ originalActiveSparkSession.foreach(ctx => SparkSession.setActiveSession(ctx)) + originalInstantiatedSparkSession.foreach(ctx => SparkSession.setDefaultSession(ctx)) + } finally { + super.afterAll() + } } private def checkEstimation( @@ -50,7 +55,7 @@ class ExchangeCoordinatorSuite extends SparkFunSuite with BeforeAndAfterAll { expectedPartitionStartIndices: Array[Int]): Unit = { val mapOutputStatistics = bytesByPartitionIdArray.zipWithIndex.map { case (bytesByPartitionId, index) => - new MapOutputStatistics(index, bytesByPartitionId, Array[Long](1)) + new MapOutputStatistics(index, bytesByPartitionId) } val estimatedPartitionStartIndices = coordinator.estimatePartitionStartIndices(mapOutputStatistics) @@ -114,8 +119,8 @@ class ExchangeCoordinatorSuite extends SparkFunSuite with BeforeAndAfterAll { val bytesByPartitionId2 = Array[Long](0, 0, 0, 0, 0, 0) val mapOutputStatistics = Array( - new MapOutputStatistics(0, bytesByPartitionId1, Array[Long](0)), - new MapOutputStatistics(1, bytesByPartitionId2, Array[Long](0))) + new MapOutputStatistics(0, bytesByPartitionId1), + new MapOutputStatistics(1, bytesByPartitionId2)) intercept[AssertionError](coordinator.estimatePartitionStartIndices(mapOutputStatistics)) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ExternalAppendOnlyUnsafeRowArrayBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ExternalAppendOnlyUnsafeRowArrayBenchmark.scala index 59397dbcb1ca..611b2fc037f3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ExternalAppendOnlyUnsafeRowArrayBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ExternalAppendOnlyUnsafeRowArrayBenchmark.scala @@ -20,10 +20,10 @@ package org.apache.spark.sql.execution import scala.collection.mutable.ArrayBuffer import org.apache.spark.{SparkConf, SparkContext, SparkEnv, TaskContext} +import org.apache.spark.benchmark.Benchmark import org.apache.spark.internal.config import org.apache.spark.memory.MemoryTestingUtils import org.apache.spark.sql.catalyst.expressions.UnsafeRow -import org.apache.spark.util.Benchmark import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter object ExternalAppendOnlyUnsafeRowArrayBenchmark { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ExternalAppendOnlyUnsafeRowArraySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ExternalAppendOnlyUnsafeRowArraySuite.scala index ecc7264d7944..b29de9c4adba 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ExternalAppendOnlyUnsafeRowArraySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ExternalAppendOnlyUnsafeRowArraySuite.scala @@ -29,7 +29,11 @@ class ExternalAppendOnlyUnsafeRowArraySuite extends SparkFunSuite with LocalSpar private val random = new java.util.Random() private var taskContext: TaskContext = _ - override def afterAll(): Unit = TaskContext.unset() + override def afterAll(): Unit = try { + TaskContext.unset() + } finally { + super.afterAll() + } private def withExternalArray(inMemoryThreshold: Int, spillThreshold: Int) (f: ExternalAppendOnlyUnsafeRowArray => Unit): Unit = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala index 3db89ecfad9f..e4e224df7607 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala @@ -262,7 +262,7 
@@ class PlannerSuite extends SharedSQLContext { ).queryExecution.executedPlan.collect { case exchange: ShuffleExchangeExec => exchange }.length - assert(numExchanges === 3) + assert(numExchanges === 5) } { @@ -277,7 +277,7 @@ class PlannerSuite extends SharedSQLContext { ).queryExecution.executedPlan.collect { case exchange: ShuffleExchangeExec => exchange }.length - assert(numExchanges === 3) + assert(numExchanges === 5) } } @@ -704,6 +704,23 @@ class PlannerSuite extends SharedSQLContext { df.queryExecution.executedPlan.execute() } + test("SPARK-25278: physical nodes should be different instances for same logical nodes") { + val range = Range(1, 1, 1, 1) + val df = Union(range, range) + val ranges = df.queryExecution.optimizedPlan.collect { + case r: Range => r + } + assert(ranges.length == 2) + // Ensure the two Range instances are equal according to their equal method + assert(ranges.head == ranges.last) + val execRanges = df.queryExecution.sparkPlan.collect { + case r: RangeExec => r + } + assert(execRanges.length == 2) + // Ensure the two RangeExec instances are different instances + assert(!execRanges.head.eq(execRanges.last)) + } + test("SPARK-24556: always rewrite output partitioning in ReusedExchangeExec " + "and InMemoryTableScanExec") { def checkOutputPartitioningRewrite( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLJsonProtocolSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLJsonProtocolSuite.scala index c2e62b987e0c..08e40e28d3d5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLJsonProtocolSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLJsonProtocolSuite.scala @@ -46,7 +46,7 @@ class SQLJsonProtocolSuite extends SparkFunSuite { """.stripMargin val reconstructedEvent = JsonProtocol.sparkEventFromJson(parse(SQLExecutionStartJsonString)) val expectedEvent = SparkListenerSQLExecutionStart(0, "test desc", "test detail", "test plan", - new SparkPlanInfo("TestNode", "test string", Nil, Nil), 0) + new SparkPlanInfo("TestNode", "test string", Nil, Map(), Nil), 0) assert(reconstructedEvent == expectedEvent) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanSuite.scala index 34dc6f37c0e4..47ff372992b9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanSuite.scala @@ -50,4 +50,12 @@ class SparkPlanSuite extends QueryTest with SharedSQLContext { } } } + + test("SPARK-25357 SparkPlanInfo of FileScan contains nonEmpty metadata") { + withTempPath { path => + spark.range(5).write.parquet(path.getAbsolutePath) + val f = spark.read.parquet(path.getAbsolutePath) + assert(SparkPlanInfo.fromSparkPlan(f.queryExecution.sparkPlan).metadata.nonEmpty) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala index 28a060aff47b..31b9bcdafbab 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala @@ -323,12 +323,22 @@ class SparkSqlParserSuite extends AnalysisTest { intercept("ANALYZE TABLE t COMPUTE STATISTICS FOR COLUMNS", "") assertEqual("ANALYZE TABLE t COMPUTE STATISTICS FOR COLUMNS key, value", - 
AnalyzeColumnCommand(TableIdentifier("t"), Seq("key", "value"))) + AnalyzeColumnCommand(TableIdentifier("t"), Option(Seq("key", "value")), allColumns = false)) // Partition specified - should be ignored assertEqual("ANALYZE TABLE t PARTITION(ds='2017-06-10') " + "COMPUTE STATISTICS FOR COLUMNS key, value", - AnalyzeColumnCommand(TableIdentifier("t"), Seq("key", "value"))) + AnalyzeColumnCommand(TableIdentifier("t"), Option(Seq("key", "value")), allColumns = false)) + + // Partition specified should be ignored in case of COMPUTE STATISTICS FOR ALL COLUMNS + assertEqual("ANALYZE TABLE t PARTITION(ds='2017-06-10') " + + "COMPUTE STATISTICS FOR ALL COLUMNS", + AnalyzeColumnCommand(TableIdentifier("t"), None, allColumns = true)) + + intercept("ANALYZE TABLE t COMPUTE STATISTICS FOR ALL COLUMNS key, value", + "mismatched input 'key' expecting ") + intercept("ANALYZE TABLE t COMPUTE STATISTICS FOR ALL", + "missing 'COLUMNS' at ''") } test("query organization") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/TakeOrderedAndProjectSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/TakeOrderedAndProjectSuite.scala index f076959dfdf7..7e317a4d8026 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/TakeOrderedAndProjectSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/TakeOrderedAndProjectSuite.scala @@ -22,7 +22,6 @@ import scala.util.Random import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions.Literal -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ @@ -32,20 +31,10 @@ class TakeOrderedAndProjectSuite extends SparkPlanTest with SharedSQLContext { private var rand: Random = _ private var seed: Long = 0 - private val originalLimitFlatGlobalLimit = SQLConf.get.getConf(SQLConf.LIMIT_FLAT_GLOBAL_LIMIT) - protected override def beforeAll(): Unit = { super.beforeAll() seed = System.currentTimeMillis() rand = new Random(seed) - - // Disable the optimization to make Sort-Limit match `TakeOrderedAndProject` semantics. 
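The SQL forms accepted after the parser change tested above, as a short sketch (illustrative only, not part of the patch; table `t` is hypothetical and must already exist):

    spark.sql("ANALYZE TABLE t COMPUTE STATISTICS FOR COLUMNS key, value")  // named columns
    spark.sql("ANALYZE TABLE t COMPUTE STATISTICS FOR ALL COLUMNS")         // all columns
    // Listing column names after ALL COLUMNS, or omitting COLUMNS, is a parse error,
    // as the intercepted cases above show.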
- SQLConf.get.setConf(SQLConf.LIMIT_FLAT_GLOBAL_LIMIT, false) - } - - protected override def afterAll() = { - SQLConf.get.setConf(SQLConf.LIMIT_FLAT_GLOBAL_LIMIT, originalLimitFlatGlobalLimit) - super.afterAll() } private def generateRandomInputData(): DataFrame = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationStoreSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationStoreSuite.scala index 3fad7dfddadc..dc6744646087 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationStoreSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationStoreSuite.scala @@ -39,7 +39,11 @@ class SortBasedAggregationStoreSuite extends SparkFunSuite with LocalSparkConte new TaskContextImpl(0, 0, 0, 0, 0, taskManager, new Properties, null)) } - override def afterAll(): Unit = TaskContext.unset() + override def afterAll(): Unit = try { + TaskContext.unset() + } finally { + super.afterAll() + } private val rand = new java.util.Random() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala index 8f4ee8533e59..86e0df2fea35 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.execution.benchmark import java.util.HashMap import org.apache.spark.SparkConf +import org.apache.spark.benchmark.Benchmark import org.apache.spark.internal.config._ import org.apache.spark.memory.{StaticMemoryManager, TaskMemoryManager} import org.apache.spark.sql.catalyst.expressions.UnsafeRow @@ -30,625 +31,541 @@ import org.apache.spark.sql.types.{LongType, StructType} import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.hash.Murmur3_x86_32 import org.apache.spark.unsafe.map.BytesToBytesMap -import org.apache.spark.util.Benchmark /** * Benchmark to measure performance for aggregate primitives. - * To run this: - * build/sbt "sql/test-only *benchmark.AggregateBenchmark" - * - * Benchmarks in this file are skipped in normal builds. + * To run this benchmark: + * {{{ + * 1. without sbt: bin/spark-submit --class + * 2. build/sbt "sql/test:runMain " + * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain " + * Results will be written to "benchmarks/AggregateBenchmark-results.txt". 
+ * }}} */ -class AggregateBenchmark extends BenchmarkBase { +object AggregateBenchmark extends SqlBasedBenchmark { - ignore("aggregate without grouping") { - val N = 500L << 22 - val benchmark = new Benchmark("agg without grouping", N) - runBenchmark("agg w/o group", N) { - sparkSession.range(N).selectExpr("sum(id)").collect() + override def runBenchmarkSuite(): Unit = { + runBenchmark("aggregate without grouping") { + val N = 500L << 22 + codegenBenchmark("agg w/o group", N) { + spark.range(N).selectExpr("sum(id)").collect() + } } - /* - agg w/o group: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - agg w/o group wholestage off 30136 / 31885 69.6 14.4 1.0X - agg w/o group wholestage on 1851 / 1860 1132.9 0.9 16.3X - */ - } - ignore("stat functions") { - val N = 100L << 20 + runBenchmark("stat functions") { + val N = 100L << 20 - runBenchmark("stddev", N) { - sparkSession.range(N).groupBy().agg("id" -> "stddev").collect() - } + codegenBenchmark("stddev", N) { + spark.range(N).groupBy().agg("id" -> "stddev").collect() + } - runBenchmark("kurtosis", N) { - sparkSession.range(N).groupBy().agg("id" -> "kurtosis").collect() + codegenBenchmark("kurtosis", N) { + spark.range(N).groupBy().agg("id" -> "kurtosis").collect() + } } - /* - Using ImperativeAggregate (as implemented in Spark 1.6): - - Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz - stddev: Avg Time(ms) Avg Rate(M/s) Relative Rate - ------------------------------------------------------------------------------- - stddev w/o codegen 2019.04 10.39 1.00 X - stddev w codegen 2097.29 10.00 0.96 X - kurtosis w/o codegen 2108.99 9.94 0.96 X - kurtosis w codegen 2090.69 10.03 0.97 X - - Using DeclarativeAggregate: - - Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz - stddev: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------- - stddev codegen=false 5630 / 5776 18.0 55.6 1.0X - stddev codegen=true 1259 / 1314 83.0 12.0 4.5X - - Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz - kurtosis: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------- - kurtosis codegen=false 14847 / 15084 7.0 142.9 1.0X - kurtosis codegen=true 1652 / 2124 63.0 15.9 9.0X - */ - } - - ignore("aggregate with linear keys") { - val N = 20 << 22 + runBenchmark("aggregate with linear keys") { + val N = 20 << 22 - val benchmark = new Benchmark("Aggregate w keys", N) - def f(): Unit = { - sparkSession.range(N).selectExpr("(id & 65535) as k").groupBy("k").sum().collect() - } + val benchmark = new Benchmark("Aggregate w keys", N, output = output) - benchmark.addCase(s"codegen = F", numIters = 2) { iter => - sparkSession.conf.set("spark.sql.codegen.wholeStage", "false") - f() - } + def f(): Unit = { + spark.range(N).selectExpr("(id & 65535) as k").groupBy("k").sum().collect() + } - benchmark.addCase(s"codegen = T hashmap = F", numIters = 3) { iter => - sparkSession.conf.set("spark.sql.codegen.wholeStage", "true") - sparkSession.conf.set("spark.sql.codegen.aggregate.map.twolevel.enabled", "false") - sparkSession.conf.set("spark.sql.codegen.aggregate.map.vectorized.enable", "false") - f() - } + benchmark.addCase("codegen = F", numIters = 2) { _ => + withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false") { + f() + } + } - benchmark.addCase(s"codegen = T hashmap = T", numIters = 5) { iter => - 
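The refactor above follows one shape for all converted benchmarks; a minimal sketch of that structure under stated assumptions (the object name is hypothetical, and SqlBasedBenchmark is assumed to supply `spark`, `runBenchmark` and `codegenBenchmark` as used in this patch):

    package org.apache.spark.sql.execution.benchmark

    object MyBenchmark extends SqlBasedBenchmark {
      override def runBenchmarkSuite(): Unit = {
        runBenchmark("example group") {
          val N = 1L << 20
          // codegenBenchmark runs the body with whole-stage codegen off and on
          // and reports both timings.
          codegenBenchmark("sum(id)", N) {
            spark.range(N).selectExpr("sum(id)").collect()
          }
        }
      }
    }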
sparkSession.conf.set("spark.sql.codegen.wholeStage", "true") - sparkSession.conf.set("spark.sql.codegen.aggregate.map.twolevel.enabled", "true") - sparkSession.conf.set("spark.sql.codegen.aggregate.map.vectorized.enable", "true") - f() - } + benchmark.addCase("codegen = T hashmap = F", numIters = 3) { _ => + withSQLConf( + SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "true", + SQLConf.ENABLE_TWOLEVEL_AGG_MAP.key -> "false", + "spark.sql.codegen.aggregate.map.vectorized.enable" -> "false") { + f() + } + } - benchmark.run() + benchmark.addCase("codegen = T hashmap = T", numIters = 5) { _ => + withSQLConf( + SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "true", + SQLConf.ENABLE_TWOLEVEL_AGG_MAP.key -> "true", + "spark.sql.codegen.aggregate.map.vectorized.enable" -> "true") { + f() + } + } - /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.11 - Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz + benchmark.run() + } - Aggregate w keys: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - codegen = F 6619 / 6780 12.7 78.9 1.0X - codegen = T hashmap = F 3935 / 4059 21.3 46.9 1.7X - codegen = T hashmap = T 897 / 971 93.5 10.7 7.4X - */ - } + runBenchmark("aggregate with randomized keys") { + val N = 20 << 22 - ignore("aggregate with randomized keys") { - val N = 20 << 22 + val benchmark = new Benchmark("Aggregate w keys", N, output = output) + spark.range(N).selectExpr("id", "floor(rand() * 10000) as k") + .createOrReplaceTempView("test") - val benchmark = new Benchmark("Aggregate w keys", N) - sparkSession.range(N).selectExpr("id", "floor(rand() * 10000) as k") - .createOrReplaceTempView("test") + def f(): Unit = spark.sql("select k, k, sum(id) from test group by k, k").collect() - def f(): Unit = sparkSession.sql("select k, k, sum(id) from test group by k, k").collect() + benchmark.addCase("codegen = F", numIters = 2) { _ => + withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false") { + f() + } + } - benchmark.addCase(s"codegen = F", numIters = 2) { iter => - sparkSession.conf.set("spark.sql.codegen.wholeStage", value = false) - f() - } + benchmark.addCase("codegen = T hashmap = F", numIters = 3) { _ => + withSQLConf( + SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "true", + SQLConf.ENABLE_TWOLEVEL_AGG_MAP.key -> "false", + "spark.sql.codegen.aggregate.map.vectorized.enable" -> "false") { + f() + } + } - benchmark.addCase(s"codegen = T hashmap = F", numIters = 3) { iter => - sparkSession.conf.set("spark.sql.codegen.wholeStage", value = true) - sparkSession.conf.set("spark.sql.codegen.aggregate.map.twolevel.enabled", "false") - sparkSession.conf.set("spark.sql.codegen.aggregate.map.vectorized.enable", "false") - f() - } + benchmark.addCase("codegen = T hashmap = T", numIters = 5) { _ => + withSQLConf( + SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "true", + SQLConf.ENABLE_TWOLEVEL_AGG_MAP.key -> "true", + "spark.sql.codegen.aggregate.map.vectorized.enable" -> "true") { + f() + } + } - benchmark.addCase(s"codegen = T hashmap = T", numIters = 5) { iter => - sparkSession.conf.set("spark.sql.codegen.wholeStage", value = true) - sparkSession.conf.set("spark.sql.codegen.aggregate.map.twolevel.enabled", "true") - sparkSession.conf.set("spark.sql.codegen.aggregate.map.vectorized.enable", "true") - f() + benchmark.run() } - benchmark.run() - - /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.11 - Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz + runBenchmark("aggregate with string key") { + val 
N = 20 << 20 - Aggregate w keys: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - codegen = F 7445 / 7517 11.3 88.7 1.0X - codegen = T hashmap = F 4672 / 4703 18.0 55.7 1.6X - codegen = T hashmap = T 1764 / 1958 47.6 21.0 4.2X - */ - } + val benchmark = new Benchmark("Aggregate w string key", N, output = output) - ignore("aggregate with string key") { - val N = 20 << 20 + def f(): Unit = spark.range(N).selectExpr("id", "cast(id & 1023 as string) as k") + .groupBy("k").count().collect() - val benchmark = new Benchmark("Aggregate w string key", N) - def f(): Unit = sparkSession.range(N).selectExpr("id", "cast(id & 1023 as string) as k") - .groupBy("k").count().collect() + benchmark.addCase("codegen = F", numIters = 2) { _ => + withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false") { + f() + } + } - benchmark.addCase(s"codegen = F", numIters = 2) { iter => - sparkSession.conf.set("spark.sql.codegen.wholeStage", "false") - f() - } + benchmark.addCase("codegen = T hashmap = F", numIters = 3) { _ => + withSQLConf( + SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "true", + SQLConf.ENABLE_TWOLEVEL_AGG_MAP.key -> "false", + "spark.sql.codegen.aggregate.map.vectorized.enable" -> "false") { + f() + } + } - benchmark.addCase(s"codegen = T hashmap = F", numIters = 3) { iter => - sparkSession.conf.set("spark.sql.codegen.wholeStage", "true") - sparkSession.conf.set("spark.sql.codegen.aggregate.map.twolevel.enabled", "false") - sparkSession.conf.set("spark.sql.codegen.aggregate.map.vectorized.enable", "false") - f() - } + benchmark.addCase("codegen = T hashmap = T", numIters = 5) { _ => + withSQLConf( + SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "true", + SQLConf.ENABLE_TWOLEVEL_AGG_MAP.key -> "true", + "spark.sql.codegen.aggregate.map.vectorized.enable" -> "true") { + f() + } + } - benchmark.addCase(s"codegen = T hashmap = T", numIters = 5) { iter => - sparkSession.conf.set("spark.sql.codegen.wholeStage", "true") - sparkSession.conf.set("spark.sql.codegen.aggregate.map.twolevel.enabled", "true") - sparkSession.conf.set("spark.sql.codegen.aggregate.map.vectorized.enable", "true") - f() + benchmark.run() } - benchmark.run() - - /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_73-b02 on Mac OS X 10.11.4 - Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz - Aggregate w string key: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------- - codegen = F 3307 / 3376 6.3 157.7 1.0X - codegen = T hashmap = F 2364 / 2471 8.9 112.7 1.4X - codegen = T hashmap = T 1740 / 1841 12.0 83.0 1.9X - */ - } - - ignore("aggregate with decimal key") { - val N = 20 << 20 - - val benchmark = new Benchmark("Aggregate w decimal key", N) - def f(): Unit = sparkSession.range(N).selectExpr("id", "cast(id & 65535 as decimal) as k") - .groupBy("k").count().collect() - - benchmark.addCase(s"codegen = F") { iter => - sparkSession.conf.set("spark.sql.codegen.wholeStage", "false") - f() - } + runBenchmark("aggregate with decimal key") { + val N = 20 << 20 - benchmark.addCase(s"codegen = T hashmap = F") { iter => - sparkSession.conf.set("spark.sql.codegen.wholeStage", "true") - sparkSession.conf.set("spark.sql.codegen.aggregate.map.twolevel.enabled", "false") - sparkSession.conf.set("spark.sql.codegen.aggregate.map.vectorized.enable", "false") - f() - } + val benchmark = new Benchmark("Aggregate w decimal key", N, output = output) - benchmark.addCase(s"codegen = T 
hashmap = T") { iter => - sparkSession.conf.set("spark.sql.codegen.wholeStage", "true") - sparkSession.conf.set("spark.sql.codegen.aggregate.map.twolevel.enabled", "true") - sparkSession.conf.set("spark.sql.codegen.aggregate.map.vectorized.enable", "true") - f() - } + def f(): Unit = spark.range(N).selectExpr("id", "cast(id & 65535 as decimal) as k") + .groupBy("k").count().collect() - benchmark.run() - - /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_73-b02 on Mac OS X 10.11.4 - Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz - Aggregate w decimal key: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------- - codegen = F 2756 / 2817 7.6 131.4 1.0X - codegen = T hashmap = F 1580 / 1647 13.3 75.4 1.7X - codegen = T hashmap = T 641 / 662 32.7 30.6 4.3X - */ - } + benchmark.addCase("codegen = F") { _ => + withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false") { + f() + } + } - ignore("aggregate with multiple key types") { - val N = 20 << 20 - - val benchmark = new Benchmark("Aggregate w multiple keys", N) - def f(): Unit = sparkSession.range(N) - .selectExpr( - "id", - "(id & 1023) as k1", - "cast(id & 1023 as string) as k2", - "cast(id & 1023 as int) as k3", - "cast(id & 1023 as double) as k4", - "cast(id & 1023 as float) as k5", - "id > 1023 as k6") - .groupBy("k1", "k2", "k3", "k4", "k5", "k6") - .sum() - .collect() - - benchmark.addCase(s"codegen = F") { iter => - sparkSession.conf.set("spark.sql.codegen.wholeStage", "false") - f() - } + benchmark.addCase("codegen = T hashmap = F") { _ => + withSQLConf( + SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "true", + SQLConf.ENABLE_TWOLEVEL_AGG_MAP.key -> "false", + "spark.sql.codegen.aggregate.map.vectorized.enable" -> "false") { + f() + } + } - benchmark.addCase(s"codegen = T hashmap = F") { iter => - sparkSession.conf.set("spark.sql.codegen.wholeStage", "true") - sparkSession.conf.set("spark.sql.codegen.aggregate.map.twolevel.enabled", "false") - sparkSession.conf.set("spark.sql.codegen.aggregate.map.vectorized.enable", "false") - f() - } + benchmark.addCase("codegen = T hashmap = T") { _ => + withSQLConf( + SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "true", + SQLConf.ENABLE_TWOLEVEL_AGG_MAP.key -> "true", + "spark.sql.codegen.aggregate.map.vectorized.enable" -> "true") { + f() + } + } - benchmark.addCase(s"codegen = T hashmap = T") { iter => - sparkSession.conf.set("spark.sql.codegen.wholeStage", "true") - sparkSession.conf.set("spark.sql.codegen.aggregate.map.twolevel.enabled", "true") - sparkSession.conf.set("spark.sql.codegen.aggregate.map.vectorized.enable", "true") - f() + benchmark.run() } - benchmark.run() - - /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_73-b02 on Mac OS X 10.11.4 - Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz - Aggregate w decimal key: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------- - codegen = F 5885 / 6091 3.6 280.6 1.0X - codegen = T hashmap = F 3625 / 4009 5.8 172.8 1.6X - codegen = T hashmap = T 3204 / 3271 6.5 152.8 1.8X - */ - } + runBenchmark("aggregate with multiple key types") { + val N = 20 << 20 - ignore("max function bytecode size of wholestagecodegen") { - val N = 20 << 15 - - val benchmark = new Benchmark("max function bytecode size", N) - def f(): Unit = sparkSession.range(N) - .selectExpr( - "id", - "(id & 1023) as k1", - "cast(id & 1023 as double) as k2", - "cast(id & 1023 as int) as k3", - "case when id > 100 and id <= 200 then 
1 else 0 end as v1", - "case when id > 200 and id <= 300 then 1 else 0 end as v2", - "case when id > 300 and id <= 400 then 1 else 0 end as v3", - "case when id > 400 and id <= 500 then 1 else 0 end as v4", - "case when id > 500 and id <= 600 then 1 else 0 end as v5", - "case when id > 600 and id <= 700 then 1 else 0 end as v6", - "case when id > 700 and id <= 800 then 1 else 0 end as v7", - "case when id > 800 and id <= 900 then 1 else 0 end as v8", - "case when id > 900 and id <= 1000 then 1 else 0 end as v9", - "case when id > 1000 and id <= 1100 then 1 else 0 end as v10", - "case when id > 1100 and id <= 1200 then 1 else 0 end as v11", - "case when id > 1200 and id <= 1300 then 1 else 0 end as v12", - "case when id > 1300 and id <= 1400 then 1 else 0 end as v13", - "case when id > 1400 and id <= 1500 then 1 else 0 end as v14", - "case when id > 1500 and id <= 1600 then 1 else 0 end as v15", - "case when id > 1600 and id <= 1700 then 1 else 0 end as v16", - "case when id > 1700 and id <= 1800 then 1 else 0 end as v17", - "case when id > 1800 and id <= 1900 then 1 else 0 end as v18") - .groupBy("k1", "k2", "k3") - .sum() - .collect() - - benchmark.addCase("codegen = F") { iter => - sparkSession.conf.set(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key, "false") - f() - } + val benchmark = new Benchmark("Aggregate w multiple keys", N, output = output) - benchmark.addCase("codegen = T hugeMethodLimit = 10000") { iter => - sparkSession.conf.set(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key, "true") - sparkSession.conf.set(SQLConf.WHOLESTAGE_HUGE_METHOD_LIMIT.key, "10000") - f() - } + def f(): Unit = spark.range(N) + .selectExpr( + "id", + "(id & 1023) as k1", + "cast(id & 1023 as string) as k2", + "cast(id & 1023 as int) as k3", + "cast(id & 1023 as double) as k4", + "cast(id & 1023 as float) as k5", + "id > 1023 as k6") + .groupBy("k1", "k2", "k3", "k4", "k5", "k6") + .sum() + .collect() - benchmark.addCase("codegen = T hugeMethodLimit = 1500") { iter => - sparkSession.conf.set(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key, "true") - sparkSession.conf.set(SQLConf.WHOLESTAGE_HUGE_METHOD_LIMIT.key, "1500") - f() - } + benchmark.addCase("codegen = F") { _ => + withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false") { + f() + } + } - benchmark.run() + benchmark.addCase("codegen = T hashmap = F") { _ => + withSQLConf( + SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "true", + SQLConf.ENABLE_TWOLEVEL_AGG_MAP.key -> "false", + "spark.sql.codegen.aggregate.map.vectorized.enable" -> "false") { + f() + } + } - /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_31-b13 on Mac OS X 10.10.2 - Intel(R) Core(TM) i7-4578U CPU @ 3.00GHz + benchmark.addCase("codegen = T hashmap = T") { _ => + withSQLConf( + SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "true", + SQLConf.ENABLE_TWOLEVEL_AGG_MAP.key -> "true", + "spark.sql.codegen.aggregate.map.vectorized.enable" -> "true") { + f() + } + } - max function bytecode size: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - codegen = F 709 / 803 0.9 1082.1 1.0X - codegen = T hugeMethodLimit = 10000 3485 / 3548 0.2 5317.7 0.2X - codegen = T hugeMethodLimit = 1500 636 / 701 1.0 969.9 1.1X - */ - } + benchmark.run() + } + + runBenchmark("max function bytecode size of wholestagecodegen") { + val N = 20 << 15 + + val benchmark = new Benchmark("max function bytecode size", N, output = output) + + def f(): Unit = spark.range(N) + .selectExpr( + "id", + "(id & 1023) as k1", + "cast(id & 1023 as double) as 
k2", + "cast(id & 1023 as int) as k3", + "case when id > 100 and id <= 200 then 1 else 0 end as v1", + "case when id > 200 and id <= 300 then 1 else 0 end as v2", + "case when id > 300 and id <= 400 then 1 else 0 end as v3", + "case when id > 400 and id <= 500 then 1 else 0 end as v4", + "case when id > 500 and id <= 600 then 1 else 0 end as v5", + "case when id > 600 and id <= 700 then 1 else 0 end as v6", + "case when id > 700 and id <= 800 then 1 else 0 end as v7", + "case when id > 800 and id <= 900 then 1 else 0 end as v8", + "case when id > 900 and id <= 1000 then 1 else 0 end as v9", + "case when id > 1000 and id <= 1100 then 1 else 0 end as v10", + "case when id > 1100 and id <= 1200 then 1 else 0 end as v11", + "case when id > 1200 and id <= 1300 then 1 else 0 end as v12", + "case when id > 1300 and id <= 1400 then 1 else 0 end as v13", + "case when id > 1400 and id <= 1500 then 1 else 0 end as v14", + "case when id > 1500 and id <= 1600 then 1 else 0 end as v15", + "case when id > 1600 and id <= 1700 then 1 else 0 end as v16", + "case when id > 1700 and id <= 1800 then 1 else 0 end as v17", + "case when id > 1800 and id <= 1900 then 1 else 0 end as v18") + .groupBy("k1", "k2", "k3") + .sum() + .collect() + + benchmark.addCase("codegen = F") { _ => + withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false") { + f() + } + } + benchmark.addCase("codegen = T hugeMethodLimit = 10000") { _ => + withSQLConf( + SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "true", + SQLConf.WHOLESTAGE_HUGE_METHOD_LIMIT.key -> "10000") { + f() + } + } - ignore("cube") { - val N = 5 << 20 + benchmark.addCase("codegen = T hugeMethodLimit = 1500") { _ => + withSQLConf( + SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "true", + SQLConf.WHOLESTAGE_HUGE_METHOD_LIMIT.key -> "1500") { + f() + } + } - runBenchmark("cube", N) { - sparkSession.range(N).selectExpr("id", "id % 1000 as k1", "id & 256 as k2") - .cube("k1", "k2").sum("id").collect() + benchmark.run() } - /** - Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz - cube: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------- - cube codegen=false 3188 / 3392 1.6 608.2 1.0X - cube codegen=true 1239 / 1394 4.2 236.3 2.6X - */ - } - - ignore("hash and BytesToBytesMap") { - val N = 20 << 20 - val benchmark = new Benchmark("BytesToBytesMap", N) + runBenchmark("cube") { + val N = 5 << 20 - benchmark.addCase("UnsafeRowhash") { iter => - var i = 0 - val keyBytes = new Array[Byte](16) - val key = new UnsafeRow(1) - key.pointTo(keyBytes, Platform.BYTE_ARRAY_OFFSET, 16) - var s = 0 - while (i < N) { - key.setInt(0, i % 1000) - val h = Murmur3_x86_32.hashUnsafeWords( - key.getBaseObject, key.getBaseOffset, key.getSizeInBytes, 42) - s += h - i += 1 + codegenBenchmark("cube", N) { + spark.range(N).selectExpr("id", "id % 1000 as k1", "id & 256 as k2") + .cube("k1", "k2").sum("id").collect() } } - benchmark.addCase("murmur3 hash") { iter => - var i = 0 - val keyBytes = new Array[Byte](16) - val key = new UnsafeRow(1) - key.pointTo(keyBytes, Platform.BYTE_ARRAY_OFFSET, 16) - var p = 524283 - var s = 0 - while (i < N) { - var h = Murmur3_x86_32.hashLong(i, 42) - key.setInt(0, h) - s += h - i += 1 - } - } + runBenchmark("hash and BytesToBytesMap") { + val N = 20 << 20 - benchmark.addCase("fast hash") { iter => - var i = 0 - val keyBytes = new Array[Byte](16) - val key = new UnsafeRow(1) - key.pointTo(keyBytes, Platform.BYTE_ARRAY_OFFSET, 16) - var p = 524283 - var s = 0 - while (i < N) { - var h = i % p 
- if (h < 0) { - h += p - } - key.setInt(0, h) - s += h - i += 1 - } - } + val benchmark = new Benchmark("BytesToBytesMap", N, output = output) - benchmark.addCase("arrayEqual") { iter => - var i = 0 - val keyBytes = new Array[Byte](16) - val valueBytes = new Array[Byte](16) - val key = new UnsafeRow(1) - key.pointTo(keyBytes, Platform.BYTE_ARRAY_OFFSET, 16) - val value = new UnsafeRow(1) - value.pointTo(valueBytes, Platform.BYTE_ARRAY_OFFSET, 16) - value.setInt(0, 555) - var s = 0 - while (i < N) { - key.setInt(0, i % 1000) - if (key.equals(value)) { - s += 1 - } - i += 1 + benchmark.addCase("UnsafeRowhash") { _ => + var i = 0 + val keyBytes = new Array[Byte](16) + val key = new UnsafeRow(1) + key.pointTo(keyBytes, Platform.BYTE_ARRAY_OFFSET, 16) + var s = 0 + while (i < N) { + key.setInt(0, i % 1000) + val h = Murmur3_x86_32.hashUnsafeWords( + key.getBaseObject, key.getBaseOffset, key.getSizeInBytes, 42) + s += h + i += 1 + } } - } - benchmark.addCase("Java HashMap (Long)") { iter => - var i = 0 - val keyBytes = new Array[Byte](16) - val valueBytes = new Array[Byte](16) - val value = new UnsafeRow(1) - value.pointTo(valueBytes, Platform.BYTE_ARRAY_OFFSET, 16) - value.setInt(0, 555) - val map = new HashMap[Long, UnsafeRow]() - while (i < 65536) { - value.setInt(0, i) - map.put(i.toLong, value) - i += 1 - } - var s = 0 - i = 0 - while (i < N) { - if (map.get(i % 100000) != null) { - s += 1 - } - i += 1 + benchmark.addCase("murmur3 hash") { _ => + var i = 0 + val keyBytes = new Array[Byte](16) + val key = new UnsafeRow(1) + key.pointTo(keyBytes, Platform.BYTE_ARRAY_OFFSET, 16) + var p = 524283 + var s = 0 + while (i < N) { + var h = Murmur3_x86_32.hashLong(i, 42) + key.setInt(0, h) + s += h + i += 1 + } } - } - benchmark.addCase("Java HashMap (two ints) ") { iter => - var i = 0 - val valueBytes = new Array[Byte](16) - val value = new UnsafeRow(1) - value.pointTo(valueBytes, Platform.BYTE_ARRAY_OFFSET, 16) - value.setInt(0, 555) - val map = new HashMap[Long, UnsafeRow]() - while (i < 65536) { - value.setInt(0, i) - val key = (i.toLong << 32) + Integer.rotateRight(i, 15) - map.put(key, value) - i += 1 - } - var s = 0 - i = 0 - while (i < N) { - val key = ((i & 100000).toLong << 32) + Integer.rotateRight(i & 100000, 15) - if (map.get(key) != null) { - s += 1 - } - i += 1 + benchmark.addCase("fast hash") { _ => + var i = 0 + val keyBytes = new Array[Byte](16) + val key = new UnsafeRow(1) + key.pointTo(keyBytes, Platform.BYTE_ARRAY_OFFSET, 16) + var p = 524283 + var s = 0 + while (i < N) { + var h = i % p + if (h < 0) { + h += p + } + key.setInt(0, h) + s += h + i += 1 + } } - } - benchmark.addCase("Java HashMap (UnsafeRow)") { iter => - var i = 0 - val keyBytes = new Array[Byte](16) - val valueBytes = new Array[Byte](16) - val key = new UnsafeRow(1) - key.pointTo(keyBytes, Platform.BYTE_ARRAY_OFFSET, 16) - val value = new UnsafeRow(1) - value.pointTo(valueBytes, Platform.BYTE_ARRAY_OFFSET, 16) - value.setInt(0, 555) - val map = new HashMap[UnsafeRow, UnsafeRow]() - while (i < 65536) { - key.setInt(0, i) - value.setInt(0, i) - map.put(key, value.copy()) - i += 1 - } - var s = 0 - i = 0 - while (i < N) { - key.setInt(0, i % 100000) - if (map.get(key) != null) { - s += 1 - } - i += 1 + benchmark.addCase("arrayEqual") { _ => + var i = 0 + val keyBytes = new Array[Byte](16) + val valueBytes = new Array[Byte](16) + val key = new UnsafeRow(1) + key.pointTo(keyBytes, Platform.BYTE_ARRAY_OFFSET, 16) + val value = new UnsafeRow(1) + value.pointTo(valueBytes, Platform.BYTE_ARRAY_OFFSET, 16) + 
value.setInt(0, 555) + var s = 0 + while (i < N) { + key.setInt(0, i % 1000) + if (key.equals(value)) { + s += 1 + } + i += 1 + } } - } - Seq(false, true).foreach { optimized => - benchmark.addCase(s"LongToUnsafeRowMap (opt=$optimized)") { iter => + benchmark.addCase("Java HashMap (Long)") { _ => var i = 0 + val keyBytes = new Array[Byte](16) val valueBytes = new Array[Byte](16) val value = new UnsafeRow(1) value.pointTo(valueBytes, Platform.BYTE_ARRAY_OFFSET, 16) value.setInt(0, 555) - val taskMemoryManager = new TaskMemoryManager( - new StaticMemoryManager( - new SparkConf().set(MEMORY_OFFHEAP_ENABLED.key, "false"), - Long.MaxValue, - Long.MaxValue, - 1), - 0) - val map = new LongToUnsafeRowMap(taskMemoryManager, 64) + val map = new HashMap[Long, UnsafeRow]() while (i < 65536) { value.setInt(0, i) - val key = i % 100000 - map.append(key, value) + map.put(i.toLong, value) i += 1 } - if (optimized) { - map.optimize() + var s = 0 + i = 0 + while (i < N) { + if (map.get(i % 100000) != null) { + s += 1 + } + i += 1 + } + } + + benchmark.addCase("Java HashMap (two ints) ") { _ => + var i = 0 + val valueBytes = new Array[Byte](16) + val value = new UnsafeRow(1) + value.pointTo(valueBytes, Platform.BYTE_ARRAY_OFFSET, 16) + value.setInt(0, 555) + val map = new HashMap[Long, UnsafeRow]() + while (i < 65536) { + value.setInt(0, i) + val key = (i.toLong << 32) + Integer.rotateRight(i, 15) + map.put(key, value) + i += 1 } var s = 0 i = 0 while (i < N) { - val key = i % 100000 - if (map.getValue(key, value) != null) { + val key = ((i & 100000).toLong << 32) + Integer.rotateRight(i & 100000, 15) + if (map.get(key) != null) { s += 1 } i += 1 } } - } - Seq("off", "on").foreach { heap => - benchmark.addCase(s"BytesToBytesMap ($heap Heap)") { iter => - val taskMemoryManager = new TaskMemoryManager( - new StaticMemoryManager( - new SparkConf().set(MEMORY_OFFHEAP_ENABLED.key, s"${heap == "off"}") - .set(MEMORY_OFFHEAP_SIZE.key, "102400000"), - Long.MaxValue, - Long.MaxValue, - 1), - 0) - val map = new BytesToBytesMap(taskMemoryManager, 1024, 64L<<20) + benchmark.addCase("Java HashMap (UnsafeRow)") { _ => + var i = 0 val keyBytes = new Array[Byte](16) val valueBytes = new Array[Byte](16) val key = new UnsafeRow(1) key.pointTo(keyBytes, Platform.BYTE_ARRAY_OFFSET, 16) val value = new UnsafeRow(1) value.pointTo(valueBytes, Platform.BYTE_ARRAY_OFFSET, 16) - var i = 0 - val numKeys = 65536 - while (i < numKeys) { - key.setInt(0, i % 65536) - val loc = map.lookup(key.getBaseObject, key.getBaseOffset, key.getSizeInBytes, - Murmur3_x86_32.hashLong(i % 65536, 42)) - if (!loc.isDefined) { - loc.append(key.getBaseObject, key.getBaseOffset, key.getSizeInBytes, - value.getBaseObject, value.getBaseOffset, value.getSizeInBytes) - } + value.setInt(0, 555) + val map = new HashMap[UnsafeRow, UnsafeRow]() + while (i < 65536) { + key.setInt(0, i) + value.setInt(0, i) + map.put(key, value.copy()) i += 1 } - i = 0 var s = 0 + i = 0 while (i < N) { key.setInt(0, i % 100000) - val loc = map.lookup(key.getBaseObject, key.getBaseOffset, key.getSizeInBytes, - Murmur3_x86_32.hashLong(i % 100000, 42)) - if (loc.isDefined) { + if (map.get(key) != null) { s += 1 } i += 1 } } - } - benchmark.addCase("Aggregate HashMap") { iter => - var i = 0 - val numKeys = 65536 - val schema = new StructType() - .add("key", LongType) - .add("value", LongType) - val map = new AggregateHashMap(schema) - while (i < numKeys) { - val row = map.findOrInsert(i.toLong) - row.setLong(1, row.getLong(1) + 1) - i += 1 - } - var s = 0 - i = 0 - while (i < N) { - if 
(map.find(i % 100000) != -1) { - s += 1 - } - i += 1 + Seq(false, true).foreach { optimized => + benchmark.addCase(s"LongToUnsafeRowMap (opt=$optimized)") { _ => + var i = 0 + val valueBytes = new Array[Byte](16) + val value = new UnsafeRow(1) + value.pointTo(valueBytes, Platform.BYTE_ARRAY_OFFSET, 16) + value.setInt(0, 555) + val taskMemoryManager = new TaskMemoryManager( + new StaticMemoryManager( + new SparkConf().set(MEMORY_OFFHEAP_ENABLED.key, "false"), + Long.MaxValue, + Long.MaxValue, + 1), + 0) + val map = new LongToUnsafeRowMap(taskMemoryManager, 64) + while (i < 65536) { + value.setInt(0, i) + val key = i % 100000 + map.append(key, value) + i += 1 + } + if (optimized) { + map.optimize() + } + var s = 0 + i = 0 + while (i < N) { + val key = i % 100000 + if (map.getValue(key, value) != null) { + s += 1 + } + i += 1 + } + } + } + + Seq("off", "on").foreach { heap => + benchmark.addCase(s"BytesToBytesMap ($heap Heap)") { _ => + val taskMemoryManager = new TaskMemoryManager( + new StaticMemoryManager( + new SparkConf().set(MEMORY_OFFHEAP_ENABLED.key, s"${heap == "off"}") + .set(MEMORY_OFFHEAP_SIZE.key, "102400000"), + Long.MaxValue, + Long.MaxValue, + 1), + 0) + val map = new BytesToBytesMap(taskMemoryManager, 1024, 64L << 20) + val keyBytes = new Array[Byte](16) + val valueBytes = new Array[Byte](16) + val key = new UnsafeRow(1) + key.pointTo(keyBytes, Platform.BYTE_ARRAY_OFFSET, 16) + val value = new UnsafeRow(1) + value.pointTo(valueBytes, Platform.BYTE_ARRAY_OFFSET, 16) + var i = 0 + val numKeys = 65536 + while (i < numKeys) { + key.setInt(0, i % 65536) + val loc = map.lookup(key.getBaseObject, key.getBaseOffset, key.getSizeInBytes, + Murmur3_x86_32.hashLong(i % 65536, 42)) + if (!loc.isDefined) { + loc.append(key.getBaseObject, key.getBaseOffset, key.getSizeInBytes, + value.getBaseObject, value.getBaseOffset, value.getSizeInBytes) + } + i += 1 + } + i = 0 + var s = 0 + while (i < N) { + key.setInt(0, i % 100000) + val loc = map.lookup(key.getBaseObject, key.getBaseOffset, key.getSizeInBytes, + Murmur3_x86_32.hashLong(i % 100000, 42)) + if (loc.isDefined) { + s += 1 + } + i += 1 + } + } } - } - /* - Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz - BytesToBytesMap: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------- - UnsafeRow hash 267 / 284 78.4 12.8 1.0X - murmur3 hash 102 / 129 205.5 4.9 2.6X - fast hash 79 / 96 263.8 3.8 3.4X - arrayEqual 164 / 172 128.2 7.8 1.6X - Java HashMap (Long) 321 / 399 65.4 15.3 0.8X - Java HashMap (two ints) 328 / 363 63.9 15.7 0.8X - Java HashMap (UnsafeRow) 1140 / 1200 18.4 54.3 0.2X - LongToUnsafeRowMap (opt=false) 378 / 400 55.5 18.0 0.7X - LongToUnsafeRowMap (opt=true) 144 / 152 145.2 6.9 1.9X - BytesToBytesMap (off Heap) 1300 / 1616 16.1 62.0 0.2X - BytesToBytesMap (on Heap) 1165 / 1202 18.0 55.5 0.2X - Aggregate HashMap 121 / 131 173.3 5.8 2.2X - */ - benchmark.run() + benchmark.addCase("Aggregate HashMap") { _ => + var i = 0 + val numKeys = 65536 + val schema = new StructType() + .add("key", LongType) + .add("value", LongType) + val map = new AggregateHashMap(schema) + while (i < numKeys) { + val row = map.findOrInsert(i.toLong) + row.setLong(1, row.getLong(1) + 1) + i += 1 + } + var s = 0 + i = 0 + while (i < N) { + if (map.find(i % 100000) != -1) { + s += 1 + } + i += 1 + } + } + benchmark.run() + } } - } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BenchmarkWideTable.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BenchmarkWideTable.scala index 9dcaca0ca93e..76367cbbe534 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BenchmarkWideTable.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BenchmarkWideTable.scala @@ -17,8 +17,7 @@ package org.apache.spark.sql.execution.benchmark -import org.apache.spark.util.Benchmark - +import org.apache.spark.benchmark.Benchmark /** * Benchmark to measure performance for wide table. @@ -27,7 +26,7 @@ import org.apache.spark.util.Benchmark * * Benchmarks in this file are skipped in normal builds. */ -class BenchmarkWideTable extends BenchmarkBase { +class BenchmarkWideTable extends BenchmarkWithCodegen { ignore("project on wide table") { val N = 1 << 20 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BenchmarkBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BenchmarkWithCodegen.scala similarity index 94% rename from sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BenchmarkBase.scala rename to sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BenchmarkWithCodegen.scala index c99a5aec1cd6..51331500479a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BenchmarkBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BenchmarkWithCodegen.scala @@ -18,14 +18,14 @@ package org.apache.spark.sql.execution.benchmark import org.apache.spark.SparkFunSuite +import org.apache.spark.benchmark.Benchmark import org.apache.spark.sql.SparkSession -import org.apache.spark.util.Benchmark /** * Common base trait for micro benchmarks that are supposed to run standalone (i.e. not together * with other test suites). */ -private[benchmark] trait BenchmarkBase extends SparkFunSuite { +private[benchmark] trait BenchmarkWithCodegen extends SparkFunSuite { lazy val sparkSession = SparkSession.builder .master("local[1]") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala new file mode 100644 index 000000000000..2f3caca849cd --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.benchmark + +import scala.util.Random + +import org.apache.spark.benchmark.Benchmark + +/** + * Benchmark to measure read performance with Bloom filters. + * + * Currently, only ORC supports bloom filters, we will add Parquet BM as soon as it becomes + * available. 
+ * + * To run this benchmark: + * {{{ + * 1. without sbt: bin/spark-submit --class + * 2. build/sbt "sql/test:runMain " + * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain " + * Results will be written to "benchmarks/BloomFilterBenchmark-results.txt". + * }}} + */ +object BloomFilterBenchmark extends SqlBasedBenchmark { + import spark.implicits._ + + private val scaleFactor = 100 + private val N = scaleFactor * 1000 * 1000 + private val df = spark.range(N).map(_ => Random.nextInt) + + private def writeBenchmark(): Unit = { + withTempPath { dir => + val path = dir.getCanonicalPath + + runBenchmark(s"ORC Write") { + val benchmark = new Benchmark(s"Write ${scaleFactor}M rows", N, output = output) + benchmark.addCase("Without bloom filter") { _ => + df.write.mode("overwrite").orc(path + "/withoutBF") + } + benchmark.addCase("With bloom filter") { _ => + df.write.mode("overwrite") + .option("orc.bloom.filter.columns", "value").orc(path + "/withBF") + } + benchmark.run() + } + } + } + + private def readBenchmark(): Unit = { + withTempPath { dir => + val path = dir.getCanonicalPath + + df.write.orc(path + "/withoutBF") + df.write.option("orc.bloom.filter.columns", "value").orc(path + "/withBF") + + runBenchmark(s"ORC Read") { + val benchmark = new Benchmark(s"Read a row from ${scaleFactor}M rows", N, output = output) + benchmark.addCase("Without bloom filter") { _ => + spark.read.orc(path + "/withoutBF").where("value = 0").count + } + benchmark.addCase("With bloom filter") { _ => + spark.read.orc(path + "/withBF").where("value = 0").count + } + benchmark.run() + } + } + } + + override def runBenchmarkSuite(): Unit = { + writeBenchmark() + readBenchmark() + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BuiltInDataSourceWriteBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BuiltInDataSourceWriteBenchmark.scala new file mode 100644 index 000000000000..2de516c19da9 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BuiltInDataSourceWriteBenchmark.scala @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution.benchmark + +/** + * Benchmark to measure built-in data sources write performance. + * By default it measures 4 data source format: Parquet, ORC, JSON, CSV. Run it with spark-submit: + * spark-submit --class + * Or with sbt: + * build/sbt "sql/test:runMain " + * + * To measure specified formats, run it with arguments: + * spark-submit --class format1 [format2] [...] 
+ * Or with sbt: + * build/sbt "sql/test:runMain format1 [format2] [...]" + */ +object BuiltInDataSourceWriteBenchmark extends DataSourceWriteBenchmark { + def main(args: Array[String]): Unit = { + val formats: Seq[String] = if (args.isEmpty) { + Seq("Parquet", "ORC", "JSON", "CSV") + } else { + args + } + + spark.conf.set("spark.sql.parquet.compression.codec", "snappy") + spark.conf.set("spark.sql.orc.compression.codec", "snappy") + /* + Intel(R) Core(TM) i7-6920HQ CPU @ 2.90GHz + Parquet writer benchmark: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Output Single Int Column 1815 / 1932 8.7 115.4 1.0X + Output Single Double Column 1877 / 1878 8.4 119.3 1.0X + Output Int and String Column 6265 / 6543 2.5 398.3 0.3X + Output Partitions 4067 / 4457 3.9 258.6 0.4X + Output Buckets 5608 / 5820 2.8 356.6 0.3X + + ORC writer benchmark: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Output Single Int Column 1201 / 1239 13.1 76.3 1.0X + Output Single Double Column 1542 / 1600 10.2 98.0 0.8X + Output Int and String Column 6495 / 6580 2.4 412.9 0.2X + Output Partitions 3648 / 3842 4.3 231.9 0.3X + Output Buckets 5022 / 5145 3.1 319.3 0.2X + + JSON writer benchmark: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Output Single Int Column 1988 / 2093 7.9 126.4 1.0X + Output Single Double Column 2854 / 2911 5.5 181.4 0.7X + Output Int and String Column 6467 / 6653 2.4 411.1 0.3X + Output Partitions 4548 / 5055 3.5 289.1 0.4X + Output Buckets 5664 / 5765 2.8 360.1 0.4X + + CSV writer benchmark: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Output Single Int Column 3025 / 3190 5.2 192.3 1.0X + Output Single Double Column 3575 / 3634 4.4 227.3 0.8X + Output Int and String Column 7313 / 7399 2.2 464.9 0.4X + Output Partitions 5105 / 5190 3.1 324.6 0.6X + Output Buckets 6986 / 6992 2.3 444.1 0.4X + */ + formats.foreach { format => + runBenchmark(format) + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala index 8711f5a8fa1c..51a7f9f1ef09 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala @@ -19,16 +19,17 @@ package org.apache.spark.sql.execution.benchmark import java.io.File import scala.collection.JavaConverters._ -import scala.util.{Random, Try} +import scala.util.Random import org.apache.spark.SparkConf +import org.apache.spark.benchmark.Benchmark import org.apache.spark.sql.{DataFrame, DataFrameWriter, Row, SparkSession} import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.plans.SQLHelper import org.apache.spark.sql.execution.datasources.parquet.{SpecificParquetRecordReaderBase, VectorizedParquetRecordReader} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.sql.vectorized.ColumnVector -import org.apache.spark.util.{Benchmark, Utils} /** @@ -36,7 +37,7 @@ import org.apache.spark.util.{Benchmark, 
Utils} * To run this: * spark-submit --class */ -object DataSourceReadBenchmark { +object DataSourceReadBenchmark extends SQLHelper { val conf = new SparkConf() .setAppName("DataSourceReadBenchmark") // Since `spark.master` always exists, overrides this value @@ -53,27 +54,10 @@ object DataSourceReadBenchmark { spark.conf.set(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key, "true") spark.conf.set(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key, "true") - def withTempPath(f: File => Unit): Unit = { - val path = Utils.createTempDir() - path.delete() - try f(path) finally Utils.deleteRecursively(path) - } - def withTempTable(tableNames: String*)(f: => Unit): Unit = { try f finally tableNames.foreach(spark.catalog.dropTempView) } - def withSQLConf(pairs: (String, String)*)(f: => Unit): Unit = { - val (keys, values) = pairs.unzip - val currentValues = keys.map(key => Try(spark.conf.get(key)).toOption) - (keys, values).zipped.foreach(spark.conf.set) - try f finally { - keys.zip(currentValues).foreach { - case (key, Some(value)) => spark.conf.set(key, value) - case (key, None) => spark.conf.unset(key) - } - } - } private def prepareTable(dir: File, df: DataFrame, partition: Option[String] = None): Unit = { val testDf = if (partition.isDefined) { df.write.partitionBy(partition.get) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceWriteBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceWriteBenchmark.scala index 2d2cdebd067c..994d6b5b7d33 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceWriteBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceWriteBenchmark.scala @@ -17,29 +17,18 @@ package org.apache.spark.sql.execution.benchmark import org.apache.spark.SparkConf +import org.apache.spark.benchmark.Benchmark import org.apache.spark.sql.SparkSession import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.util.Benchmark -/** - * Benchmark to measure data source write performance. - * By default it measures 4 data source format: Parquet, ORC, JSON, CSV: - * spark-submit --class - * To measure specified formats, run it with arguments: - * spark-submit --class format1 [format2] [...] - */ -object DataSourceWriteBenchmark { +trait DataSourceWriteBenchmark { val conf = new SparkConf() .setAppName("DataSourceWriteBenchmark") .setIfMissing("spark.master", "local[1]") - .set("spark.sql.parquet.compression.codec", "snappy") - .set("spark.sql.orc.compression.codec", "snappy") + .set(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key, "true") val spark = SparkSession.builder.config(conf).getOrCreate() - // Set default configs. Individual cases will change them if necessary. 
- spark.conf.set(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key, "true") - val tempTable = "temp" val numRows = 1024 * 1024 * 15 @@ -86,64 +75,24 @@ object DataSourceWriteBenchmark { } } - def main(args: Array[String]): Unit = { + def runBenchmark(format: String): Unit = { val tableInt = "tableInt" val tableDouble = "tableDouble" val tableIntString = "tableIntString" val tablePartition = "tablePartition" val tableBucket = "tableBucket" - val formats: Seq[String] = if (args.isEmpty) { - Seq("Parquet", "ORC", "JSON", "CSV") - } else { - args - } - /* - Intel(R) Core(TM) i7-6920HQ CPU @ 2.90GHz - Parquet writer benchmark: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Output Single Int Column 1815 / 1932 8.7 115.4 1.0X - Output Single Double Column 1877 / 1878 8.4 119.3 1.0X - Output Int and String Column 6265 / 6543 2.5 398.3 0.3X - Output Partitions 4067 / 4457 3.9 258.6 0.4X - Output Buckets 5608 / 5820 2.8 356.6 0.3X - - ORC writer benchmark: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Output Single Int Column 1201 / 1239 13.1 76.3 1.0X - Output Single Double Column 1542 / 1600 10.2 98.0 0.8X - Output Int and String Column 6495 / 6580 2.4 412.9 0.2X - Output Partitions 3648 / 3842 4.3 231.9 0.3X - Output Buckets 5022 / 5145 3.1 319.3 0.2X - - JSON writer benchmark: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Output Single Int Column 1988 / 2093 7.9 126.4 1.0X - Output Single Double Column 2854 / 2911 5.5 181.4 0.7X - Output Int and String Column 6467 / 6653 2.4 411.1 0.3X - Output Partitions 4548 / 5055 3.5 289.1 0.4X - Output Buckets 5664 / 5765 2.8 360.1 0.4X - - CSV writer benchmark: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Output Single Int Column 3025 / 3190 5.2 192.3 1.0X - Output Single Double Column 3575 / 3634 4.4 227.3 0.8X - Output Int and String Column 7313 / 7399 2.2 464.9 0.4X - Output Partitions 5105 / 5190 3.1 324.6 0.6X - Output Buckets 6986 / 6992 2.3 444.1 0.4X - */ withTempTable(tempTable) { spark.range(numRows).createOrReplaceTempView(tempTable) - formats.foreach { format => - withTable(tableInt, tableDouble, tableIntString, tablePartition, tableBucket) { - val benchmark = new Benchmark(s"$format writer benchmark", numRows) - writeNumeric(tableInt, format, benchmark, "Int") - writeNumeric(tableDouble, format, benchmark, "Double") - writeIntString(tableIntString, format, benchmark) - writePartition(tablePartition, format, benchmark) - writeBucket(tableBucket, format, benchmark) - benchmark.run() - } + withTable(tableInt, tableDouble, tableIntString, tablePartition, tableBucket) { + val benchmark = new Benchmark(s"$format writer benchmark", numRows) + writeNumeric(tableInt, format, benchmark, "Int") + writeNumeric(tableDouble, format, benchmark, "Double") + writeIntString(tableIntString, format, benchmark) + writePartition(tablePartition, format, benchmark) + writeBucket(tableBucket, format, benchmark) + benchmark.run() } } } } + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/FilterPushdownBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/FilterPushdownBenchmark.scala index 
8596abd1b4ff..cf05ca336171 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/FilterPushdownBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/FilterPushdownBenchmark.scala @@ -17,29 +17,31 @@ package org.apache.spark.sql.execution.benchmark -import java.io.{File, FileOutputStream, OutputStream} +import java.io.File -import scala.util.{Random, Try} - -import org.scalatest.{BeforeAndAfterEachTestData, Suite, TestData} +import scala.util.Random import org.apache.spark.SparkConf -import org.apache.spark.SparkFunSuite +import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} import org.apache.spark.sql.{DataFrame, SparkSession} +import org.apache.spark.sql.catalyst.plans.SQLHelper import org.apache.spark.sql.functions.monotonically_increasing_id import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.SQLConf.ParquetOutputTimestampType import org.apache.spark.sql.types.{ByteType, Decimal, DecimalType, TimestampType} -import org.apache.spark.util.{Benchmark, Utils} /** * Benchmark to measure read performance with Filter pushdown. - * To run this: - * build/sbt "sql/test-only *FilterPushdownBenchmark" - * - * Results will be written to "benchmarks/FilterPushdownBenchmark-results.txt". + * To run this benchmark: + * {{{ + * 1. without sbt: bin/spark-submit --class + * 2. build/sbt "sql/test:runMain " + * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain " + * Results will be written to "benchmarks/FilterPushdownBenchmark-results.txt". + * }}} */ -class FilterPushdownBenchmark extends SparkFunSuite with BenchmarkBeforeAndAfterEachTest { +object FilterPushdownBenchmark extends BenchmarkBase with SQLHelper { + private val conf = new SparkConf() .setAppName(this.getClass.getSimpleName) // Since `spark.master` always exists, overrides this value @@ -53,59 +55,15 @@ class FilterPushdownBenchmark extends SparkFunSuite with BenchmarkBeforeAndAfter private val numRows = 1024 * 1024 * 15 private val width = 5 private val mid = numRows / 2 - private val blockSize = 1048576 + // For Parquet/ORC, we will use the same value for block size and compression size + private val blockSize = org.apache.parquet.hadoop.ParquetWriter.DEFAULT_PAGE_SIZE private val spark = SparkSession.builder().config(conf).getOrCreate() - private var out: OutputStream = _ - - override def beforeAll() { - super.beforeAll() - out = new FileOutputStream(new File("benchmarks/FilterPushdownBenchmark-results.txt")) - } - - override def beforeEach(td: TestData) { - super.beforeEach(td) - val separator = "=" * 96 - val testHeader = (separator + '\n' + td.name + '\n' + separator + '\n' + '\n').getBytes - out.write(testHeader) - } - - override def afterEach(td: TestData) { - out.write('\n') - super.afterEach(td) - } - - override def afterAll() { - try { - out.close() - } finally { - super.afterAll() - } - } - - def withTempPath(f: File => Unit): Unit = { - val path = Utils.createTempDir() - path.delete() - try f(path) finally Utils.deleteRecursively(path) - } - def withTempTable(tableNames: String*)(f: => Unit): Unit = { try f finally tableNames.foreach(spark.catalog.dropTempView) } - def withSQLConf(pairs: (String, String)*)(f: => Unit): Unit = { - val (keys, values) = pairs.unzip - val currentValues = keys.map(key => Try(spark.conf.get(key)).toOption) - (keys, values).zipped.foreach(spark.conf.set) - try f finally { - keys.zip(currentValues).foreach { - case (key, Some(value)) => spark.conf.set(key, value) - case (key, 
None) => spark.conf.unset(key) - } - } - } - private def prepareTable( dir: File, numRows: Int, width: Int, useStringForValue: Boolean): Unit = { import spark.implicits._ @@ -130,16 +88,16 @@ class FilterPushdownBenchmark extends SparkFunSuite with BenchmarkBeforeAndAfter } val df = spark.range(numRows).selectExpr(selectExpr: _*).sort("value") - saveAsTable(df, dir) + saveAsTable(df, dir, true) } - private def saveAsTable(df: DataFrame, dir: File): Unit = { + private def saveAsTable(df: DataFrame, dir: File, useDictionary: Boolean = false): Unit = { val orcPath = dir.getCanonicalPath + "/orc" val parquetPath = dir.getCanonicalPath + "/parquet" - // To always turn on dictionary encoding, we set 1.0 at the threshold (the default is 0.8) df.write.mode("overwrite") - .option("orc.dictionary.key.threshold", 1.0) + .option("orc.dictionary.key.threshold", if (useDictionary) 1.0 else 0.8) + .option("orc.compress.size", blockSize) .option("orc.stripe.size", blockSize).orc(orcPath) spark.read.orc(orcPath).createOrReplaceTempView("orcTable") @@ -153,7 +111,7 @@ class FilterPushdownBenchmark extends SparkFunSuite with BenchmarkBeforeAndAfter title: String, whereExpr: String, selectExpr: String = "*"): Unit = { - val benchmark = new Benchmark(title, values, minNumIters = 5, output = Some(out)) + val benchmark = new Benchmark(title, values, minNumIters = 5, output = output) Seq(false, true).foreach { pushDownEnabled => val name = s"Parquet Vectorized ${if (pushDownEnabled) s"(Pushdown)" else ""}" @@ -240,191 +198,184 @@ class FilterPushdownBenchmark extends SparkFunSuite with BenchmarkBeforeAndAfter } } - ignore("Pushdown for many distinct value case") { - withTempPath { dir => - withTempTable("orcTable", "parquetTable") { - Seq(true, false).foreach { useStringForValue => - prepareTable(dir, numRows, width, useStringForValue) - if (useStringForValue) { - runStringBenchmark(numRows, width, mid, "string") - } else { - runIntBenchmark(numRows, width, mid) + override def runBenchmarkSuite(): Unit = { + runBenchmark("Pushdown for many distinct value case") { + withTempPath { dir => + withTempTable("orcTable", "parquetTable") { + Seq(true, false).foreach { useStringForValue => + prepareTable(dir, numRows, width, useStringForValue) + if (useStringForValue) { + runStringBenchmark(numRows, width, mid, "string") + } else { + runIntBenchmark(numRows, width, mid) + } } } } } - } - ignore("Pushdown for few distinct value case (use dictionary encoding)") { - withTempPath { dir => - val numDistinctValues = 200 + runBenchmark("Pushdown for few distinct value case (use dictionary encoding)") { + withTempPath { dir => + val numDistinctValues = 200 - withTempTable("orcTable", "parquetTable") { - prepareStringDictTable(dir, numRows, numDistinctValues, width) - runStringBenchmark(numRows, width, numDistinctValues / 2, "distinct string") + withTempTable("orcTable", "parquetTable") { + prepareStringDictTable(dir, numRows, numDistinctValues, width) + runStringBenchmark(numRows, width, numDistinctValues / 2, "distinct string") + } } } - } - ignore("Pushdown benchmark for StringStartsWith") { - withTempPath { dir => - withTempTable("orcTable", "parquetTable") { - prepareTable(dir, numRows, width, true) - Seq( - "value like '10%'", - "value like '1000%'", - s"value like '${mid.toString.substring(0, mid.toString.length - 1)}%'" - ).foreach { whereExpr => - val title = s"StringStartsWith filter: ($whereExpr)" - filterPushDownBenchmark(numRows, title, whereExpr) + runBenchmark("Pushdown benchmark for StringStartsWith") { + 
withTempPath { dir => + withTempTable("orcTable", "parquetTable") { + prepareTable(dir, numRows, width, true) + Seq( + "value like '10%'", + "value like '1000%'", + s"value like '${mid.toString.substring(0, mid.toString.length - 1)}%'" + ).foreach { whereExpr => + val title = s"StringStartsWith filter: ($whereExpr)" + filterPushDownBenchmark(numRows, title, whereExpr) + } } } } - } - - ignore(s"Pushdown benchmark for ${DecimalType.simpleString}") { - withTempPath { dir => - Seq( - s"decimal(${Decimal.MAX_INT_DIGITS}, 2)", - s"decimal(${Decimal.MAX_LONG_DIGITS}, 2)", - s"decimal(${DecimalType.MAX_PRECISION}, 2)" - ).foreach { dt => - val columns = (1 to width).map(i => s"CAST(id AS string) c$i") - val valueCol = if (dt.equalsIgnoreCase(s"decimal(${Decimal.MAX_INT_DIGITS}, 2)")) { - monotonically_increasing_id() % 9999999 - } else { - monotonically_increasing_id() - } - val df = spark.range(numRows).selectExpr(columns: _*).withColumn("value", valueCol.cast(dt)) - withTempTable("orcTable", "parquetTable") { - saveAsTable(df, dir) - Seq(s"value = $mid").foreach { whereExpr => - val title = s"Select 1 $dt row ($whereExpr)".replace("value AND value", "value") - filterPushDownBenchmark(numRows, title, whereExpr) + runBenchmark(s"Pushdown benchmark for ${DecimalType.simpleString}") { + withTempPath { dir => + Seq( + s"decimal(${Decimal.MAX_INT_DIGITS}, 2)", + s"decimal(${Decimal.MAX_LONG_DIGITS}, 2)", + s"decimal(${DecimalType.MAX_PRECISION}, 2)" + ).foreach { dt => + val columns = (1 to width).map(i => s"CAST(id AS string) c$i") + val valueCol = if (dt.equalsIgnoreCase(s"decimal(${Decimal.MAX_INT_DIGITS}, 2)")) { + monotonically_increasing_id() % 9999999 + } else { + monotonically_increasing_id() } + val df = spark.range(numRows) + .selectExpr(columns: _*).withColumn("value", valueCol.cast(dt)) + withTempTable("orcTable", "parquetTable") { + saveAsTable(df, dir) + + Seq(s"value = $mid").foreach { whereExpr => + val title = s"Select 1 $dt row ($whereExpr)".replace("value AND value", "value") + filterPushDownBenchmark(numRows, title, whereExpr) + } - val selectExpr = (1 to width).map(i => s"MAX(c$i)").mkString("", ",", ", MAX(value)") - Seq(10, 50, 90).foreach { percent => - filterPushDownBenchmark( - numRows, - s"Select $percent% $dt rows (value < ${numRows * percent / 100})", - s"value < ${numRows * percent / 100}", - selectExpr - ) + val selectExpr = (1 to width).map(i => s"MAX(c$i)").mkString("", ",", ", MAX(value)") + Seq(10, 50, 90).foreach { percent => + filterPushDownBenchmark( + numRows, + s"Select $percent% $dt rows (value < ${numRows * percent / 100})", + s"value < ${numRows * percent / 100}", + selectExpr + ) + } } } } } - } - ignore("Pushdown benchmark for InSet -> InFilters") { - withTempPath { dir => - withTempTable("orcTable", "parquetTable") { - prepareTable(dir, numRows, width, false) - Seq(5, 10, 50, 100).foreach { count => - Seq(10, 50, 90).foreach { distribution => - val filter = - Range(0, count).map(r => scala.util.Random.nextInt(numRows * distribution / 100)) - val whereExpr = s"value in(${filter.mkString(",")})" - val title = s"InSet -> InFilters (values count: $count, distribution: $distribution)" - filterPushDownBenchmark(numRows, title, whereExpr) + runBenchmark("Pushdown benchmark for InSet -> InFilters") { + withTempPath { dir => + withTempTable("orcTable", "parquetTable") { + prepareTable(dir, numRows, width, false) + Seq(5, 10, 50, 100).foreach { count => + Seq(10, 50, 90).foreach { distribution => + val filter = + Range(0, count).map(r => 
scala.util.Random.nextInt(numRows * distribution / 100)) + val whereExpr = s"value in(${filter.mkString(",")})" + val title = s"InSet -> InFilters (values count: $count, distribution: $distribution)" + filterPushDownBenchmark(numRows, title, whereExpr) + } } } } } - } - ignore(s"Pushdown benchmark for ${ByteType.simpleString}") { - withTempPath { dir => - val columns = (1 to width).map(i => s"CAST(id AS string) c$i") - val df = spark.range(numRows).selectExpr(columns: _*) - .withColumn("value", (monotonically_increasing_id() % Byte.MaxValue).cast(ByteType)) - .orderBy("value") - withTempTable("orcTable", "parquetTable") { - saveAsTable(df, dir) - - Seq(s"value = CAST(${Byte.MaxValue / 2} AS ${ByteType.simpleString})") - .foreach { whereExpr => - val title = s"Select 1 ${ByteType.simpleString} row ($whereExpr)" - .replace("value AND value", "value") - filterPushDownBenchmark(numRows, title, whereExpr) - } + runBenchmark(s"Pushdown benchmark for ${ByteType.simpleString}") { + withTempPath { dir => + val columns = (1 to width).map(i => s"CAST(id AS string) c$i") + val df = spark.range(numRows).selectExpr(columns: _*) + .withColumn("value", (monotonically_increasing_id() % Byte.MaxValue).cast(ByteType)) + .orderBy("value") + withTempTable("orcTable", "parquetTable") { + saveAsTable(df, dir) - val selectExpr = (1 to width).map(i => s"MAX(c$i)").mkString("", ",", ", MAX(value)") - Seq(10, 50, 90).foreach { percent => - filterPushDownBenchmark( - numRows, - s"Select $percent% ${ByteType.simpleString} rows " + - s"(value < CAST(${Byte.MaxValue * percent / 100} AS ${ByteType.simpleString}))", - s"value < CAST(${Byte.MaxValue * percent / 100} AS ${ByteType.simpleString})", - selectExpr - ) + Seq(s"value = CAST(${Byte.MaxValue / 2} AS ${ByteType.simpleString})") + .foreach { whereExpr => + val title = s"Select 1 ${ByteType.simpleString} row ($whereExpr)" + .replace("value AND value", "value") + filterPushDownBenchmark(numRows, title, whereExpr) + } + + val selectExpr = (1 to width).map(i => s"MAX(c$i)").mkString("", ",", ", MAX(value)") + Seq(10, 50, 90).foreach { percent => + filterPushDownBenchmark( + numRows, + s"Select $percent% ${ByteType.simpleString} rows " + + s"(value < CAST(${Byte.MaxValue * percent / 100} AS ${ByteType.simpleString}))", + s"value < CAST(${Byte.MaxValue * percent / 100} AS ${ByteType.simpleString})", + selectExpr + ) + } } } } - } - ignore(s"Pushdown benchmark for Timestamp") { - withTempPath { dir => - withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_TIMESTAMP_ENABLED.key -> true.toString) { - ParquetOutputTimestampType.values.toSeq.map(_.toString).foreach { fileType => - withSQLConf(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key -> fileType) { - val columns = (1 to width).map(i => s"CAST(id AS string) c$i") - val df = spark.range(numRows).selectExpr(columns: _*) - .withColumn("value", monotonically_increasing_id().cast(TimestampType)) - withTempTable("orcTable", "parquetTable") { - saveAsTable(df, dir) - - Seq(s"value = CAST($mid AS timestamp)").foreach { whereExpr => - val title = s"Select 1 timestamp stored as $fileType row ($whereExpr)" - .replace("value AND value", "value") - filterPushDownBenchmark(numRows, title, whereExpr) - } - - val selectExpr = (1 to width).map(i => s"MAX(c$i)").mkString("", ",", ", MAX(value)") - Seq(10, 50, 90).foreach { percent => - filterPushDownBenchmark( - numRows, - s"Select $percent% timestamp stored as $fileType rows " + - s"(value < CAST(${numRows * percent / 100} AS timestamp))", - s"value < CAST(${numRows * percent / 100} as timestamp)", - 
selectExpr - ) + runBenchmark(s"Pushdown benchmark for Timestamp") { + withTempPath { dir => + withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_TIMESTAMP_ENABLED.key -> true.toString) { + ParquetOutputTimestampType.values.toSeq.map(_.toString).foreach { fileType => + withSQLConf(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key -> fileType) { + val columns = (1 to width).map(i => s"CAST(id AS string) c$i") + val df = spark.range(numRows).selectExpr(columns: _*) + .withColumn("value", monotonically_increasing_id().cast(TimestampType)) + withTempTable("orcTable", "parquetTable") { + saveAsTable(df, dir) + + Seq(s"value = CAST($mid AS timestamp)").foreach { whereExpr => + val title = s"Select 1 timestamp stored as $fileType row ($whereExpr)" + .replace("value AND value", "value") + filterPushDownBenchmark(numRows, title, whereExpr) + } + + val selectExpr = (1 to width) + .map(i => s"MAX(c$i)").mkString("", ",", ", MAX(value)") + Seq(10, 50, 90).foreach { percent => + filterPushDownBenchmark( + numRows, + s"Select $percent% timestamp stored as $fileType rows " + + s"(value < CAST(${numRows * percent / 100} AS timestamp))", + s"value < CAST(${numRows * percent / 100} as timestamp)", + selectExpr + ) + } } } } } } } - } - ignore(s"Pushdown benchmark with many filters") { - val numRows = 1 - val width = 500 - - withTempPath { dir => - val columns = (1 to width).map(i => s"id c$i") - val df = spark.range(1).selectExpr(columns: _*) - withTempTable("orcTable", "parquetTable") { - saveAsTable(df, dir) - Seq(1, 250, 500).foreach { numFilter => - val whereExpr = (1 to numFilter).map(i => s"c$i = 0").mkString(" and ") - // Note: InferFiltersFromConstraints will add more filters to this given filters - filterPushDownBenchmark(numRows, s"Select 1 row with $numFilter filters", whereExpr) + runBenchmark(s"Pushdown benchmark with many filters") { + val numRows = 1 + val width = 500 + + withTempPath { dir => + val columns = (1 to width).map(i => s"id c$i") + val df = spark.range(1).selectExpr(columns: _*) + withTempTable("orcTable", "parquetTable") { + saveAsTable(df, dir) + Seq(1, 250, 500).foreach { numFilter => + val whereExpr = (1 to numFilter).map(i => s"c$i = 0").mkString(" and ") + // Note: InferFiltersFromConstraints will add more filters to this given filters + filterPushDownBenchmark(numRows, s"Select 1 row with $numFilter filters", whereExpr) + } } } } } } - -trait BenchmarkBeforeAndAfterEachTest extends BeforeAndAfterEachTestData { this: Suite => - - override def beforeEach(td: TestData) { - super.beforeEach(td) - } - - override def afterEach(td: TestData) { - super.afterEach(td) - } -} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/JoinBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/JoinBenchmark.scala index 5a25d7230837..37744dccc06f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/JoinBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/JoinBenchmark.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.types.IntegerType * * Benchmarks in this file are skipped in normal builds. 
*/ -class JoinBenchmark extends BenchmarkBase { +class JoinBenchmark extends BenchmarkWithCodegen { ignore("broadcast hash join, long key") { val N = 20 << 20 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/MiscBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/MiscBenchmark.scala index f039aeaad442..43380869fefe 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/MiscBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/MiscBenchmark.scala @@ -17,251 +17,144 @@ package org.apache.spark.sql.execution.benchmark -import org.apache.spark.util.Benchmark +import org.apache.spark.benchmark.Benchmark /** * Benchmark to measure whole stage codegen performance. - * To run this: - * build/sbt "sql/test-only *benchmark.MiscBenchmark" - * - * Benchmarks in this file are skipped in normal builds. + * To run this benchmark: + * {{{ + * 1. without sbt: bin/spark-submit --class + * 2. build/sbt "sql/test:runMain " + * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain " + * Results will be written to "benchmarks/MiscBenchmark-results.txt". + * }}} */ -class MiscBenchmark extends BenchmarkBase { +object MiscBenchmark extends SqlBasedBenchmark { - ignore("filter & aggregate without group") { - val N = 500L << 22 - runBenchmark("range/filter/sum", N) { - sparkSession.range(N).filter("(id & 1) = 1").groupBy().sum().collect() + def filterAndAggregateWithoutGroup(numRows: Long): Unit = { + runBenchmark("filter & aggregate without group") { + codegenBenchmark("range/filter/sum", numRows) { + spark.range(numRows).filter("(id & 1) = 1").groupBy().sum().collect() + } } - /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.11 - Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz - - range/filter/sum: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - range/filter/sum codegen=false 30663 / 31216 68.4 14.6 1.0X - range/filter/sum codegen=true 2399 / 2409 874.1 1.1 12.8X - */ } - ignore("range/limit/sum") { - val N = 500L << 20 - runBenchmark("range/limit/sum", N) { - sparkSession.range(N).limit(1000000).groupBy().sum().collect() + def limitAndAggregateWithoutGroup(numRows: Long): Unit = { + runBenchmark("range/limit/sum") { + codegenBenchmark("range/limit/sum", numRows) { + spark.range(numRows).limit(1000000).groupBy().sum().collect() + } } - /* - Westmere E56xx/L56xx/X56xx (Nehalem-C) - range/limit/sum: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------- - range/limit/sum codegen=false 609 / 672 861.6 1.2 1.0X - range/limit/sum codegen=true 561 / 621 935.3 1.1 1.1X - */ } - ignore("sample") { - val N = 500 << 18 - runBenchmark("sample with replacement", N) { - sparkSession.range(N).sample(withReplacement = true, 0.01).groupBy().sum().collect() - } - /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.11 - Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz + def sample(numRows: Int): Unit = { + runBenchmark("sample") { + codegenBenchmark("sample with replacement", numRows) { + spark.range(numRows).sample(withReplacement = true, 0.01).groupBy().sum().collect() + } - sample with replacement: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - sample with replacement 
codegen=false 7073 / 7227 18.5 54.0 1.0X - sample with replacement codegen=true 5199 / 5203 25.2 39.7 1.4X - */ - - runBenchmark("sample without replacement", N) { - sparkSession.range(N).sample(withReplacement = false, 0.01).groupBy().sum().collect() + codegenBenchmark("sample without replacement", numRows) { + spark.range(numRows).sample(withReplacement = false, 0.01).groupBy().sum().collect() + } } - /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.11 - Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz - - sample without replacement: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - sample without replacement codegen=false 1508 / 1529 86.9 11.5 1.0X - sample without replacement codegen=true 644 / 662 203.5 4.9 2.3X - */ } - ignore("collect") { - val N = 1 << 20 - - val benchmark = new Benchmark("collect", N) - benchmark.addCase("collect 1 million") { iter => - sparkSession.range(N).collect() - } - benchmark.addCase("collect 2 millions") { iter => - sparkSession.range(N * 2).collect() - } - benchmark.addCase("collect 4 millions") { iter => - sparkSession.range(N * 4).collect() + def collect(numRows: Int): Unit = { + runBenchmark("collect") { + val benchmark = new Benchmark("collect", numRows, output = output) + benchmark.addCase("collect 1 million") { iter => + spark.range(numRows).collect() + } + benchmark.addCase("collect 2 millions") { iter => + spark.range(numRows * 2).collect() + } + benchmark.addCase("collect 4 millions") { iter => + spark.range(numRows * 4).collect() + } + benchmark.run() } - benchmark.run() - - /* - Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz - collect: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------- - collect 1 million 439 / 654 2.4 418.7 1.0X - collect 2 millions 961 / 1907 1.1 916.4 0.5X - collect 4 millions 3193 / 3895 0.3 3044.7 0.1X - */ } - ignore("collect limit") { - val N = 1 << 20 - - val benchmark = new Benchmark("collect limit", N) - benchmark.addCase("collect limit 1 million") { iter => - sparkSession.range(N * 4).limit(N).collect() + def collectLimit(numRows: Int): Unit = { + runBenchmark("collect limit") { + val benchmark = new Benchmark("collect limit", numRows, output = output) + benchmark.addCase("collect limit 1 million") { iter => + spark.range(numRows * 4).limit(numRows).collect() + } + benchmark.addCase("collect limit 2 millions") { iter => + spark.range(numRows * 4).limit(numRows * 2).collect() + } + benchmark.run() } - benchmark.addCase("collect limit 2 millions") { iter => - sparkSession.range(N * 4).limit(N * 2).collect() - } - benchmark.run() - - /* - model name : Westmere E56xx/L56xx/X56xx (Nehalem-C) - collect limit: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------- - collect limit 1 million 833 / 1284 1.3 794.4 1.0X - collect limit 2 millions 3348 / 4005 0.3 3193.3 0.2X - */ } - ignore("generate explode") { - val N = 1 << 24 - runBenchmark("generate explode array", N) { - val df = sparkSession.range(N).selectExpr( - "id as key", - "array(rand(), rand(), rand(), rand(), rand()) as values") - df.selectExpr("key", "explode(values) value").count() - } - - /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_92-b14 on Mac OS X 10.11.6 - Intel(R) Core(TM) i7-4980HQ CPU @ 2.80GHz - - generate explode array: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - 
------------------------------------------------------------------------------------------------ - generate explode array wholestage off 6920 / 7129 2.4 412.5 1.0X - generate explode array wholestage on 623 / 646 26.9 37.1 11.1X - */ - - runBenchmark("generate explode map", N) { - val df = sparkSession.range(N).selectExpr( - "id as key", - "map('a', rand(), 'b', rand(), 'c', rand(), 'd', rand(), 'e', rand()) pairs") - df.selectExpr("key", "explode(pairs) as (k, v)").count() - } - - /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_92-b14 on Mac OS X 10.11.6 - Intel(R) Core(TM) i7-4980HQ CPU @ 2.80GHz - - generate explode map: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - generate explode map wholestage off 11978 / 11993 1.4 714.0 1.0X - generate explode map wholestage on 866 / 919 19.4 51.6 13.8X - */ - - runBenchmark("generate posexplode array", N) { - val df = sparkSession.range(N).selectExpr( - "id as key", - "array(rand(), rand(), rand(), rand(), rand()) as values") - df.selectExpr("key", "posexplode(values) as (idx, value)").count() + def explode(numRows: Int): Unit = { + runBenchmark("generate explode") { + codegenBenchmark("generate explode array", numRows) { + val df = spark.range(numRows).selectExpr( + "id as key", + "array(rand(), rand(), rand(), rand(), rand()) as values") + df.selectExpr("key", "explode(values) value").count() + } + + codegenBenchmark("generate explode map", numRows) { + val df = spark.range(numRows).selectExpr( + "id as key", + "map('a', rand(), 'b', rand(), 'c', rand(), 'd', rand(), 'e', rand()) pairs") + df.selectExpr("key", "explode(pairs) as (k, v)").count() + } + + codegenBenchmark("generate posexplode array", numRows) { + val df = spark.range(numRows).selectExpr( + "id as key", + "array(rand(), rand(), rand(), rand(), rand()) as values") + df.selectExpr("key", "posexplode(values) as (idx, value)").count() + } + + codegenBenchmark("generate inline array", numRows) { + val df = spark.range(numRows).selectExpr( + "id as key", + "array((rand(), rand()), (rand(), rand()), (rand(), 0.0d)) as values") + df.selectExpr("key", "inline(values) as (r1, r2)").count() + } + + val M = 60000 + codegenBenchmark("generate big struct array", M) { + import spark.implicits._ + val df = spark.sparkContext.parallelize(Seq(("1", + Array.fill(M)({ + val i = math.random + (i.toString, (i + 1).toString, (i + 2).toString, (i + 3).toString) + })))).toDF("col", "arr") + + df.selectExpr("*", "explode(arr) as arr_col") + .select("col", "arr_col.*").count + } } - - /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_92-b14 on Mac OS X 10.11.6 - Intel(R) Core(TM) i7-4980HQ CPU @ 2.80GHz - - generate posexplode array: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - generate posexplode array wholestage off 7502 / 7513 2.2 447.1 1.0X - generate posexplode array wholestage on 617 / 623 27.2 36.8 12.2X - */ - - runBenchmark("generate inline array", N) { - val df = sparkSession.range(N).selectExpr( - "id as key", - "array((rand(), rand()), (rand(), rand()), (rand(), 0.0d)) as values") - df.selectExpr("key", "inline(values) as (r1, r2)").count() - } - - /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_92-b14 on Mac OS X 10.11.6 - Intel(R) Core(TM) i7-4980HQ CPU @ 2.80GHz - - generate inline array: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - 
------------------------------------------------------------------------------------------------ - generate inline array wholestage off 6901 / 6928 2.4 411.3 1.0X - generate inline array wholestage on 1001 / 1010 16.8 59.7 6.9X - */ - - val M = 60000 - runBenchmark("generate big struct array", M) { - import sparkSession.implicits._ - val df = sparkSession.sparkContext.parallelize(Seq(("1", - Array.fill(M)({ - val i = math.random - (i.toString, (i + 1).toString, (i + 2).toString, (i + 3).toString) - })))).toDF("col", "arr") - - df.selectExpr("*", "expode(arr) as arr_col") - .select("col", "arr_col.*").count - } - - /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_151-b12 on Mac OS X 10.12.6 - Intel(R) Core(TM) i7-4980HQ CPU @ 2.80GHz - - test the impact of adding the optimization of Generate.unrequiredChildIndex, - we can see enormous improvement of x250 in this case! and it grows O(n^2). - - with Optimization ON: - - generate big struct array: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - generate big struct array wholestage off 331 / 378 0.2 5524.9 1.0X - generate big struct array wholestage on 205 / 232 0.3 3413.1 1.6X - - with Optimization OFF: - - generate big struct array: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - generate big struct array wholestage off 49697 / 51496 0.0 828277.7 1.0X - generate big struct array wholestage on 50558 / 51434 0.0 842641.6 1.0X - */ - } - ignore("generate regular generator") { - val N = 1 << 24 - runBenchmark("generate stack", N) { - val df = sparkSession.range(N).selectExpr( - "id as key", - "id % 2 as t1", - "id % 3 as t2", - "id % 5 as t3", - "id % 7 as t4", - "id % 13 as t5") - df.selectExpr("key", "stack(4, t1, t2, t3, t4, t5)").count() + def stack(numRows: Int): Unit = { + runBenchmark("generate regular generator") { + codegenBenchmark("generate stack", numRows) { + val df = spark.range(numRows).selectExpr( + "id as key", + "id % 2 as t1", + "id % 3 as t2", + "id % 5 as t3", + "id % 7 as t4", + "id % 13 as t5") + df.selectExpr("key", "stack(4, t1, t2, t3, t4, t5)").count() + } } - - /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_92-b14 on Mac OS X 10.11.6 - Intel(R) Core(TM) i7-4980HQ CPU @ 2.80GHz - - generate stack: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - generate stack wholestage off 12953 / 13070 1.3 772.1 1.0X - generate stack wholestage on 836 / 847 20.1 49.8 15.5X - */ } + override def runBenchmarkSuite(): Unit = { + filterAndAggregateWithoutGroup(500L << 22) + limitAndAggregateWithoutGroup(500L << 20) + sample(500 << 18) + collect(1 << 20) + collectLimit(1 << 20) + explode(1 << 24) + stack(1 << 24) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/PrimitiveArrayBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/PrimitiveArrayBenchmark.scala index e7c8f2717fd7..83edf73abfae 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/PrimitiveArrayBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/PrimitiveArrayBenchmark.scala @@ -17,21 +17,30 @@ package org.apache.spark.sql.execution.benchmark -import scala.concurrent.duration._ - -import org.apache.spark.SparkConf -import org.apache.spark.sql.catalyst.util._ 
-import org.apache.spark.util.Benchmark +import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} +import org.apache.spark.sql.SparkSession /** - * Benchmark [[PrimitiveArray]] for DataFrame and Dataset program using primitive array - * To run this: - * 1. replace ignore(...) with test(...) - * 2. build/sbt "sql/test-only *benchmark.PrimitiveArrayBenchmark" - * - * Benchmarks in this file are skipped in normal builds. + * Benchmark primitive arrays via DataFrame and Dataset program using primitive arrays + * To run this benchmark: + * 1. without sbt: bin/spark-submit --class + * 2. build/sbt "sql/test:runMain " + * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain " + * Results will be written to "benchmarks/PrimitiveArrayBenchmark-results.txt". */ -class PrimitiveArrayBenchmark extends BenchmarkBase { +object PrimitiveArrayBenchmark extends BenchmarkBase { + lazy val sparkSession = SparkSession.builder + .master("local[1]") + .appName("microbenchmark") + .config("spark.sql.shuffle.partitions", 1) + .config("spark.sql.autoBroadcastJoinThreshold", 1) + .getOrCreate() + + override def runBenchmarkSuite(): Unit = { + runBenchmark("Write primitive arrays in dataset") { + writeDatasetArray(4) + } + } def writeDatasetArray(iters: Int): Unit = { import sparkSession.implicits._ @@ -62,21 +71,9 @@ class PrimitiveArrayBenchmark extends BenchmarkBase { } } - val benchmark = new Benchmark("Write an array in Dataset", count * iters) + val benchmark = new Benchmark("Write an array in Dataset", count * iters, output = output) benchmark.addCase("Int ")(intArray) benchmark.addCase("Double")(doubleArray) benchmark.run - /* - OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.4.11-200.fc22.x86_64 - Intel Xeon E3-12xx v2 (Ivy Bridge) - Write an array in Dataset: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Int 352 / 401 23.8 42.0 1.0X - Double 821 / 885 10.2 97.9 0.4X - */ - } - - ignore("Write an array in Dataset") { - writeDatasetArray(4) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/SortBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/SortBenchmark.scala index 470b93efd197..9a54e2320b80 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/SortBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/SortBenchmark.scala @@ -19,24 +19,27 @@ package org.apache.spark.sql.execution.benchmark import java.util.{Arrays, Comparator} +import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} import org.apache.spark.unsafe.array.LongArray -import org.apache.spark.unsafe.memory.OnHeapMemoryBlock -import org.apache.spark.util.Benchmark +import org.apache.spark.unsafe.memory.MemoryBlock import org.apache.spark.util.collection.Sorter import org.apache.spark.util.collection.unsafe.sort._ import org.apache.spark.util.random.XORShiftRandom /** * Benchmark to measure performance for aggregate primitives. - * To run this: - * build/sbt "sql/test-only *benchmark.SortBenchmark" - * - * Benchmarks in this file are skipped in normal builds. + * {{{ + * To run this benchmark: + * 1. without sbt: bin/spark-submit --class + * 2. build/sbt "sql/test:runMain " + * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain " + * Results will be written to "benchmarks/-results.txt". 
+ * }}} */ -class SortBenchmark extends BenchmarkBase { +object SortBenchmark extends BenchmarkBase { private def referenceKeyPrefixSort(buf: LongArray, lo: Int, hi: Int, refCmp: PrefixComparator) { - val sortBuffer = new LongArray(new OnHeapMemoryBlock(buf.size() * 8L)) + val sortBuffer = new LongArray(MemoryBlock.fromLongArray(new Array[Long](buf.size().toInt))) new Sorter(new UnsafeSortDataFormat(sortBuffer)).sort( buf, lo, hi, new Comparator[RecordPointerAndKeyPrefix] { override def compare( @@ -50,17 +53,17 @@ class SortBenchmark extends BenchmarkBase { private def generateKeyPrefixTestData(size: Int, rand: => Long): (LongArray, LongArray) = { val ref = Array.tabulate[Long](size * 2) { i => rand } val extended = ref ++ Array.fill[Long](size * 2)(0) - (new LongArray(OnHeapMemoryBlock.fromArray(ref)), - new LongArray(OnHeapMemoryBlock.fromArray(extended))) + (new LongArray(MemoryBlock.fromLongArray(ref)), + new LongArray(MemoryBlock.fromLongArray(extended))) } - ignore("sort") { + def sortBenchmark(): Unit = { val size = 25000000 val rand = new XORShiftRandom(123) - val benchmark = new Benchmark("radix sort " + size, size) + val benchmark = new Benchmark("radix sort " + size, size, output = output) benchmark.addTimerCase("reference TimSort key prefix array") { timer => val array = Array.tabulate[Long](size * 2) { i => rand.nextLong } - val buf = new LongArray(OnHeapMemoryBlock.fromArray(array)) + val buf = new LongArray(MemoryBlock.fromLongArray(array)) timer.startTiming() referenceKeyPrefixSort(buf, 0, size, PrefixComparators.BINARY) timer.stopTiming() @@ -78,7 +81,7 @@ class SortBenchmark extends BenchmarkBase { array(i) = rand.nextLong & 0xff i += 1 } - val buf = new LongArray(OnHeapMemoryBlock.fromArray(array)) + val buf = new LongArray(MemoryBlock.fromLongArray(array)) timer.startTiming() RadixSort.sort(buf, size, 0, 7, false, false) timer.stopTiming() @@ -90,7 +93,7 @@ class SortBenchmark extends BenchmarkBase { array(i) = rand.nextLong & 0xffff i += 1 } - val buf = new LongArray(OnHeapMemoryBlock.fromArray(array)) + val buf = new LongArray(MemoryBlock.fromLongArray(array)) timer.startTiming() RadixSort.sort(buf, size, 0, 7, false, false) timer.stopTiming() @@ -102,7 +105,7 @@ class SortBenchmark extends BenchmarkBase { array(i) = rand.nextLong i += 1 } - val buf = new LongArray(OnHeapMemoryBlock.fromArray(array)) + val buf = new LongArray(MemoryBlock.fromLongArray(array)) timer.startTiming() RadixSort.sort(buf, size, 0, 7, false, false) timer.stopTiming() @@ -114,20 +117,11 @@ class SortBenchmark extends BenchmarkBase { timer.stopTiming() } benchmark.run() + } - /* - Running benchmark: radix sort 25000000 - Java HotSpot(TM) 64-Bit Server VM 1.8.0_66-b17 on Linux 3.13.0-44-generic - Intel(R) Core(TM) i7-4600U CPU @ 2.10GHz - - radix sort 25000000: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------- - reference TimSort key prefix array 15546 / 15859 1.6 621.9 1.0X - reference Arrays.sort 2416 / 2446 10.3 96.6 6.4X - radix sort one byte 133 / 137 188.4 5.3 117.2X - radix sort two bytes 255 / 258 98.2 10.2 61.1X - radix sort eight bytes 991 / 997 25.2 39.6 15.7X - radix sort key prefix array 1540 / 1563 16.2 61.6 10.1X - */ + override def runBenchmarkSuite(): Unit = { + runBenchmark("radix sort") { + sortBenchmark() + } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/SqlBasedBenchmark.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/SqlBasedBenchmark.scala new file mode 100644 index 000000000000..e95e5a960246 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/SqlBasedBenchmark.scala @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.benchmark + +import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.plans.SQLHelper +import org.apache.spark.sql.internal.SQLConf + +/** + * Common base trait to run benchmark with the Dataset and DataFrame API. + */ +trait SqlBasedBenchmark extends BenchmarkBase with SQLHelper { + + protected val spark: SparkSession = getSparkSession + + /** Subclass can override this function to build their own SparkSession */ + def getSparkSession: SparkSession = { + SparkSession.builder() + .master("local[1]") + .appName(this.getClass.getCanonicalName) + .config(SQLConf.SHUFFLE_PARTITIONS.key, 1) + .config(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key, 1) + .getOrCreate() + } + + /** Runs function `f` with whole stage codegen on and off. */ + final def codegenBenchmark(name: String, cardinality: Long)(f: => Unit): Unit = { + val benchmark = new Benchmark(name, cardinality, output = output) + + benchmark.addCase(s"$name wholestage off", numIters = 2) { _ => + withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false") { + f + } + } + + benchmark.addCase(s"$name wholestage on", numIters = 5) { _ => + withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "true") { + f + } + } + + benchmark.run() + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala index fccee97820e7..2d72b1c14af7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala @@ -18,13 +18,13 @@ package org.apache.spark.sql.execution.benchmark import org.apache.spark.SparkConf +import org.apache.spark.benchmark.Benchmark import org.apache.spark.internal.Logging import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.catalog.HiveTableRelation import org.apache.spark.sql.catalyst.plans.logical.SubqueryAlias import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.execution.datasources.LogicalRelation -import org.apache.spark.util.Benchmark /** * Benchmark to measure TPCDS query performance. 
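With the pieces above in place, a suite built on the new infrastructure reduces to roughly the following shape (a minimal sketch for orientation; MyBenchmark and the query inside it are illustrative and not part of this patch, whose real suites are the objects shown in this diff):

    object MyBenchmark extends SqlBasedBenchmark {
      override def runBenchmarkSuite(): Unit = {
        runBenchmark("filter & count") {
          // codegenBenchmark adds a "wholestage off" and a "wholestage on" case for the
          // body and writes both measurements to the shared `output` stream.
          codegenBenchmark("filter & count", 1 << 20) {
            spark.range(1 << 20).filter("(id & 1) = 1").count()
          }
        }
      }
    }

Run with SPARK_GENERATE_BENCHMARK_FILES=1, the results land in "benchmarks/<Name>-results.txt", following the pattern the updated scaladoc headers in this patch describe.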
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/UnsafeArrayDataBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/UnsafeArrayDataBenchmark.scala index 6c7779b5790d..79eaeab9c399 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/UnsafeArrayDataBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/UnsafeArrayDataBenchmark.scala @@ -19,20 +19,21 @@ package org.apache.spark.sql.execution.benchmark import scala.util.Random +import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder -import org.apache.spark.sql.catalyst.expressions.{UnsafeArrayData, UnsafeRow} -import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, UnsafeArrayWriter} -import org.apache.spark.util.Benchmark +import org.apache.spark.sql.catalyst.expressions.UnsafeArrayData /** * Benchmark [[UnsafeArrayDataBenchmark]] for UnsafeArrayData - * To run this: - * 1. replace ignore(...) with test(...) - * 2. build/sbt "sql/test-only *benchmark.UnsafeArrayDataBenchmark" - * - * Benchmarks in this file are skipped in normal builds. + * To run this benchmark: + * {{{ + * 1. without sbt: bin/spark-submit --class + * 2. build/sbt "sql/test:runMain " + * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain " + * Results will be written to "benchmarks/UnsafeArrayDataBenchmark-results.txt". + * }}} */ -class UnsafeArrayDataBenchmark extends BenchmarkBase { +object UnsafeArrayDataBenchmark extends BenchmarkBase { def calculateHeaderPortionInBytes(count: Int) : Int = { /* 4 + 4 * count // Use this expression for SPARK-15962 */ @@ -77,18 +78,10 @@ class UnsafeArrayDataBenchmark extends BenchmarkBase { } } - val benchmark = new Benchmark("Read UnsafeArrayData", count * iters) + val benchmark = new Benchmark("Read UnsafeArrayData", count * iters, output = output) benchmark.addCase("Int")(readIntArray) benchmark.addCase("Double")(readDoubleArray) benchmark.run - /* - OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.4.11-200.fc22.x86_64 - Intel Xeon E3-12xx v2 (Ivy Bridge) - Read UnsafeArrayData: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Int 252 / 260 666.1 1.5 1.0X - Double 281 / 292 597.7 1.7 0.9X - */ } def writeUnsafeArray(iters: Int): Unit = { @@ -121,18 +114,10 @@ class UnsafeArrayDataBenchmark extends BenchmarkBase { doubleTotalLength = len } - val benchmark = new Benchmark("Write UnsafeArrayData", count * iters) + val benchmark = new Benchmark("Write UnsafeArrayData", count * iters, output = output) benchmark.addCase("Int")(writeIntArray) benchmark.addCase("Double")(writeDoubleArray) benchmark.run - /* - OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.4.11-200.fc22.x86_64 - Intel Xeon E3-12xx v2 (Ivy Bridge) - Write UnsafeArrayData: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Int 196 / 249 107.0 9.3 1.0X - Double 227 / 367 92.3 10.8 0.9X - */ } def getPrimitiveArray(iters: Int): Unit = { @@ -167,18 +152,11 @@ class UnsafeArrayDataBenchmark extends BenchmarkBase { doubleTotalLength = len } - val benchmark = new Benchmark("Get primitive array from UnsafeArrayData", count * iters) + val benchmark = + new Benchmark("Get primitive array from UnsafeArrayData", count * iters, output = output) 
benchmark.addCase("Int")(readIntArray) benchmark.addCase("Double")(readDoubleArray) benchmark.run - /* - OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.4.11-200.fc22.x86_64 - Intel Xeon E3-12xx v2 (Ivy Bridge) - Get primitive array from UnsafeArrayData: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Int 151 / 198 415.8 2.4 1.0X - Double 214 / 394 293.6 3.4 0.7X - */ } def putPrimitiveArray(iters: Int): Unit = { @@ -209,24 +187,19 @@ class UnsafeArrayDataBenchmark extends BenchmarkBase { doubleTotalLen = len } - val benchmark = new Benchmark("Create UnsafeArrayData from primitive array", count * iters) + val benchmark = + new Benchmark("Create UnsafeArrayData from primitive array", count * iters, output = output) benchmark.addCase("Int")(createIntArray) benchmark.addCase("Double")(createDoubleArray) benchmark.run - /* - OpenJDK 64-Bit Server VM 1.8.0_91-b14 on Linux 4.4.11-200.fc22.x86_64 - Intel Xeon E3-12xx v2 (Ivy Bridge) - Create UnsafeArrayData from primitive array: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Int 206 / 211 306.0 3.3 1.0X - Double 232 / 406 271.6 3.7 0.9X - */ } - ignore("Benchmark UnsafeArrayData") { - readUnsafeArray(10) - writeUnsafeArray(10) - getPrimitiveArray(5) - putPrimitiveArray(5) + override def runBenchmarkSuite(): Unit = { + runBenchmark("Benchmark UnsafeArrayData") { + readUnsafeArray(10) + writeUnsafeArray(10) + getPrimitiveArray(5) + putPrimitiveArray(5) + } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/WideSchemaBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/WideSchemaBenchmark.scala index a42891e55a18..81017a6d244f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/WideSchemaBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/WideSchemaBenchmark.scala @@ -22,8 +22,9 @@ import java.io.{File, FileOutputStream, OutputStream} import org.scalatest.BeforeAndAfterEach import org.apache.spark.SparkFunSuite +import org.apache.spark.benchmark.Benchmark import org.apache.spark.sql.functions._ -import org.apache.spark.util.{Benchmark, Utils} +import org.apache.spark.util.Utils /** * Benchmark for performance with very wide and nested DataFrames. 
@@ -54,8 +55,11 @@ class WideSchemaBenchmark extends SparkFunSuite with BeforeAndAfterEach { } override def afterAll() { - super.afterAll() - out.close() + try { + out.close() + } finally { + super.afterAll() + } } override def afterEach() { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/CompressionSchemeBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/CompressionSchemeBenchmark.scala index 619b76fabdd5..0f9079744a22 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/CompressionSchemeBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/CompressionSchemeBenchmark.scala @@ -23,16 +23,23 @@ import java.nio.charset.StandardCharsets import org.apache.commons.lang3.RandomStringUtils import org.apache.commons.math3.distribution.LogNormalDistribution +import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.execution.columnar.{BOOLEAN, INT, LONG, NativeColumnType, SHORT, STRING} import org.apache.spark.sql.types.AtomicType -import org.apache.spark.util.Benchmark import org.apache.spark.util.Utils._ /** * Benchmark to decoders using various compression schemes. + * To run this benchmark: + * {{{ + * 1. without sbt: bin/spark-submit --class + * 2. build/sbt "sql/test:runMain " + * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain " + * Results will be written to "benchmarks/CompressionSchemeBenchmark-results.txt". + * }}} */ -object CompressionSchemeBenchmark extends AllCompressionSchemes { +object CompressionSchemeBenchmark extends BenchmarkBase with AllCompressionSchemes { private[this] def allocateLocal(size: Int): ByteBuffer = { ByteBuffer.allocate(size).order(ByteOrder.nativeOrder) @@ -77,7 +84,7 @@ object CompressionSchemeBenchmark extends AllCompressionSchemes { count: Int, tpe: NativeColumnType[T], input: ByteBuffer): Unit = { - val benchmark = new Benchmark(name, iters * count.toLong) + val benchmark = new Benchmark(name, iters * count.toLong, output = output) schemes.filter(_.supports(tpe)).foreach { scheme => val (compressFunc, compressionRatio, buf) = prepareEncodeInternal(count, tpe, scheme, input) @@ -101,7 +108,7 @@ object CompressionSchemeBenchmark extends AllCompressionSchemes { count: Int, tpe: NativeColumnType[T], input: ByteBuffer): Unit = { - val benchmark = new Benchmark(name, iters * count.toLong) + val benchmark = new Benchmark(name, iters * count.toLong, output = output) schemes.filter(_.supports(tpe)).foreach { scheme => val (compressFunc, _, buf) = prepareEncodeInternal(count, tpe, scheme, input) @@ -138,21 +145,7 @@ object CompressionSchemeBenchmark extends AllCompressionSchemes { testData.put(i * BOOLEAN.defaultSize, g()) } - // Intel(R) Core(TM) i7-4578U CPU @ 3.00GHz - // BOOLEAN Encode: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - // ------------------------------------------------------------------------------------------- - // PassThrough(1.000) 3 / 4 19300.2 0.1 1.0X - // RunLengthEncoding(2.491) 923 / 939 72.7 13.8 0.0X - // BooleanBitSet(0.125) 359 / 363 187.1 5.3 0.0X runEncodeBenchmark("BOOLEAN Encode", iters, count, BOOLEAN, testData) - - - // Intel(R) Core(TM) i7-4578U CPU @ 3.00GHz - // BOOLEAN Decode: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - // ------------------------------------------------------------------------------------------- - // 
PassThrough 129 / 136 519.8 1.9 1.0X - // RunLengthEncoding 613 / 623 109.4 9.1 0.2X - // BooleanBitSet 1196 / 1222 56.1 17.8 0.1X runDecodeBenchmark("BOOLEAN Decode", iters, count, BOOLEAN, testData) } @@ -165,18 +158,7 @@ object CompressionSchemeBenchmark extends AllCompressionSchemes { testData.putShort(i * SHORT.defaultSize, g1().toShort) } - // Intel(R) Core(TM) i7-4578U CPU @ 3.00GHz - // SHORT Encode (Lower Skew): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - // ------------------------------------------------------------------------------------------- - // PassThrough(1.000) 6 / 7 10971.4 0.1 1.0X - // RunLengthEncoding(1.510) 1526 / 1542 44.0 22.7 0.0X runEncodeBenchmark("SHORT Encode (Lower Skew)", iters, count, SHORT, testData) - - // Intel(R) Core(TM) i7-4578U CPU @ 3.00GHz - // SHORT Decode (Lower Skew): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - // ------------------------------------------------------------------------------------------- - // PassThrough 811 / 837 82.8 12.1 1.0X - // RunLengthEncoding 1219 / 1266 55.1 18.2 0.7X runDecodeBenchmark("SHORT Decode (Lower Skew)", iters, count, SHORT, testData) val g2 = genHigherSkewData() @@ -184,18 +166,7 @@ object CompressionSchemeBenchmark extends AllCompressionSchemes { testData.putShort(i * SHORT.defaultSize, g2().toShort) } - // Intel(R) Core(TM) i7-4578U CPU @ 3.00GHz - // SHORT Encode (Higher Skew): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - // ------------------------------------------------------------------------------------------- - // PassThrough(1.000) 7 / 7 10112.4 0.1 1.0X - // RunLengthEncoding(2.009) 1623 / 1661 41.4 24.2 0.0X runEncodeBenchmark("SHORT Encode (Higher Skew)", iters, count, SHORT, testData) - - // Intel(R) Core(TM) i7-4578U CPU @ 3.00GHz - // SHORT Decode (Higher Skew): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - // ------------------------------------------------------------------------------------------- - // PassThrough 818 / 827 82.0 12.2 1.0X - // RunLengthEncoding 1202 / 1237 55.8 17.9 0.7X runDecodeBenchmark("SHORT Decode (Higher Skew)", iters, count, SHORT, testData) } @@ -208,22 +179,7 @@ object CompressionSchemeBenchmark extends AllCompressionSchemes { testData.putInt(i * INT.defaultSize, g1().toInt) } - // Intel(R) Core(TM) i7-4578U CPU @ 3.00GHz - // INT Encode (Lower Skew): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - // ------------------------------------------------------------------------------------------- - // PassThrough(1.000) 18 / 19 3716.4 0.3 1.0X - // RunLengthEncoding(1.001) 1992 / 2056 33.7 29.7 0.0X - // DictionaryEncoding(0.500) 723 / 739 92.8 10.8 0.0X - // IntDelta(0.250) 368 / 377 182.2 5.5 0.0X runEncodeBenchmark("INT Encode (Lower Skew)", iters, count, INT, testData) - - // Intel(R) Core(TM) i7-4578U CPU @ 3.00GHz - // INT Decode (Lower Skew): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - // ------------------------------------------------------------------------------------------- - // PassThrough 821 / 845 81.8 12.2 1.0X - // RunLengthEncoding 1246 / 1256 53.9 18.6 0.7X - // DictionaryEncoding 757 / 766 88.6 11.3 1.1X - // IntDelta 680 / 689 98.7 10.1 1.2X runDecodeBenchmark("INT Decode (Lower Skew)", iters, count, INT, testData) val g2 = genHigherSkewData() @@ -231,22 +187,7 @@ object CompressionSchemeBenchmark extends AllCompressionSchemes { testData.putInt(i * INT.defaultSize, g2().toInt) } - // Intel(R) Core(TM) i7-4578U CPU @ 3.00GHz - // INT Encode (Higher Skew): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - // 
------------------------------------------------------------------------------------------- - // PassThrough(1.000) 17 / 19 3888.4 0.3 1.0X - // RunLengthEncoding(1.339) 2127 / 2148 31.5 31.7 0.0X - // DictionaryEncoding(0.501) 960 / 972 69.9 14.3 0.0X - // IntDelta(0.250) 362 / 366 185.5 5.4 0.0X runEncodeBenchmark("INT Encode (Higher Skew)", iters, count, INT, testData) - - // Intel(R) Core(TM) i7-4578U CPU @ 3.00GHz - // INT Decode (Higher Skew): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - // ------------------------------------------------------------------------------------------- - // PassThrough 838 / 884 80.1 12.5 1.0X - // RunLengthEncoding 1287 / 1311 52.1 19.2 0.7X - // DictionaryEncoding 844 / 859 79.5 12.6 1.0X - // IntDelta 764 / 784 87.8 11.4 1.1X runDecodeBenchmark("INT Decode (Higher Skew)", iters, count, INT, testData) } @@ -259,22 +200,7 @@ object CompressionSchemeBenchmark extends AllCompressionSchemes { testData.putLong(i * LONG.defaultSize, g1().toLong) } - // Intel(R) Core(TM) i7-4578U CPU @ 3.00GHz - // LONG Encode (Lower Skew): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - // ------------------------------------------------------------------------------------------- - // PassThrough(1.000) 37 / 38 1804.8 0.6 1.0X - // RunLengthEncoding(0.748) 2065 / 2094 32.5 30.8 0.0X - // DictionaryEncoding(0.250) 950 / 962 70.6 14.2 0.0X - // LongDelta(0.125) 475 / 482 141.2 7.1 0.1X runEncodeBenchmark("LONG Encode (Lower Skew)", iters, count, LONG, testData) - - // Intel(R) Core(TM) i7-4578U CPU @ 3.00GHz - // LONG Decode (Lower Skew): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - // ------------------------------------------------------------------------------------------- - // PassThrough 888 / 894 75.5 13.2 1.0X - // RunLengthEncoding 1301 / 1311 51.6 19.4 0.7X - // DictionaryEncoding 887 / 904 75.7 13.2 1.0X - // LongDelta 693 / 735 96.8 10.3 1.3X runDecodeBenchmark("LONG Decode (Lower Skew)", iters, count, LONG, testData) val g2 = genHigherSkewData() @@ -282,22 +208,7 @@ object CompressionSchemeBenchmark extends AllCompressionSchemes { testData.putLong(i * LONG.defaultSize, g2().toLong) } - // Intel(R) Core(TM) i7-4578U CPU @ 3.00GHz - // LONG Encode (Higher Skew): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - // ------------------------------------------------------------------------------------------- - // PassThrough(1.000) 34 / 35 1963.9 0.5 1.0X - // RunLengthEncoding(0.999) 2260 / 3021 29.7 33.7 0.0X - // DictionaryEncoding(0.251) 1270 / 1438 52.8 18.9 0.0X - // LongDelta(0.125) 496 / 509 135.3 7.4 0.1X runEncodeBenchmark("LONG Encode (Higher Skew)", iters, count, LONG, testData) - - // Intel(R) Core(TM) i7-4578U CPU @ 3.00GHz - // LONG Decode (Higher Skew): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - // ------------------------------------------------------------------------------------------- - // PassThrough 965 / 1494 69.5 14.4 1.0X - // RunLengthEncoding 1350 / 1378 49.7 20.1 0.7X - // DictionaryEncoding 892 / 924 75.2 13.3 1.1X - // LongDelta 817 / 847 82.2 12.2 1.2X runDecodeBenchmark("LONG Decode (Higher Skew)", iters, count, LONG, testData) } @@ -318,28 +229,17 @@ object CompressionSchemeBenchmark extends AllCompressionSchemes { } testData.rewind() - // Intel(R) Core(TM) i7-4578U CPU @ 3.00GHz - // STRING Encode: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - // ------------------------------------------------------------------------------------------- - // PassThrough(1.000) 56 / 57 1197.9 0.8 1.0X - // 
RunLengthEncoding(0.893) 4892 / 4937 13.7 72.9 0.0X - // DictionaryEncoding(0.167) 2968 / 2992 22.6 44.2 0.0X runEncodeBenchmark("STRING Encode", iters, count, STRING, testData) - - // Intel(R) Core(TM) i7-4578U CPU @ 3.00GHz - // STRING Decode: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - // ------------------------------------------------------------------------------------------- - // PassThrough 2422 / 2449 27.7 36.1 1.0X - // RunLengthEncoding 2885 / 3018 23.3 43.0 0.8X - // DictionaryEncoding 2716 / 2752 24.7 40.5 0.9X runDecodeBenchmark("STRING Decode", iters, count, STRING, testData) } - def main(args: Array[String]): Unit = { - bitEncodingBenchmark(1024) - shortEncodingBenchmark(1024) - intEncodingBenchmark(1024) - longEncodingBenchmark(1024) - stringEncodingBenchmark(1024) + override def runBenchmarkSuite(): Unit = { + runBenchmark("Compression Scheme Benchmark") { + bitEncodingBenchmark(1024) + shortEncodingBenchmark(1024) + intEncodingBenchmark(1024) + longEncodingBenchmark(1024) + stringEncodingBenchmark(1024) + } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/BasicWriteTaskStatsTrackerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/BasicWriteTaskStatsTrackerSuite.scala index bf3c8ede9a98..32941d8d2cd1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/BasicWriteTaskStatsTrackerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/BasicWriteTaskStatsTrackerSuite.scala @@ -49,7 +49,11 @@ class BasicWriteTaskStatsTrackerSuite extends SparkFunSuite { * In teardown delete the temp dir. */ protected override def afterAll(): Unit = { - Utils.deleteRecursively(tempDir) + try { + Utils.deleteRecursively(tempDir) + } finally { + super.afterAll() + } } /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala index 18bb4bfe661c..49e7af4a9896 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala @@ -23,7 +23,7 @@ import java.net.URI import scala.collection.mutable import scala.language.reflectiveCalls -import org.apache.hadoop.fs.{FileStatus, Path, RawLocalFileSystem} +import org.apache.hadoop.fs.{BlockLocation, FileStatus, LocatedFileStatus, Path, RawLocalFileSystem} import org.apache.spark.metrics.source.HiveCatalogMetrics import org.apache.spark.sql.catalyst.util._ @@ -248,6 +248,26 @@ class FileIndexSuite extends SharedSQLContext { assert(spark.read.parquet(path.getAbsolutePath).schema.exists(_.name == colToUnescape)) } } + + test("SPARK-25062 - InMemoryFileIndex stores BlockLocation objects no matter what subclass " + + "the FS returns") { + withSQLConf("fs.file.impl" -> classOf[SpecialBlockLocationFileSystem].getName) { + withTempDir { dir => + val file = new File(dir, "text.txt") + stringToFile(file, "text") + + val inMemoryFileIndex = new InMemoryFileIndex( + spark, Seq(new Path(file.getCanonicalPath)), Map.empty, None) { + def leafFileStatuses = leafFiles.values + } + val blockLocations = inMemoryFileIndex.leafFileStatuses.flatMap( + _.asInstanceOf[LocatedFileStatus].getBlockLocations) + + assert(blockLocations.forall(_.getClass == classOf[BlockLocation])) + } + } + } + } class FakeParentPathFileSystem extends RawLocalFileSystem { @@ -257,3 +277,20 @@ class 
FakeParentPathFileSystem extends RawLocalFileSystem { URI.create("mockFs://some-bucket") } } + +class SpecialBlockLocationFileSystem extends RawLocalFileSystem { + + class SpecialBlockLocation( + names: Array[String], + hosts: Array[String], + offset: Long, + length: Long) + extends BlockLocation(names, hosts, offset, length) + + override def getFileBlockLocations( + file: FileStatus, + start: Long, + len: Long): Array[BlockLocation] = { + Array(new SpecialBlockLocation(Array("dummy"), Array("dummy"), 0L, file.getLen)) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVBenchmarks.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVBenchmarks.scala index 24f5f55d5548..5d1a874999c0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVBenchmarks.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVBenchmarks.scala @@ -16,20 +16,19 @@ */ package org.apache.spark.sql.execution.datasources.csv -import java.io.File - import org.apache.spark.SparkConf +import org.apache.spark.benchmark.Benchmark import org.apache.spark.sql.{Column, Row, SparkSession} +import org.apache.spark.sql.catalyst.plans.SQLHelper import org.apache.spark.sql.functions.lit import org.apache.spark.sql.types._ -import org.apache.spark.util.{Benchmark, Utils} /** * Benchmark to measure CSV read/write performance. * To run this: * spark-submit --class --jars */ -object CSVBenchmarks { +object CSVBenchmarks extends SQLHelper { val conf = new SparkConf() val spark = SparkSession.builder @@ -39,12 +38,6 @@ object CSVBenchmarks { .getOrCreate() import spark.implicits._ - def withTempPath(f: File => Unit): Unit = { - val path = Utils.createTempDir() - path.delete() - try f(path) finally Utils.deleteRecursively(path) - } - def quotedValuesBenchmark(rowsNum: Int, numIters: Int): Unit = { val benchmark = new Benchmark(s"Parsing quoted values", rowsNum) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 5a1d6679ebbd..5d4746cf90b3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -34,7 +34,7 @@ import org.apache.log4j.{AppenderSkeleton, LogManager} import org.apache.log4j.spi.LoggingEvent import org.apache.spark.SparkException -import org.apache.spark.sql.{AnalysisException, DataFrame, QueryTest, Row, UDT} +import org.apache.spark.sql.{AnalysisException, DataFrame, QueryTest, Row} import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.{SharedSQLContext, SQLTestUtils} @@ -50,6 +50,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te private val carsAltFile = "test-data/cars-alternative.csv" private val carsUnbalancedQuotesFile = "test-data/cars-unbalanced-quotes.csv" private val carsNullFile = "test-data/cars-null.csv" + private val carsEmptyValueFile = "test-data/cars-empty-value.csv" private val carsBlankColName = "test-data/cars-blank-column-name.csv" private val emptyFile = "test-data/empty.csv" private val commentsFile = "test-data/comments.csv" @@ -668,6 +669,70 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te assert(results(2).toSeq === Array(null, "Chevy", "Volt", null, 
null)) } + test("empty fields with user defined empty values") { + + // year,make,model,comment,blank + val dataSchema = StructType(List( + StructField("year", IntegerType, nullable = true), + StructField("make", StringType, nullable = false), + StructField("model", StringType, nullable = false), + StructField("comment", StringType, nullable = true), + StructField("blank", StringType, nullable = true))) + val cars = spark.read + .format("csv") + .schema(dataSchema) + .option("header", "true") + .option("emptyValue", "empty") + .load(testFile(carsEmptyValueFile)) + + verifyCars(cars, withHeader = true, checkValues = false) + val results = cars.collect() + assert(results(0).toSeq === Array(2012, "Tesla", "S", "empty", "empty")) + assert(results(1).toSeq === + Array(1997, "Ford", "E350", "Go get one now they are going fast", null)) + assert(results(2).toSeq === Array(2015, "Chevy", "Volt", null, "empty")) + } + + test("save csv with empty fields with user defined empty values") { + withTempDir { dir => + val csvDir = new File(dir, "csv").getCanonicalPath + + // year,make,model,comment,blank + val dataSchema = StructType(List( + StructField("year", IntegerType, nullable = true), + StructField("make", StringType, nullable = false), + StructField("model", StringType, nullable = false), + StructField("comment", StringType, nullable = true), + StructField("blank", StringType, nullable = true))) + val cars = spark.read + .format("csv") + .schema(dataSchema) + .option("header", "true") + .option("nullValue", "NULL") + .load(testFile(carsEmptyValueFile)) + + cars.coalesce(1).write + .format("csv") + .option("header", "true") + .option("emptyValue", "empty") + .option("nullValue", null) + .save(csvDir) + + val carsCopy = spark.read + .format("csv") + .schema(dataSchema) + .option("header", "true") + .load(csvDir) + + verifyCars(carsCopy, withHeader = true, checkValues = false) + val results = carsCopy.collect() + assert(results(0).toSeq === Array(2012, "Tesla", "S", "empty", "empty")) + assert(results(1).toSeq === + Array(1997, "Ford", "E350", "Go get one now they are going fast", null)) + assert(results(2).toSeq === Array(2015, "Chevy", "Volt", null, "empty")) + } + } + test("save csv with compression codec option") { withTempDir { dir => val csvDir = new File(dir, "csv").getCanonicalPath @@ -1375,6 +1440,52 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te } } + test("SPARK-25241: An empty string should not be coerced to null when emptyValue is passed.") { + val litNull: String = null + val df = Seq( + (1, "John Doe"), + (2, ""), + (3, "-"), + (4, litNull) + ).toDF("id", "name") + + // Checks for new behavior where a null is not coerced to an empty string when `emptyValue` is + // set to anything but an empty string literal. + withTempPath { path => + df.write + .option("emptyValue", "-") + .csv(path.getAbsolutePath) + val computed = spark.read + .option("emptyValue", "-") + .schema(df.schema) + .csv(path.getAbsolutePath) + val expected = Seq( + (1, "John Doe"), + (2, "-"), + (3, "-"), + (4, "-") + ).toDF("id", "name") + + checkAnswer(computed, expected) + } + // Keeps the old behavior where empty string us coerced to emptyValue is not passed. 
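// In other words: when the emptyValue option is not set on write or read, an empty string
// written out is read back as null (the previous default behavior), which is why the expected
// row for id 2 below uses litNull.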
+ withTempPath { path => + df.write + .csv(path.getAbsolutePath) + val computed = spark.read + .schema(df.schema) + .csv(path.getAbsolutePath) + val expected = Seq( + (1, "John Doe"), + (2, litNull), + (3, "-"), + (4, litNull) + ).toDF("id", "name") + + checkAnswer(computed, expected) + } + } + test("SPARK-24329: skip lines with comments, and one or multiple whitespaces") { val schema = new StructType().add("colA", StringType) val ds = spark @@ -1700,4 +1811,19 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te checkCount(2) countForMalformedCSV(0, Seq("")) } + + test("SPARK-25387: bad input should not cause NPE") { + val schema = StructType(StructField("a", IntegerType) :: Nil) + val input = spark.createDataset(Seq("\u0000\u0000\u0001234")) + + checkAnswer(spark.read.schema(schema).csv(input), Row(null)) + checkAnswer(spark.read.option("multiLine", true).schema(schema).csv(input), Row(null)) + assert(spark.read.csv(input).collect().toSet == Set(Row())) + } + + test("field names of inferred schema shouldn't compare to the first row") { + val input = Seq("1,2").toDS() + val df = spark.read.option("enforceSchema", false).csv(input) + checkAnswer(df, Row("1", "2")) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmarks.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmarks.scala index a2b747eaab41..3c4a5ab32724 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmarks.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmarks.scala @@ -19,17 +19,18 @@ package org.apache.spark.sql.execution.datasources.json import java.io.File import org.apache.spark.SparkConf +import org.apache.spark.benchmark.Benchmark import org.apache.spark.sql.{Row, SparkSession} +import org.apache.spark.sql.catalyst.plans.SQLHelper import org.apache.spark.sql.functions.lit import org.apache.spark.sql.types._ -import org.apache.spark.util.{Benchmark, Utils} /** * The benchmarks aims to measure performance of JSON parsing when encoding is set and isn't. 
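Note that CSVBenchmarks above, and JSONBenchmarks and CheckpointFileManagerTests further below, all drop their private copies of withTempPath in favour of the shared SQLHelper trait. Judging from the deleted copies, the inherited helper is expected to have this shape (the trait name here is illustrative, not the real one):

import java.io.File
import org.apache.spark.util.Utils

// Reconstructed from the withTempPath copies removed in this patch.
trait SQLHelperSketch {
  def withTempPath(f: File => Unit): Unit = {
    val path = Utils.createTempDir()  // allocate a temp directory
    path.delete()                     // delete it so callers get a fresh, unused path
    try f(path) finally Utils.deleteRecursively(path)  // always clean up afterwards
  }
}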
* To run this: * spark-submit --class --jars */ -object JSONBenchmarks { +object JSONBenchmarks extends SQLHelper { val conf = new SparkConf() val spark = SparkSession.builder @@ -39,13 +40,6 @@ object JSONBenchmarks { .getOrCreate() import spark.implicits._ - def withTempPath(f: File => Unit): Unit = { - val path = Utils.createTempDir() - path.delete() - try f(path) finally Utils.deleteRecursively(path) - } - - def schemaInferring(rowsNum: Int): Unit = { val benchmark = new Benchmark("JSON schema inferring", rowsNum) @@ -72,12 +66,13 @@ object JSONBenchmarks { } /* - Intel(R) Core(TM) i7-7920HQ CPU @ 3.10GHz + Java HotSpot(TM) 64-Bit Server VM 1.8.0_172-b11 on Mac OS X 10.13.5 + Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz - JSON schema inferring: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - -------------------------------------------------------------------------------------------- - No encoding 38902 / 39282 2.6 389.0 1.0X - UTF-8 is set 56959 / 57261 1.8 569.6 0.7X + JSON schema inferring: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + --------------------------------------------------------------------------------------------- + No encoding 45908 / 46480 2.2 459.1 1.0X + UTF-8 is set 68469 / 69762 1.5 684.7 0.7X */ benchmark.run() } @@ -113,12 +108,13 @@ object JSONBenchmarks { } /* - Intel(R) Core(TM) i7-7920HQ CPU @ 3.10GHz + Java HotSpot(TM) 64-Bit Server VM 1.8.0_172-b11 on Mac OS X 10.13.5 + Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz - JSON per-line parsing: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - -------------------------------------------------------------------------------------------- - No encoding 25947 / 26188 3.9 259.5 1.0X - UTF-8 is set 46319 / 46417 2.2 463.2 0.6X + JSON per-line parsing: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + --------------------------------------------------------------------------------------------- + No encoding 9982 / 10237 10.0 99.8 1.0X + UTF-8 is set 16373 / 16806 6.1 163.7 0.6X */ benchmark.run() } @@ -161,12 +157,13 @@ object JSONBenchmarks { } /* - Intel(R) Core(TM) i7-7920HQ CPU @ 3.10GHz + Java HotSpot(TM) 64-Bit Server VM 1.8.0_172-b11 on Mac OS X 10.13.5 + Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz - JSON parsing of wide lines: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - -------------------------------------------------------------------------------------------- - No encoding 45543 / 45660 0.2 4554.3 1.0X - UTF-8 is set 65737 / 65957 0.2 6573.7 0.7X + JSON parsing of wide lines: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + --------------------------------------------------------------------------------------------- + No encoding 26038 / 26386 0.4 2603.8 1.0X + UTF-8 is set 28343 / 28557 0.4 2834.3 0.9X */ benchmark.run() } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala index 02bfb7197ffc..dc81c0585bf1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala @@ -21,7 +21,13 @@ import java.io.File import java.sql.Timestamp import java.util.Locale +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path import org.apache.orc.OrcConf.COMPRESS +import org.apache.orc.OrcFile +import org.apache.orc.OrcProto.ColumnEncoding.Kind.{DICTIONARY_V2, DIRECT, DIRECT_V2} +import 
org.apache.orc.OrcProto.Stream.Kind +import org.apache.orc.impl.RecordReaderImpl import org.scalatest.BeforeAndAfterAll import org.apache.spark.sql.Row @@ -50,6 +56,136 @@ abstract class OrcSuite extends OrcTest with BeforeAndAfterAll { .createOrReplaceTempView("orc_temp_table") } + protected def testBloomFilterCreation(bloomFilterKind: Kind) { + val tableName = "bloomFilter" + + withTempDir { dir => + withTable(tableName) { + val sqlStatement = orcImp match { + case "native" => + s""" + |CREATE TABLE $tableName (a INT, b STRING) + |USING ORC + |OPTIONS ( + | path '${dir.toURI}', + | orc.bloom.filter.columns '*', + | orc.bloom.filter.fpp 0.1 + |) + """.stripMargin + case "hive" => + s""" + |CREATE TABLE $tableName (a INT, b STRING) + |STORED AS ORC + |LOCATION '${dir.toURI}' + |TBLPROPERTIES ( + | orc.bloom.filter.columns='*', + | orc.bloom.filter.fpp=0.1 + |) + """.stripMargin + case impl => + throw new UnsupportedOperationException(s"Unknown ORC implementation: $impl") + } + + sql(sqlStatement) + sql(s"INSERT INTO $tableName VALUES (1, 'str')") + + val partFiles = dir.listFiles() + .filter(f => f.isFile && !f.getName.startsWith(".") && !f.getName.startsWith("_")) + assert(partFiles.length === 1) + + val orcFilePath = new Path(partFiles.head.getAbsolutePath) + val readerOptions = OrcFile.readerOptions(new Configuration()) + val reader = OrcFile.createReader(orcFilePath, readerOptions) + var recordReader: RecordReaderImpl = null + try { + recordReader = reader.rows.asInstanceOf[RecordReaderImpl] + + // BloomFilter array is created for all types; `struct`, int (`a`), string (`b`) + val sargColumns = Array(true, true, true) + val orcIndex = recordReader.readRowIndex(0, null, sargColumns) + + // Check the types and counts of bloom filters + assert(orcIndex.getBloomFilterKinds.forall(_ === bloomFilterKind)) + assert(orcIndex.getBloomFilterIndex.forall(_.getBloomFilterCount > 0)) + } finally { + if (recordReader != null) { + recordReader.close() + } + } + } + } + } + + protected def testSelectiveDictionaryEncoding(isSelective: Boolean) { + val tableName = "orcTable" + + withTempDir { dir => + withTable(tableName) { + val sqlStatement = orcImp match { + case "native" => + s""" + |CREATE TABLE $tableName (zipcode STRING, uniqColumn STRING, value DOUBLE) + |USING ORC + |OPTIONS ( + | path '${dir.toURI}', + | orc.dictionary.key.threshold '1.0', + | orc.column.encoding.direct 'uniqColumn' + |) + """.stripMargin + case "hive" => + s""" + |CREATE TABLE $tableName (zipcode STRING, uniqColumn STRING, value DOUBLE) + |STORED AS ORC + |LOCATION '${dir.toURI}' + |TBLPROPERTIES ( + | orc.dictionary.key.threshold '1.0', + | hive.exec.orc.dictionary.key.size.threshold '1.0', + | orc.column.encoding.direct 'uniqColumn' + |) + """.stripMargin + case impl => + throw new UnsupportedOperationException(s"Unknown ORC implementation: $impl") + } + + sql(sqlStatement) + sql(s"INSERT INTO $tableName VALUES ('94086', 'random-uuid-string', 0.0)") + + val partFiles = dir.listFiles() + .filter(f => f.isFile && !f.getName.startsWith(".") && !f.getName.startsWith("_")) + assert(partFiles.length === 1) + + val orcFilePath = new Path(partFiles.head.getAbsolutePath) + val readerOptions = OrcFile.readerOptions(new Configuration()) + val reader = OrcFile.createReader(orcFilePath, readerOptions) + var recordReader: RecordReaderImpl = null + try { + recordReader = reader.rows.asInstanceOf[RecordReaderImpl] + + // Check the kind + val stripe = recordReader.readStripeFooter(reader.getStripes.get(0)) + + // The encodings are divided 
into direct or dictionary-based categories and + // further refined as to whether they use RLE v1 or v2. RLE v1 is used by + // Hive 0.11 and RLE v2 is introduced in Hive 0.12 ORC with more improvements. + // For more details, see https://orc.apache.org/specification/ + assert(stripe.getColumns(1).getKind === DICTIONARY_V2) + if (isSelective) { + assert(stripe.getColumns(2).getKind === DIRECT_V2) + } else { + assert(stripe.getColumns(2).getKind === DICTIONARY_V2) + } + // Floating point types are stored with DIRECT encoding in IEEE 754 floating + // point bit layout. + assert(stripe.getColumns(3).getKind === DIRECT) + } finally { + if (recordReader != null) { + recordReader.close() + } + } + } + } + } + test("create temporary orc table") { checkAnswer(sql("SELECT COUNT(*) FROM normal_orc_source"), Row(10)) @@ -215,4 +351,12 @@ class OrcSourceSuite extends OrcSuite with SharedSQLContext { |) """.stripMargin) } + + test("Check BloomFilter creation") { + testBloomFilterCreation(Kind.BLOOM_FILTER_UTF8) // After ORC-101 + } + + test("Enforce direct encoding column-wise selectively") { + testSelectiveDictionaryEncoding(isSelective = true) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala index 7ebb75009555..01e41b3c5df3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala @@ -750,7 +750,7 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex } } - test("SPARK-12218 Converting conjunctions into Parquet filter predicates") { + test("SPARK-12218 and SPARK-25559 Converting conjunctions into Parquet filter predicates") { val schema = StructType(Seq( StructField("a", IntegerType, nullable = false), StructField("b", StringType, nullable = true), @@ -770,7 +770,11 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex sources.GreaterThan("c", 1.5D))) } - assertResult(None) { + // Testing when `canRemoveOneSideInAnd == true` + // case sources.And(lhs, rhs) => + // ... + // case (Some(lhsFilter), None) if canRemoveOneSideInAnd => Some(lhsFilter) + assertResult(Some(lt(intColumn("a"), 10: Integer))) { parquetFilters.createFilter( parquetSchema, sources.And( @@ -778,6 +782,83 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex sources.StringContains("b", "prefix"))) } + // Testing when `canRemoveOneSideInAnd == true` + // case sources.And(lhs, rhs) => + // ... 
+ // case (None, Some(rhsFilter)) if canRemoveOneSideInAnd => Some(rhsFilter) + assertResult(Some(lt(intColumn("a"), 10: Integer))) { + parquetFilters.createFilter( + parquetSchema, + sources.And( + sources.StringContains("b", "prefix"), + sources.LessThan("a", 10))) + } + + // Testing complex And conditions + assertResult(Some( + FilterApi.and(lt(intColumn("a"), 10: Integer), gt(intColumn("a"), 5: Integer)))) { + parquetFilters.createFilter( + parquetSchema, + sources.And( + sources.And( + sources.LessThan("a", 10), + sources.StringContains("b", "prefix") + ), + sources.GreaterThan("a", 5))) + } + + // Testing complex And conditions + assertResult(Some( + FilterApi.and(gt(intColumn("a"), 5: Integer), lt(intColumn("a"), 10: Integer)))) { + parquetFilters.createFilter( + parquetSchema, + sources.And( + sources.GreaterThan("a", 5), + sources.And( + sources.StringContains("b", "prefix"), + sources.LessThan("a", 10) + ))) + } + + // Testing + // case sources.Or(lhs, rhs) => + // ... + // lhsFilter <- createFilterHelper(nameToParquetField, lhs, canRemoveOneSideInAnd = false) + assertResult(None) { + parquetFilters.createFilter( + parquetSchema, + sources.Or( + sources.And( + sources.GreaterThan("a", 1), + sources.StringContains("b", "prefix")), + sources.GreaterThan("a", 2))) + } + + // Testing + // case sources.Or(lhs, rhs) => + // ... + // rhsFilter <- createFilterHelper(nameToParquetField, rhs, canRemoveOneSideInAnd = false) + assertResult(None) { + parquetFilters.createFilter( + parquetSchema, + sources.Or( + sources.GreaterThan("a", 2), + sources.And( + sources.GreaterThan("a", 1), + sources.StringContains("b", "prefix")))) + } + + // Testing + // case sources.Not(pred) => + // createFilterHelper(nameToParquetField, pred, canRemoveOneSideInAnd = false) + // .map(FilterApi.not) + // + // and + // + // Testing when `canRemoveOneSideInAnd == false` + // case sources.And(lhs, rhs) => + // ... + // case (Some(lhsFilter), None) if canRemoveOneSideInAnd => Some(lhsFilter) assertResult(None) { parquetFilters.createFilter( parquetSchema, @@ -786,6 +867,68 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex sources.GreaterThan("a", 1), sources.StringContains("b", "prefix")))) } + + // Testing + // case sources.Not(pred) => + // createFilterHelper(nameToParquetField, pred, canRemoveOneSideInAnd = false) + // .map(FilterApi.not) + // + // and + // + // Testing when `canRemoveOneSideInAnd == false` + // case sources.And(lhs, rhs) => + // ... 
+ // case (None, Some(rhsFilter)) if canRemoveOneSideInAnd => Some(rhsFilter) + assertResult(None) { + parquetFilters.createFilter( + parquetSchema, + sources.Not( + sources.And( + sources.StringContains("b", "prefix"), + sources.GreaterThan("a", 1)))) + } + + // Testing + // case sources.Not(pred) => + // createFilterHelper(nameToParquetField, pred, canRemoveOneSideInAnd = false) + // .map(FilterApi.not) + // + // and + // + // Testing passing `canRemoveOneSideInAnd = false` into + // case sources.And(lhs, rhs) => + // val lhsFilterOption = createFilterHelper(nameToParquetField, lhs, canRemoveOneSideInAnd) + assertResult(None) { + parquetFilters.createFilter( + parquetSchema, + sources.Not( + sources.And( + sources.And( + sources.GreaterThan("a", 1), + sources.StringContains("b", "prefix")), + sources.GreaterThan("a", 2)))) + } + + // Testing + // case sources.Not(pred) => + // createFilterHelper(nameToParquetField, pred, canRemoveOneSideInAnd = false) + // .map(FilterApi.not) + // + // and + // + // Testing passing `canRemoveOneSideInAnd = false` into + // case sources.And(lhs, rhs) => + // val rhsFilterOption = createFilterHelper(nameToParquetField, rhs, canRemoveOneSideInAnd) + assertResult(None) { + parquetFilters.createFilter( + parquetSchema, + sources.Not( + sources.And( + sources.GreaterThan("a", 2), + sources.And( + sources.GreaterThan("a", 1), + sources.StringContains("b", "prefix"))))) + } } test("SPARK-16371 Do not push down filters when inner name and outer name are the same") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruningSuite.scala index eb99654fa78f..434c4414edeb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruningSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruningSuite.scala @@ -35,22 +35,29 @@ class ParquetSchemaPruningSuite with SchemaPruningTest with SharedSQLContext { case class FullName(first: String, middle: String, last: String) + case class Company(name: String, address: String) + case class Employer(id: Int, company: Company) case class Contact( id: Int, name: FullName, address: String, pets: Int, friends: Array[FullName] = Array.empty, - relatives: Map[String, FullName] = Map.empty) + relatives: Map[String, FullName] = Map.empty, + employer: Employer = null) val janeDoe = FullName("Jane", "X.", "Doe") val johnDoe = FullName("John", "Y.", "Doe") val susanSmith = FullName("Susan", "Z.", "Smith") + val employer = Employer(0, Company("abc", "123 Business Street")) + val employerWithNullCompany = Employer(1, null) + private val contacts = Contact(0, janeDoe, "123 Main Street", 1, friends = Array(susanSmith), - relatives = Map("brother" -> johnDoe)) :: - Contact(1, johnDoe, "321 Wall Street", 3, relatives = Map("sister" -> janeDoe)) :: Nil + relatives = Map("brother" -> johnDoe), employer = employer) :: + Contact(1, johnDoe, "321 Wall Street", 3, relatives = Map("sister" -> janeDoe), + employer = employerWithNullCompany) :: Nil case class Name(first: String, last: String) case class BriefContact(id: Int, name: Name, address: String) @@ -66,13 +73,14 @@ class ParquetSchemaPruningSuite pets: Int, friends: Array[FullName] = Array(), relatives: Map[String, FullName] = Map(), + employer: Employer = null, p: Int) case class BriefContactWithDataPartitionColumn(id: Int, name: Name, address: 
String, p: Int) private val contactsWithDataPartitionColumn = - contacts.map { case Contact(id, name, address, pets, friends, relatives) => - ContactWithDataPartitionColumn(id, name, address, pets, friends, relatives, 1) } + contacts.map { case Contact(id, name, address, pets, friends, relatives, employer) => + ContactWithDataPartitionColumn(id, name, address, pets, friends, relatives, employer, 1) } private val briefContactsWithDataPartitionColumn = briefContacts.map { case BriefContact(id, name, address) => BriefContactWithDataPartitionColumn(id, name, address, 2) } @@ -155,21 +163,79 @@ class ParquetSchemaPruningSuite Row(null) :: Row(null) :: Nil) } + testSchemaPruning("select a single complex field and in where clause") { + val query1 = sql("select name.first from contacts where name.first = 'Jane'") + checkScan(query1, "struct>") + checkAnswer(query1, Row("Jane") :: Nil) + + val query2 = sql("select name.first, name.last from contacts where name.first = 'Jane'") + checkScan(query2, "struct>") + checkAnswer(query2, Row("Jane", "Doe") :: Nil) + + val query3 = sql("select name.first from contacts " + + "where employer.company.name = 'abc' and p = 1") + checkScan(query3, "struct," + + "employer:struct>>") + checkAnswer(query3, Row("Jane") :: Nil) + + val query4 = sql("select name.first, employer.company.name from contacts " + + "where employer.company is not null and p = 1") + checkScan(query4, "struct," + + "employer:struct>>") + checkAnswer(query4, Row("Jane", "abc") :: Nil) + } + + testSchemaPruning("select nullable complex field and having is not null predicate") { + val query = sql("select employer.company from contacts " + + "where employer is not null and p = 1") + checkScan(query, "struct>>") + checkAnswer(query, Row(Row("abc", "123 Business Street")) :: Row(null) :: Nil) + } + + testSchemaPruning("select a single complex field and is null expression in project") { + val query = sql("select name.first, address is not null from contacts") + checkScan(query, "struct,address:string>") + checkAnswer(query.orderBy("id"), + Row("Jane", true) :: Row("John", true) :: Row("Janet", true) :: Row("Jim", true) :: Nil) + } + + testSchemaPruning("select a single complex field array and in clause") { + val query = sql("select friends.middle from contacts where friends.first[0] = 'Susan'") + checkScan(query, + "struct>>") + checkAnswer(query.orderBy("id"), + Row(Array("Z.")) :: Nil) + } + + testSchemaPruning("select a single complex field from a map entry and in clause") { + val query = + sql("select relatives[\"brother\"].middle from contacts " + + "where relatives[\"brother\"].first = 'John'") + checkScan(query, + "struct>>") + checkAnswer(query.orderBy("id"), + Row("Y.") :: Nil) + } + private def testSchemaPruning(testName: String)(testThunk: => Unit) { - withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true") { - test(s"Spark vectorized reader - without partition data column - $testName") { + test(s"Spark vectorized reader - without partition data column - $testName") { + withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true") { withContacts(testThunk) } - test(s"Spark vectorized reader - with partition data column - $testName") { + } + test(s"Spark vectorized reader - with partition data column - $testName") { + withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true") { withContactsWithDataPartitionColumn(testThunk) } } - withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") { - test(s"Parquet-mr reader - without partition data column - 
$testName") { + test(s"Parquet-mr reader - without partition data column - $testName") { + withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") { withContacts(testThunk) } - test(s"Parquet-mr reader - with partition data column - $testName") { + } + test(s"Parquet-mr reader - with partition data column - $testName") { + withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") { withContactsWithDataPartitionColumn(testThunk) } } @@ -209,7 +275,7 @@ class ParquetSchemaPruningSuite MixedCase(1, "r1c1", MixedCaseColumn("123", 2)) :: Nil - testMixedCasePruning("select with exact column names") { + testExactCaseQueryPruning("select with exact column names") { val query = sql("select CoL1, coL2.B from mixedcase") checkScan(query, "struct>") checkAnswer(query.orderBy("id"), @@ -218,7 +284,7 @@ class ParquetSchemaPruningSuite Nil) } - testMixedCasePruning("select with lowercase column names") { + testMixedCaseQueryPruning("select with lowercase column names") { val query = sql("select col1, col2.b from mixedcase") checkScan(query, "struct>") checkAnswer(query.orderBy("id"), @@ -227,7 +293,7 @@ class ParquetSchemaPruningSuite Nil) } - testMixedCasePruning("select with different-case column names") { + testMixedCaseQueryPruning("select with different-case column names") { val query = sql("select cOL1, cOl2.b from mixedcase") checkScan(query, "struct>") checkAnswer(query.orderBy("id"), @@ -236,37 +302,43 @@ class ParquetSchemaPruningSuite Nil) } - testMixedCasePruning("filter with different-case column names") { + testMixedCaseQueryPruning("filter with different-case column names") { val query = sql("select id from mixedcase where Col2.b = 2") - // Pruning with filters is currently unsupported. As-is, the file reader will read the id column - // and the entire coL2 struct. Once pruning with filters has been implemented we can uncomment - // this line - // checkScan(query, "struct>") + checkScan(query, "struct>") checkAnswer(query.orderBy("id"), Row(1) :: Nil) } - private def testMixedCasePruning(testName: String)(testThunk: => Unit) { - withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true", - SQLConf.CASE_SENSITIVE.key -> "true") { - test(s"Spark vectorized reader - case-sensitive parser - mixed-case schema - $testName") { - withMixedCaseData(testThunk) + // Tests schema pruning for a query whose column and field names are exactly the same as the table + // schema's column and field names. N.B. 
this implies that `testThunk` should pass using either a + // case-sensitive or case-insensitive query parser + private def testExactCaseQueryPruning(testName: String)(testThunk: => Unit) { + test(s"Spark vectorized reader - case-sensitive parser - mixed-case schema - $testName") { + withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true", + SQLConf.CASE_SENSITIVE.key -> "true") { + withMixedCaseData(testThunk) } } - withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false", - SQLConf.CASE_SENSITIVE.key -> "false") { - test(s"Parquet-mr reader - case-insensitive parser - mixed-case schema - $testName") { + test(s"Parquet-mr reader - case-sensitive parser - mixed-case schema - $testName") { + withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false", + SQLConf.CASE_SENSITIVE.key -> "true") { withMixedCaseData(testThunk) } } - withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true", - SQLConf.CASE_SENSITIVE.key -> "false") { - test(s"Spark vectorized reader - case-insensitive parser - mixed-case schema - $testName") { - withMixedCaseData(testThunk) + testMixedCaseQueryPruning(testName)(testThunk) + } + + // Tests schema pruning for a query whose column and field names may differ in case from the table + // schema's column and field names + private def testMixedCaseQueryPruning(testName: String)(testThunk: => Unit) { + test(s"Spark vectorized reader - case-insensitive parser - mixed-case schema - $testName") { + withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true", + SQLConf.CASE_SENSITIVE.key -> "false") { + withMixedCaseData(testThunk) } } - withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false", - SQLConf.CASE_SENSITIVE.key -> "true") { - test(s"Parquet-mr reader - case-sensitive parser - mixed-case schema - $testName") { + test(s"Parquet-mr reader - case-insensitive parser - mixed-case schema - $testName") { + withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false", + SQLConf.CASE_SENSITIVE.key -> "false") { withMixedCaseData(testThunk) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala index 7eefedb8ff5b..528a4d0ca800 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala @@ -427,7 +427,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { assert(errMsg.startsWith("Parquet column cannot be converted in file")) val file = errMsg.substring("Parquet column cannot be converted in file ".length, errMsg.indexOf(". 
")) - val col = spark.read.parquet(file).schema.fields.filter(_.name.equals("a")) + val col = spark.read.parquet(file).schema.fields.filter(_.name == "a") assert(col.length == 1) if (col(0).dataType == StringType) { assert(errMsg.contains("Column: [a], Expected: int, Found: BINARY")) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala index bcdee792f4c7..b4ad1db20a9e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala @@ -54,8 +54,12 @@ class BroadcastJoinSuite extends QueryTest with SQLTestUtils { } override def afterAll(): Unit = { - spark.stop() - spark = null + try { + spark.stop() + spark = null + } finally { + super.afterAll() + } } /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala index a3a3f3851e21..81db3e137964 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala @@ -19,12 +19,15 @@ package org.apache.spark.sql.execution.metric import java.io.File +import scala.reflect.{classTag, ClassTag} import scala.util.Random import org.apache.spark.SparkFunSuite import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.expressions.aggregate.{Final, Partial} import org.apache.spark.sql.catalyst.plans.logical.LocalRelation -import org.apache.spark.sql.execution.ui.SQLAppStatusStore +import org.apache.spark.sql.execution.{FilterExec, RangeExec, SparkPlan, WholeStageCodegenExec} +import org.apache.spark.sql.execution.aggregate.HashAggregateExec import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext @@ -497,6 +500,19 @@ class SQLMetricsSuite extends SparkFunSuite with SQLMetricsTestUtils with Shared } } + test("SPARK-25278: output metrics are wrong for plans repeated in the query") { + val name = "demo_view" + withView(name) { + sql(s"CREATE OR REPLACE VIEW $name AS VALUES 1,2") + val view = spark.table(name) + val union = view.union(view) + testSparkPlanMetrics(union, 1, Map( + 0L -> ("Union" -> Map()), + 1L -> ("LocalTableScan" -> Map("number of output rows" -> 2L)), + 2L -> ("LocalTableScan" -> Map("number of output rows" -> 2L)))) + } + } + test("writing data out metrics: parquet") { testMetricsNonDynamicPartition("parquet", "t1") } @@ -504,4 +520,81 @@ class SQLMetricsSuite extends SparkFunSuite with SQLMetricsTestUtils with Shared test("writing data out metrics with dynamic partition: parquet") { testMetricsDynamicPartition("parquet", "parquet", "t1") } + + private def collectNodeWithinWholeStage[T <: SparkPlan : ClassTag](plan: SparkPlan): Seq[T] = { + val stages = plan.collect { + case w: WholeStageCodegenExec => w + } + assert(stages.length == 1, "The query plan should have one and only one whole-stage.") + + val cls = classTag[T].runtimeClass + stages.head.collect { + case n if n.getClass == cls => n.asInstanceOf[T] + } + } + + test("SPARK-25602: SparkPlan.getByteArrayRdd should not consume the input when not necessary") { + def checkFilterAndRangeMetrics( + df: DataFrame, + filterNumOutputs: Int, + rangeNumOutputs: Int): Unit = { + val plan = 
df.queryExecution.executedPlan + + val filters = collectNodeWithinWholeStage[FilterExec](plan) + assert(filters.length == 1, "The query plan should have one and only one Filter") + assert(filters.head.metrics("numOutputRows").value == filterNumOutputs) + + val ranges = collectNodeWithinWholeStage[RangeExec](plan) + assert(ranges.length == 1, "The query plan should have one and only one Range") + assert(ranges.head.metrics("numOutputRows").value == rangeNumOutputs) + } + + withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "true") { + val df = spark.range(0, 3000, 1, 2).toDF().filter('id % 3 === 0) + df.collect() + checkFilterAndRangeMetrics(df, filterNumOutputs = 1000, rangeNumOutputs = 3000) + + df.queryExecution.executedPlan.foreach(_.resetMetrics()) + // For each partition, we get 2 rows. Then the Filter should produce 2 rows per-partition, + // and Range should produce 1000 rows (one batch) per-partition. Totally Filter produces + // 4 rows, and Range produces 2000 rows. + df.queryExecution.toRdd.mapPartitions(_.take(2)).collect() + checkFilterAndRangeMetrics(df, filterNumOutputs = 4, rangeNumOutputs = 2000) + + // Top-most limit will call `CollectLimitExec.executeCollect`, which will only run the first + // task, so totally the Filter produces 2 rows, and Range produces 1000 rows (one batch). + val df2 = df.limit(2) + df2.collect() + checkFilterAndRangeMetrics(df2, filterNumOutputs = 2, rangeNumOutputs = 1000) + } + } + + test("SPARK-25497: LIMIT within whole stage codegen should not consume all the inputs") { + withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "true") { + // A special query that only has one partition, so there is no shuffle and the entire query + // can be whole-stage-codegened. + val df = spark.range(0, 1500, 1, 1).limit(10).groupBy('id).count().limit(1).filter('id >= 0) + df.collect() + val plan = df.queryExecution.executedPlan + + val ranges = collectNodeWithinWholeStage[RangeExec](plan) + assert(ranges.length == 1, "The query plan should have one and only one Range") + // The Range should only produce the first batch, i.e. 1000 rows. + assert(ranges.head.metrics("numOutputRows").value == 1000) + + val aggs = collectNodeWithinWholeStage[HashAggregateExec](plan) + assert(aggs.length == 2, "The query plan should have two and only two Aggregate") + val partialAgg = aggs.filter(_.aggregateExpressions.head.mode == Partial).head + // The partial aggregate should output 10 rows, because its input is 10 rows. + assert(partialAgg.metrics("numOutputRows").value == 10) + val finalAgg = aggs.filter(_.aggregateExpressions.head.mode == Final).head + // The final aggregate should only produce 1 row, because the upstream limit only needs 1 row. + assert(finalAgg.metrics("numOutputRows").value == 1) + + val filters = collectNodeWithinWholeStage[FilterExec](plan) + assert(filters.length == 1, "The query plan should have one and only one Filter") + // The final Filter should produce 1 rows, because the input is just one row. 
+ assert(filters.head.metrics("numOutputRows").value == 1) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExecSuite.scala index 2cc55ff88b98..289cc667a1c6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExecSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExecSuite.scala @@ -37,8 +37,11 @@ class BatchEvalPythonExecSuite extends SparkPlanTest with SharedSQLContext { } override def afterAll(): Unit = { - spark.sessionState.functionRegistry.dropFunction(FunctionIdentifier("dummyPythonUDF")) - super.afterAll() + try { + spark.sessionState.functionRegistry.dropFunction(FunctionIdentifier("dummyPythonUDF")) + } finally { + super.afterAll() + } } test("Python UDF: push down deterministic FilterExec predicates") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonForeachWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonForeachWriterSuite.scala index 07e603477012..d02014c0dee5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonForeachWriterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonForeachWriterSuite.scala @@ -19,17 +19,20 @@ package org.apache.spark.sql.execution.python import scala.collection.mutable.ArrayBuffer +import org.mockito.Mockito.when import org.scalatest.concurrent.Eventually +import org.scalatest.mockito.MockitoSugar import org.scalatest.time.SpanSugar._ import org.apache.spark._ import org.apache.spark.memory.{TaskMemoryManager, TestMemoryManager} +import org.apache.spark.serializer.{JavaSerializer, SerializerManager} import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection} import org.apache.spark.sql.execution.python.PythonForeachWriter.UnsafeRowBuffer import org.apache.spark.sql.types.{DataType, IntegerType} import org.apache.spark.util.Utils -class PythonForeachWriterSuite extends SparkFunSuite with Eventually { +class PythonForeachWriterSuite extends SparkFunSuite with Eventually with MockitoSugar { testWithBuffer("UnsafeRowBuffer: iterator blocks when no data is available") { b => b.assertIteratorBlocked() @@ -75,7 +78,7 @@ class PythonForeachWriterSuite extends SparkFunSuite with Eventually { tester = new BufferTester(memBytes, sleepPerRowReadMs) f(tester) } finally { - if (tester == null) tester.close() + if (tester != null) tester.close() } } } @@ -83,7 +86,12 @@ class PythonForeachWriterSuite extends SparkFunSuite with Eventually { class BufferTester(memBytes: Long, sleepPerRowReadMs: Int) { private val buffer = { - val mem = new TestMemoryManager(new SparkConf()) + val mockEnv = mock[SparkEnv] + val conf = new SparkConf() + val serializerManager = new SerializerManager(new JavaSerializer(conf), conf, None) + when(mockEnv.serializerManager).thenReturn(serializerManager) + SparkEnv.set(mockEnv) + val mem = new TestMemoryManager(conf) mem.limit(memBytes) val taskM = new TaskMemoryManager(mem, 0) new UnsafeRowBuffer(taskM, Utils.createTempDir(), 1) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/RowQueueSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/RowQueueSuite.scala index 25ee95daa034..1ec998632842 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/RowQueueSuite.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/RowQueueSuite.scala @@ -20,15 +20,18 @@ package org.apache.spark.sql.execution.python import java.io.File import org.apache.spark.{SparkConf, SparkFunSuite} -import org.apache.spark.memory.{MemoryManager, TaskMemoryManager, TestMemoryManager} +import org.apache.spark.internal.config._ +import org.apache.spark.memory.{TaskMemoryManager, TestMemoryManager} +import org.apache.spark.security.{CryptoStreamUtils, EncryptionFunSuite} +import org.apache.spark.serializer.{JavaSerializer, SerializerManager} import org.apache.spark.sql.catalyst.expressions.UnsafeRow -import org.apache.spark.unsafe.memory.OnHeapMemoryBlock +import org.apache.spark.unsafe.memory.MemoryBlock import org.apache.spark.util.Utils -class RowQueueSuite extends SparkFunSuite { +class RowQueueSuite extends SparkFunSuite with EncryptionFunSuite { test("in-memory queue") { - val page = new OnHeapMemoryBlock((1<<10) * 8L) + val page = MemoryBlock.fromLongArray(new Array[Long](1<<10)) val queue = new InMemoryRowQueue(page, 1) { override def close() {} } @@ -53,10 +56,20 @@ class RowQueueSuite extends SparkFunSuite { queue.close() } - test("disk queue") { + private def createSerializerManager(conf: SparkConf): SerializerManager = { + val ioEncryptionKey = if (conf.get(IO_ENCRYPTION_ENABLED)) { + Some(CryptoStreamUtils.createKey(conf)) + } else { + None + } + new SerializerManager(new JavaSerializer(conf), conf, ioEncryptionKey) + } + + encryptionTest("disk queue") { conf => + val serManager = createSerializerManager(conf) val dir = Utils.createTempDir().getCanonicalFile dir.mkdirs() - val queue = DiskRowQueue(new File(dir, "buffer"), 1) + val queue = DiskRowQueue(new File(dir, "buffer"), 1, serManager) val row = new UnsafeRow(1) row.pointTo(new Array[Byte](16), 16) val n = 1000 @@ -81,11 +94,12 @@ class RowQueueSuite extends SparkFunSuite { queue.close() } - test("hybrid queue") { - val mem = new TestMemoryManager(new SparkConf()) + encryptionTest("hybrid queue") { conf => + val serManager = createSerializerManager(conf) + val mem = new TestMemoryManager(conf) mem.limit(4<<10) val taskM = new TaskMemoryManager(mem, 0) - val queue = HybridRowQueue(taskM, Utils.createTempDir().getCanonicalFile, 1) + val queue = HybridRowQueue(taskM, Utils.createTempDir().getCanonicalFile, 1, serManager) val row = new UnsafeRow(1) row.pointTo(new Array[Byte](16), 16) val n = (4<<10) / 16 * 3 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/CheckpointFileManagerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/CheckpointFileManagerSuite.scala index fe59cb25d500..cbac1c13cdd3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/CheckpointFileManagerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/CheckpointFileManagerSuite.scala @@ -25,12 +25,12 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs._ import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.plans.SQLHelper import org.apache.spark.sql.catalyst.util.quietly import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession -import org.apache.spark.util.Utils -abstract class CheckpointFileManagerTests extends SparkFunSuite { +abstract class CheckpointFileManagerTests extends SparkFunSuite with SQLHelper { def createManager(path: Path): CheckpointFileManager @@ -88,12 +88,6 @@ abstract class CheckpointFileManagerTests extends SparkFunSuite { 
fm.delete(path) // should not throw exception } } - - protected def withTempPath(f: File => Unit): Unit = { - val path = Utils.createTempDir() - path.delete() - try f(path) finally Utils.deleteRecursively(path) - } } class CheckpointFileManagerSuite extends SparkFunSuite with SharedSparkSession { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/sources/ForeachBatchSinkSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/sources/ForeachBatchSinkSuite.scala index 71dff443e883..3e9ccb0f705d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/sources/ForeachBatchSinkSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/sources/ForeachBatchSinkSuite.scala @@ -99,11 +99,12 @@ class ForeachBatchSinkSuite extends StreamTest { } assert(ex1.getMessage.contains("foreachBatch function cannot be null")) val ex2 = intercept[AnalysisException] { - ds.writeStream.foreachBatch((_, _) => {}).trigger(Trigger.Continuous("1 second")).start() + ds.writeStream.foreachBatch((_: Dataset[Int], _: Long) => {}) + .trigger(Trigger.Continuous("1 second")).start() } assert(ex2.getMessage.contains("'foreachBatch' is not supported with continuous trigger")) val ex3 = intercept[AnalysisException] { - ds.writeStream.foreachBatch((_, _) => {}).partitionBy("value").start() + ds.writeStream.foreachBatch((_: Dataset[Int], _: Long) => {}).partitionBy("value").start() } assert(ex3.getMessage.contains("'foreachBatch' does not support partitioning")) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreRDDSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreRDDSuite.scala index 579a364ebc3e..015415a534ff 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreRDDSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreRDDSuite.scala @@ -49,8 +49,11 @@ class StateStoreRDDSuite extends SparkFunSuite with BeforeAndAfter with BeforeAn } override def afterAll(): Unit = { - super.afterAll() - Utils.deleteRecursively(new File(tempDir)) + try { + super.afterAll() + } finally { + Utils.deleteRecursively(new File(tempDir)) + } } test("versioning and immutability") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchBenchmark.scala index 8aeb06d42895..f311465e582a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchBenchmark.scala @@ -21,17 +21,24 @@ import java.nio.charset.StandardCharsets import scala.util.Random +import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} import org.apache.spark.memory.MemoryMode import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.types.{ArrayType, BinaryType, IntegerType} import org.apache.spark.unsafe.Platform -import org.apache.spark.util.Benchmark import org.apache.spark.util.collection.BitSet /** * Benchmark to low level memory access using different ways to manage buffers. + * To run this benchmark: + * {{{ + * 1. without sbt: bin/spark-submit --class + * 2. build/sbt "sql/test:runMain " + * 3. 
generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain " + * Results will be written to "benchmarks/ColumnarBatchBenchmark-results.txt". + * }}} */ -object ColumnarBatchBenchmark { +object ColumnarBatchBenchmark extends BenchmarkBase { // This benchmark reads and writes an array of ints. // TODO: there is a big (2x) penalty for a random access API for off heap. // Note: carefully if modifying this code. It's hard to reason about the JIT. @@ -260,25 +267,7 @@ object ColumnarBatchBenchmark { col.close } - /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1 - Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz - - Int Read/Write: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Java Array 177 / 183 1851.1 0.5 1.0X - ByteBuffer Unsafe 314 / 330 1043.7 1.0 0.6X - ByteBuffer API 1298 / 1307 252.4 4.0 0.1X - DirectByteBuffer 465 / 483 704.2 1.4 0.4X - Unsafe Buffer 179 / 183 1835.5 0.5 1.0X - Column(on heap) 181 / 186 1815.2 0.6 1.0X - Column(off heap) 344 / 349 951.7 1.1 0.5X - Column(off heap direct) 178 / 186 1838.6 0.5 1.0X - UnsafeRow (on heap) 388 / 394 844.8 1.2 0.5X - UnsafeRow (off heap) 400 / 403 819.4 1.2 0.4X - Column On Heap Append 315 / 325 1041.8 1.0 0.6X - */ - val benchmark = new Benchmark("Int Read/Write", count * iters) + val benchmark = new Benchmark("Int Read/Write", count * iters, output = output) benchmark.addCase("Java Array")(javaArray) benchmark.addCase("ByteBuffer Unsafe")(byteBufferUnsafe) benchmark.addCase("ByteBuffer API")(byteBufferApi) @@ -295,7 +284,7 @@ object ColumnarBatchBenchmark { def booleanAccess(iters: Int): Unit = { val count = 8 * 1024 - val benchmark = new Benchmark("Boolean Read/Write", iters * count.toLong) + val benchmark = new Benchmark("Boolean Read/Write", iters * count.toLong, output = output) benchmark.addCase("Bitset") { i: Int => { val b = new BitSet(count) var sum = 0L @@ -329,15 +318,6 @@ object ColumnarBatchBenchmark { } } }} - /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1 - Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz - - Boolean Read/Write: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Bitset 741 / 747 452.6 2.2 1.0X - Byte Array 531 / 542 631.6 1.6 1.4X - */ benchmark.run() } @@ -386,16 +366,7 @@ object ColumnarBatchBenchmark { } } - /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1 - Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz - - String Read/Write: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - On Heap 351 / 362 46.6 21.4 1.0X - Off Heap 456 / 466 35.9 27.8 0.8X - */ - val benchmark = new Benchmark("String Read/Write", count * iters) + val benchmark = new Benchmark("String Read/Write", count * iters, output = output) benchmark.addCase("On Heap")(column(MemoryMode.ON_HEAP)) benchmark.addCase("Off Heap")(column(MemoryMode.OFF_HEAP)) benchmark.run @@ -463,30 +434,27 @@ object ColumnarBatchBenchmark { } } - val benchmark = new Benchmark("Array Vector Read", count * iters) + val benchmark = new Benchmark("Array Vector Read", count * iters, output = output) benchmark.addCase("On Heap Read Size Only") { _ => readArrays(true) } benchmark.addCase("Off Heap Read Size Only") { _ => readArrays(false) } benchmark.addCase("On Heap Read Elements") { _ => 
readArrayElements(true) } benchmark.addCase("Off Heap Read Elements") { _ => readArrayElements(false) } - /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1 - Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz - - Array Vector Read: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - On Heap Read Size Only 426 / 437 384.9 2.6 1.0X - Off Heap Read Size Only 406 / 421 404.0 2.5 1.0X - On Heap Read Elements 2636 / 2642 62.2 16.1 0.2X - Off Heap Read Elements 3770 / 3774 43.5 23.0 0.1X - */ benchmark.run } - def main(args: Array[String]): Unit = { - intAccess(1024 * 40) - booleanAccess(1024 * 40) - stringAccess(1024 * 4) - arrayAccess(1024 * 40) + override def runBenchmarkSuite(): Unit = { + runBenchmark("Int Read/Write") { + intAccess(1024 * 40) + } + runBenchmark("Boolean Read/Write") { + booleanAccess(1024 * 40) + } + runBenchmark("String Read/Write") { + stringAccess(1024 * 4) + } + runBenchmark("Array Vector Read") { + arrayAccess(1024 * 40) + } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/ExecutorSideSQLConfSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/ExecutorSideSQLConfSuite.scala index 5b4736ef4f7f..d885348f3774 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/internal/ExecutorSideSQLConfSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/ExecutorSideSQLConfSuite.scala @@ -38,8 +38,12 @@ class ExecutorSideSQLConfSuite extends SparkFunSuite with SQLTestUtils { } override def afterAll(): Unit = { - spark.stop() - spark = null + try { + spark.stop() + spark = null + } finally { + super.afterAll() + } } override def withSQLConf(pairs: (String, String)*)(f: => Unit): Unit = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala index a9414200e70f..a2bc651bb2bd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala @@ -20,6 +20,8 @@ package org.apache.spark.sql.sources import java.io.File import java.net.URI +import scala.util.Random + import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.catalog.BucketSpec import org.apache.spark.sql.catalyst.expressions @@ -47,11 +49,13 @@ class BucketedReadWithoutHiveSupportSuite extends BucketedReadSuite with SharedS abstract class BucketedReadSuite extends QueryTest with SQLTestUtils { import testImplicits._ - private lazy val df = (0 until 50).map(i => (i % 5, i % 13, i.toString)).toDF("i", "j", "k") + private val maxI = 5 + private val maxJ = 13 + private lazy val df = (0 until 50).map(i => (i % maxI, i % maxJ, i.toString)).toDF("i", "j", "k") private lazy val nullDF = (for { i <- 0 to 50 s <- Seq(null, "a", "b", "c", "d", "e", "f", null, "g") - } yield (i % 5, s, i % 13)).toDF("i", "j", "k") + } yield (i % maxI, s, i % maxJ)).toDF("i", "j", "k") // number of buckets that doesn't yield empty buckets when bucketing on column j on df/nullDF // empty buckets before filtering might hide bugs in pruning logic @@ -66,23 +70,22 @@ abstract class BucketedReadSuite extends QueryTest with SQLTestUtils { .bucketBy(8, "j", "k") .saveAsTable("bucketed_table") - for (i <- 0 until 5) { - val table = spark.table("bucketed_table").filter($"i" === i) - val query = table.queryExecution - val output = query.analyzed.output - val rdd = 
query.toRdd - - assert(rdd.partitions.length == 8) - - val attrs = table.select("j", "k").queryExecution.analyzed.output - val checkBucketId = rdd.mapPartitionsWithIndex((index, rows) => { - val getBucketId = UnsafeProjection.create( - HashPartitioning(attrs, 8).partitionIdExpression :: Nil, - output) - rows.map(row => getBucketId(row).getInt(0) -> index) - }) - checkBucketId.collect().foreach(r => assert(r._1 == r._2)) - } + val bucketValue = Random.nextInt(maxI) + val table = spark.table("bucketed_table").filter($"i" === bucketValue) + val query = table.queryExecution + val output = query.analyzed.output + val rdd = query.toRdd + + assert(rdd.partitions.length == 8) + + val attrs = table.select("j", "k").queryExecution.analyzed.output + val checkBucketId = rdd.mapPartitionsWithIndex((index, rows) => { + val getBucketId = UnsafeProjection.create( + HashPartitioning(attrs, 8).partitionIdExpression :: Nil, + output) + rows.map(row => getBucketId(row).getInt(0) -> index) + }) + checkBucketId.collect().foreach(r => assert(r._1 == r._2)) } } @@ -145,36 +148,36 @@ abstract class BucketedReadSuite extends QueryTest with SQLTestUtils { .bucketBy(numBuckets, "j") .saveAsTable("bucketed_table") - for (j <- 0 until 13) { - // Case 1: EqualTo - checkPrunedAnswers( - bucketSpec, - bucketValues = j :: Nil, - filterCondition = $"j" === j, - df) - - // Case 2: EqualNullSafe - checkPrunedAnswers( - bucketSpec, - bucketValues = j :: Nil, - filterCondition = $"j" <=> j, - df) - - // Case 3: In - checkPrunedAnswers( - bucketSpec, - bucketValues = Seq(j, j + 1, j + 2, j + 3), - filterCondition = $"j".isin(j, j + 1, j + 2, j + 3), - df) - - // Case 4: InSet - val inSetExpr = expressions.InSet($"j".expr, Set(j, j + 1, j + 2, j + 3).map(lit(_).expr)) - checkPrunedAnswers( - bucketSpec, - bucketValues = Seq(j, j + 1, j + 2, j + 3), - filterCondition = Column(inSetExpr), - df) - } + val bucketValue = Random.nextInt(maxJ) + // Case 1: EqualTo + checkPrunedAnswers( + bucketSpec, + bucketValues = bucketValue :: Nil, + filterCondition = $"j" === bucketValue, + df) + + // Case 2: EqualNullSafe + checkPrunedAnswers( + bucketSpec, + bucketValues = bucketValue :: Nil, + filterCondition = $"j" <=> bucketValue, + df) + + // Case 3: In + checkPrunedAnswers( + bucketSpec, + bucketValues = Seq(bucketValue, bucketValue + 1, bucketValue + 2, bucketValue + 3), + filterCondition = $"j".isin(bucketValue, bucketValue + 1, bucketValue + 2, bucketValue + 3), + df) + + // Case 4: InSet + val inSetExpr = expressions.InSet($"j".expr, + Set(bucketValue, bucketValue + 1, bucketValue + 2, bucketValue + 3).map(lit(_).expr)) + checkPrunedAnswers( + bucketSpec, + bucketValues = Seq(bucketValue, bucketValue + 1, bucketValue + 2, bucketValue + 3), + filterCondition = Column(inSetExpr), + df) } } @@ -188,13 +191,12 @@ abstract class BucketedReadSuite extends QueryTest with SQLTestUtils { .bucketBy(numBuckets, "j") .saveAsTable("bucketed_table") - for (j <- 0 until 13) { - checkPrunedAnswers( - bucketSpec, - bucketValues = j :: Nil, - filterCondition = $"j" === j, - df) - } + val bucketValue = Random.nextInt(maxJ) + checkPrunedAnswers( + bucketSpec, + bucketValues = bucketValue :: Nil, + filterCondition = $"j" === bucketValue, + df) } } @@ -236,40 +238,39 @@ abstract class BucketedReadSuite extends QueryTest with SQLTestUtils { .bucketBy(numBuckets, "j") .saveAsTable("bucketed_table") - for (j <- 0 until 13) { - checkPrunedAnswers( - bucketSpec, - bucketValues = j :: Nil, - filterCondition = $"j" === j && $"k" > $"j", - df) - - checkPrunedAnswers( 
- bucketSpec, - bucketValues = j :: Nil, - filterCondition = $"j" === j && $"i" > j % 5, - df) - - // check multiple bucket values OR condition - checkPrunedAnswers( - bucketSpec, - bucketValues = Seq(j, j + 1), - filterCondition = $"j" === j || $"j" === (j + 1), - df) - - // check bucket value and none bucket value OR condition - checkPrunedAnswers( - bucketSpec, - bucketValues = Nil, - filterCondition = $"j" === j || $"i" === 0, - df) - - // check AND condition in complex expression - checkPrunedAnswers( - bucketSpec, - bucketValues = Seq(j), - filterCondition = ($"i" === 0 || $"k" > $"j") && $"j" === j, - df) - } + val bucketValue = Random.nextInt(maxJ) + checkPrunedAnswers( + bucketSpec, + bucketValues = bucketValue :: Nil, + filterCondition = $"j" === bucketValue && $"k" > $"j", + df) + + checkPrunedAnswers( + bucketSpec, + bucketValues = bucketValue :: Nil, + filterCondition = $"j" === bucketValue && $"i" > bucketValue % 5, + df) + + // check multiple bucket values OR condition + checkPrunedAnswers( + bucketSpec, + bucketValues = Seq(bucketValue, bucketValue + 1), + filterCondition = $"j" === bucketValue || $"j" === (bucketValue + 1), + df) + + // check bucket value and none bucket value OR condition + checkPrunedAnswers( + bucketSpec, + bucketValues = Nil, + filterCondition = $"j" === bucketValue || $"i" === 0, + df) + + // check AND condition in complex expression + checkPrunedAnswers( + bucketSpec, + bucketValues = Seq(bucketValue), + filterCondition = ($"i" === 0 || $"k" > $"j") && $"j" === bucketValue, + df) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala index f6c3e0ce82e3..ed9493debd76 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.sources.v2 +import java.io.File + import test.org.apache.spark.sql.sources.v2._ import org.apache.spark.SparkException @@ -317,6 +319,54 @@ class DataSourceV2Suite extends QueryTest with SharedSQLContext { checkCanonicalizedOutput(df, 2, 2) checkCanonicalizedOutput(df.select('i), 2, 1) } + + test("SPARK-25425: extra options should override sessions options during reading") { + val prefix = "spark.datasource.userDefinedDataSource." 
+ val optionName = "optionA" + withSQLConf(prefix + optionName -> "true") { + val df = spark + .read + .option(optionName, false) + .format(classOf[DataSourceV2WithSessionConfig].getName).load() + val options = df.queryExecution.optimizedPlan.collectFirst { + case d: DataSourceV2Relation => d.options + } + assert(options.get.get(optionName) == Some("false")) + } + } + + test("SPARK-25425: extra options should override sessions options during writing") { + withTempPath { path => + val sessionPath = path.getCanonicalPath + withSQLConf("spark.datasource.simpleWritableDataSource.path" -> sessionPath) { + withTempPath { file => + val optionPath = file.getCanonicalPath + val format = classOf[SimpleWritableDataSource].getName + + val df = Seq((1L, 2L)).toDF("i", "j") + df.write.format(format).option("path", optionPath).save() + assert(!new File(sessionPath).exists) + checkAnswer(spark.read.format(format).option("path", optionPath).load(), df) + } + } + } + } + + test("SPARK-25700: do not read schema when writing in other modes") { + withTempPath { file => + val cls = classOf[SimpleWriteOnlyDataSource] + val path = file.getCanonicalPath + val df = spark.range(5).select('id as 'i, -'id as 'j) + try { + df.write.format(cls.getName).option("path", path).mode("error").save() + df.write.format(cls.getName).option("path", path).mode("overwrite").save() + df.write.format(cls.getName).option("path", path).mode("ignore").save() + df.write.format(cls.getName).option("path", path).mode("append").save() + } catch { + case e: SchemaReadAttemptException => fail("Schema read was attempted.", e) + } + } + } } @@ -385,7 +435,6 @@ class SimpleDataSourceV2 extends DataSourceV2 with BatchReadSupportProvider { } } - class AdvancedDataSourceV2 extends DataSourceV2 with BatchReadSupportProvider { class ReadSupport extends SimpleReadSupport { @@ -607,3 +656,14 @@ object SpecificReaderFactory extends PartitionReaderFactory { } } } + +class SchemaReadAttemptException(m: String) extends RuntimeException(m) + +class SimpleWriteOnlyDataSource extends SimpleWritableDataSource { + override def fullSchema(): StructType = { + // This is a bit hacky since this source implements read support but throws + // during schema retrieval. It might need a rewrite, but it's done + // this way to minimise changes. + throw new SchemaReadAttemptException("read is not supported") + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/SimpleWritableDataSource.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/SimpleWritableDataSource.scala index 952241b0b6be..a7dfc2d1deac 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/SimpleWritableDataSource.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/SimpleWritableDataSource.scala @@ -39,13 +39,17 @@ import org.apache.spark.util.SerializableConfiguration * Each job moves files from `target/_temporary/queryId/` to `target`.
*/ class SimpleWritableDataSource extends DataSourceV2 - with BatchReadSupportProvider with BatchWriteSupportProvider { + with BatchReadSupportProvider + with BatchWriteSupportProvider + with SessionConfigSupport { - private val schema = new StructType().add("i", "long").add("j", "long") + protected def fullSchema(): StructType = new StructType().add("i", "long").add("j", "long") + + override def keyPrefix: String = "simpleWritableDataSource" class ReadSupport(path: String, conf: Configuration) extends SimpleReadSupport { - override def fullSchema(): StructType = schema + override def fullSchema(): StructType = SimpleWritableDataSource.this.fullSchema() override def planInputPartitions(config: ScanConfig): Array[InputPartition] = { val dataPath = new Path(path) @@ -112,7 +116,6 @@ class SimpleWritableDataSource extends DataSourceV2 schema: StructType, mode: SaveMode, options: DataSourceOptions): Optional[BatchWriteSupport] = { - assert(DataType.equalsStructurally(schema.asNullable, this.schema.asNullable)) assert(!SparkContext.getActive.get.conf.getBoolean("spark.speculation", false)) val path = new Path(options.get("path").get()) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/EventTimeWatermarkSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/EventTimeWatermarkSuite.scala index 026af17c7b23..c696204cecc2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/EventTimeWatermarkSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/EventTimeWatermarkSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.streaming import java.{util => ju} import java.io.File import java.text.SimpleDateFormat -import java.util.{Calendar, Date} +import java.util.{Calendar, Date, Locale} import org.apache.commons.io.FileUtils import org.scalatest.{BeforeAndAfter, Matchers} @@ -698,7 +698,7 @@ class EventTimeWatermarkSuite extends StreamTest with BeforeAndAfter with Matche val e = intercept[IllegalArgumentException] { spark.conf.set(SQLConf.STREAMING_MULTIPLE_WATERMARK_POLICY.key, value) } - assert(e.getMessage.toLowerCase.contains("valid values are 'min' and 'max'")) + assert(e.getMessage.toLowerCase(Locale.ROOT).contains("valid values are 'min' and 'max'")) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala index e77ba1ec9f1e..43463a84093c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala @@ -45,19 +45,13 @@ case class RunningCount(count: Long) case class Result(key: Long, count: Int) -class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest - with BeforeAndAfterAll { +class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest { import testImplicits._ import GroupStateImpl._ import GroupStateTimeout._ import FlatMapGroupsWithStateSuite._ - override def afterAll(): Unit = { - super.afterAll() - StateStore.stop() - } - test("GroupState - get, exists, update, remove") { var state: GroupStateImpl[String] = null diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala index bf509b1976ed..f55ddb5419d2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala @@ -29,13 +29,14 @@ import org.apache.commons.io.FileUtils import org.apache.hadoop.conf.Configuration import org.scalatest.time.SpanSugar._ -import org.apache.spark.{SparkConf, SparkContext} +import org.apache.spark.{SparkConf, SparkContext, TaskContext} import org.apache.spark.scheduler.{SparkListener, SparkListenerJobStart} import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.plans.logical.Range import org.apache.spark.sql.catalyst.streaming.InternalOutputModes import org.apache.spark.sql.execution.command.ExplainCommand import org.apache.spark.sql.execution.streaming._ +import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution import org.apache.spark.sql.execution.streaming.sources.ContinuousMemoryStream import org.apache.spark.sql.execution.streaming.state.{StateStore, StateStoreConf, StateStoreId, StateStoreProvider} import org.apache.spark.sql.functions._ @@ -788,7 +789,7 @@ class StreamSuite extends StreamTest { val query = input .toDS() .map { i => - while (!org.apache.spark.TaskContext.get().isInterrupted()) { + while (!TaskContext.get().isInterrupted()) { // keep looping till interrupted by query.stop() Thread.sleep(100) } @@ -1029,6 +1030,34 @@ class StreamSuite extends StreamTest { false)) } + test("is_continuous_processing property should be false for microbatch processing") { + val input = MemoryStream[Int] + val df = input.toDS() + .map(i => TaskContext.get().getLocalProperty(StreamExecution.IS_CONTINUOUS_PROCESSING)) + testStream(df) ( + AddData(input, 1), + CheckAnswer("false") + ) + } + + test("is_continuous_processing property should be true for continuous processing") { + val input = ContinuousMemoryStream[Int] + val stream = input.toDS() + .map(i => TaskContext.get().getLocalProperty(StreamExecution.IS_CONTINUOUS_PROCESSING)) + .writeStream.format("memory") + .queryName("output") + .trigger(Trigger.Continuous("1 seconds")) + .start() + try { + input.addData(1) + stream.processAllAvailable() + } finally { + stream.stop() + } + + checkAnswer(spark.sql("select * from output"), Row("true")) + } + for (e <- Seq( new InterruptedException, new InterruptedIOException, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala index 491dc34afa14..d878c345c298 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala @@ -79,8 +79,11 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be implicit val defaultSignaler: Signaler = ThreadSignaler override def afterAll(): Unit = { - super.afterAll() - StateStore.stop() // stop the state store maintenance thread and unload store providers + try { + super.afterAll() + } finally { + StateStore.stop() // stop the state store maintenance thread and unload store providers + } } protected val defaultTrigger = Trigger.ProcessingTime(0) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala index 1ae6ff3a9098..97dbb9b0360e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala @@ -46,13 +46,7 @@ object FailureSingleton { var firstTime = 
true } -class StreamingAggregationSuite extends StateStoreMetricsTest - with BeforeAndAfterAll with Assertions { - - override def afterAll(): Unit = { - super.afterAll() - StateStore.stop() - } +class StreamingAggregationSuite extends StateStoreMetricsTest with Assertions { import testImplicits._ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingDeduplicationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingDeduplicationSuite.scala index 42ffd472eb84..cfd7204ea293 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingDeduplicationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingDeduplicationSuite.scala @@ -26,15 +26,10 @@ import org.apache.spark.sql.execution.streaming.state.StateStore import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf -class StreamingDeduplicationSuite extends StateStoreMetricsTest with BeforeAndAfterAll { +class StreamingDeduplicationSuite extends StateStoreMetricsTest { import testImplicits._ - override def afterAll(): Unit = { - super.afterAll() - StateStore.stop() - } - test("deduplicate with all columns") { val inputData = MemoryStream[String] val result = inputData.toDS().dropDuplicates() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala index 1dd817545a96..c170641372d6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala @@ -31,6 +31,7 @@ import org.apache.spark.SparkException import org.apache.spark.internal.Logging import org.apache.spark.sql.{Column, DataFrame, Dataset, Row} import org.apache.spark.sql.catalyst.expressions.{Literal, Rand, Randn, Shuffle, Uuid} +import org.apache.spark.sql.execution.exchange.ReusedExchangeExec import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.sources.TestForeachWriter import org.apache.spark.sql.functions._ @@ -500,29 +501,52 @@ class StreamingQuerySuite extends StreamTest with BeforeAndAfter with Logging wi AssertOnQuery { q => val lastProgress = getLastProgressWithData(q) assert(lastProgress.nonEmpty) - assert(lastProgress.get.numInputRows == 6) assert(lastProgress.get.sources.length == 1) - assert(lastProgress.get.sources(0).numInputRows == 6) + // The source is scanned twice because of self-union + assert(lastProgress.get.numInputRows == 6) true } ) } test("input row calculation with same V2 source used twice in self-join") { - val streamInput = MemoryStream[Int] - val df = streamInput.toDF() - testStream(df.join(df, "value"), useV2Sink = true)( - AddData(streamInput, 1, 2, 3), - CheckAnswer(1, 2, 3), - AssertOnQuery { q => + def checkQuery(check: AssertOnQuery): Unit = { + val memoryStream = MemoryStream[Int] + // TODO: currently the streaming framework always add a dummy Project above streaming source + // relation, which breaks exchange reuse, as the optimizer will remove Project from one side. + // Here we manually add a useful Project, to trigger exchange reuse. 
+ val streamDF = memoryStream.toDF().select('value + 0 as "v") + testStream(streamDF.join(streamDF, "v"), useV2Sink = true)( + AddData(memoryStream, 1, 2, 3), + CheckAnswer(1, 2, 3), + check + ) + } + + withSQLConf(SQLConf.EXCHANGE_REUSE_ENABLED.key -> "false") { + checkQuery(AssertOnQuery { q => val lastProgress = getLastProgressWithData(q) assert(lastProgress.nonEmpty) + assert(lastProgress.get.sources.length == 1) + // The source is scanned twice because of self-join assert(lastProgress.get.numInputRows == 6) + true + }) + } + + withSQLConf(SQLConf.EXCHANGE_REUSE_ENABLED.key -> "true") { + checkQuery(AssertOnQuery { q => + val lastProgress = getLastProgressWithData(q) + assert(lastProgress.nonEmpty) assert(lastProgress.get.sources.length == 1) - assert(lastProgress.get.sources(0).numInputRows == 6) + assert(q.lastExecution.executedPlan.collect { + case r: ReusedExchangeExec => r + }.length == 1) + // The source is scanned only once because of exchange reuse + assert(lastProgress.get.numInputRows == 3) true - } - ) + }) + } } test("input row calculation with trigger having data for only one of two V2 sources") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala index aeef4c8fe933..3a0e780a7391 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.sources.v2._ import org.apache.spark.sql.sources.v2.reader.{InputPartition, PartitionReaderFactory, ScanConfig, ScanConfigBuilder} import org.apache.spark.sql.sources.v2.reader.streaming._ import org.apache.spark.sql.sources.v2.writer.streaming.StreamingWriteSupport -import org.apache.spark.sql.streaming.{OutputMode, StreamTest, Trigger} +import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery, StreamTest, Trigger} import org.apache.spark.sql.types.StructType import org.apache.spark.util.Utils @@ -56,13 +56,19 @@ case class FakeReadSupport() extends MicroBatchReadSupport with ContinuousReadSu trait FakeMicroBatchReadSupportProvider extends MicroBatchReadSupportProvider { override def createMicroBatchReadSupport( checkpointLocation: String, - options: DataSourceOptions): MicroBatchReadSupport = FakeReadSupport() + options: DataSourceOptions): MicroBatchReadSupport = { + LastReadOptions.options = options + FakeReadSupport() + } } trait FakeContinuousReadSupportProvider extends ContinuousReadSupportProvider { override def createContinuousReadSupport( checkpointLocation: String, - options: DataSourceOptions): ContinuousReadSupport = FakeReadSupport() + options: DataSourceOptions): ContinuousReadSupport = { + LastReadOptions.options = options + FakeReadSupport() + } } trait FakeStreamingWriteSupportProvider extends StreamingWriteSupportProvider { @@ -71,16 +77,27 @@ trait FakeStreamingWriteSupportProvider extends StreamingWriteSupportProvider { schema: StructType, mode: OutputMode, options: DataSourceOptions): StreamingWriteSupport = { + LastWriteOptions.options = options throw new IllegalStateException("fake sink - cannot actually write") } } -class FakeReadMicroBatchOnly extends DataSourceRegister with FakeMicroBatchReadSupportProvider { +class FakeReadMicroBatchOnly + extends DataSourceRegister + with FakeMicroBatchReadSupportProvider + with SessionConfigSupport { override def 
shortName(): String = "fake-read-microbatch-only" + + override def keyPrefix: String = shortName() } -class FakeReadContinuousOnly extends DataSourceRegister with FakeContinuousReadSupportProvider { +class FakeReadContinuousOnly + extends DataSourceRegister + with FakeContinuousReadSupportProvider + with SessionConfigSupport { override def shortName(): String = "fake-read-continuous-only" + + override def keyPrefix: String = shortName() } class FakeReadBothModes extends DataSourceRegister @@ -92,8 +109,13 @@ class FakeReadNeitherMode extends DataSourceRegister { override def shortName(): String = "fake-read-neither-mode" } -class FakeWriteSupportProvider extends DataSourceRegister with FakeStreamingWriteSupportProvider { +class FakeWriteSupportProvider + extends DataSourceRegister + with FakeStreamingWriteSupportProvider + with SessionConfigSupport { override def shortName(): String = "fake-write-microbatch-continuous" + + override def keyPrefix: String = shortName() } class FakeNoWrite extends DataSourceRegister { @@ -121,6 +143,21 @@ class FakeWriteSupportProviderV1Fallback extends DataSourceRegister override def shortName(): String = "fake-write-v1-fallback" } +object LastReadOptions { + var options: DataSourceOptions = _ + + def clear(): Unit = { + options = null + } +} + +object LastWriteOptions { + var options: DataSourceOptions = _ + + def clear(): Unit = { + options = null + } +} class StreamingDataSourceV2Suite extends StreamTest { @@ -130,6 +167,11 @@ class StreamingDataSourceV2Suite extends StreamTest { spark.conf.set("spark.sql.streaming.checkpointLocation", fakeCheckpoint.getCanonicalPath) } + override def afterEach(): Unit = { + LastReadOptions.clear() + LastWriteOptions.clear() + } + val readFormats = Seq( "fake-read-microbatch-only", "fake-read-continuous-only", @@ -143,7 +185,14 @@ class StreamingDataSourceV2Suite extends StreamTest { Trigger.ProcessingTime(1000), Trigger.Continuous(1000)) - private def testPositiveCase(readFormat: String, writeFormat: String, trigger: Trigger) = { + private def testPositiveCase(readFormat: String, writeFormat: String, trigger: Trigger): Unit = { + testPositiveCaseWithQuery(readFormat, writeFormat, trigger)(() => _) + } + + private def testPositiveCaseWithQuery( + readFormat: String, + writeFormat: String, + trigger: Trigger)(check: StreamingQuery => Unit): Unit = { val query = spark.readStream .format(readFormat) .load() @@ -151,8 +200,8 @@ class StreamingDataSourceV2Suite extends StreamTest { .format(writeFormat) .trigger(trigger) .start() + check(query) query.stop() - query } private def testNegativeCase( @@ -188,19 +237,54 @@ class StreamingDataSourceV2Suite extends StreamTest { test("disabled v2 write") { // Ensure the V2 path works normally and generates a V2 sink.. - val v2Query = testPositiveCase( - "fake-read-microbatch-continuous", "fake-write-v1-fallback", Trigger.Once()) - assert(v2Query.asInstanceOf[StreamingQueryWrapper].streamingQuery.sink - .isInstanceOf[FakeWriteSupportProviderV1Fallback]) + testPositiveCaseWithQuery( + "fake-read-microbatch-continuous", "fake-write-v1-fallback", Trigger.Once()) { v2Query => + assert(v2Query.asInstanceOf[StreamingQueryWrapper].streamingQuery.sink + .isInstanceOf[FakeWriteSupportProviderV1Fallback]) + } // Ensure we create a V1 sink with the config. Note the config is a comma separated // list, including other fake entries. 
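As background for the SPARK-25460 session-option tests below: a V2 source opts into session-scoped options through `SessionConfigSupport`, exactly as the fake read/write providers above do. A hedged, minimal illustration (`ExampleSource` and its prefix are invented names):

import org.apache.spark.sql.sources.v2.{DataSourceV2, SessionConfigSupport}

class ExampleSource extends DataSourceV2 with SessionConfigSupport {
  // With this prefix, a session conf such as
  //   spark.conf.set("spark.datasource.example.optionA", "true")
  // reaches the source as the DataSourceOptions entry optionA -> true; an explicit
  // .option("optionA", ...) on the reader or writer takes precedence over it.
  override def keyPrefix: String = "example"
}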
val fullSinkName = classOf[FakeWriteSupportProviderV1Fallback].getName withSQLConf(SQLConf.DISABLED_V2_STREAMING_WRITERS.key -> s"a,b,c,test,$fullSinkName,d,e") { - val v1Query = testPositiveCase( - "fake-read-microbatch-continuous", "fake-write-v1-fallback", Trigger.Once()) - assert(v1Query.asInstanceOf[StreamingQueryWrapper].streamingQuery.sink - .isInstanceOf[FakeSink]) + testPositiveCaseWithQuery( + "fake-read-microbatch-continuous", "fake-write-v1-fallback", Trigger.Once()) { v1Query => + assert(v1Query.asInstanceOf[StreamingQueryWrapper].streamingQuery.sink + .isInstanceOf[FakeSink]) + } + } + } + + Seq( + Tuple2(classOf[FakeReadMicroBatchOnly], Trigger.Once()), + Tuple2(classOf[FakeReadContinuousOnly], Trigger.Continuous(1000)) + ).foreach { case (source, trigger) => + test(s"SPARK-25460: session options are respected in structured streaming sources - $source") { + // `keyPrefix` and `shortName` are the same in this test case + val readSource = source.newInstance().shortName() + val writeSource = "fake-write-microbatch-continuous" + + val readOptionName = "optionA" + withSQLConf(s"spark.datasource.$readSource.$readOptionName" -> "true") { + testPositiveCaseWithQuery(readSource, writeSource, trigger) { _ => + eventually(timeout(streamingTimeout)) { + // Write options should not be set. + assert(LastWriteOptions.options.getBoolean(readOptionName, false) == false) + assert(LastReadOptions.options.getBoolean(readOptionName, false) == true) + } + } + } + + val writeOptionName = "optionB" + withSQLConf(s"spark.datasource.$writeSource.$writeOptionName" -> "true") { + testPositiveCaseWithQuery(readSource, writeSource, trigger) { _ => + eventually(timeout(streamingTimeout)) { + // Read options should not be set. + assert(LastReadOptions.options.getBoolean(writeOptionName, false) == false) + assert(LastWriteOptions.options.getBoolean(writeOptionName, false) == true) + } + } + } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala index 2fb8f70a2079..6b03d1e5b766 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala @@ -40,7 +40,6 @@ import org.apache.spark.sql.catalyst.plans.PlanTestBase import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.execution.FilterExec -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.util.UninterruptibleThread import org.apache.spark.util.Utils @@ -167,18 +166,6 @@ private[sql] trait SQLTestUtilsBase super.withSQLConf(pairs: _*)(f) } - /** - * Generates a temporary path without creating the actual file/directory, then pass it to `f`. If - * a file/directory is created there by `f`, it will be delete after `f` returns. - * - * @todo Probably this method should be moved to a more general place - */ - protected def withTempPath(f: File => Unit): Unit = { - val path = Utils.createTempDir() - path.delete() - try f(path) finally Utils.deleteRecursively(path) - } - /** * Copy file in jar's resource to a temp file, then pass it to `f`. * This function is used to make `f` can use the path of temp file(e.g. 
file:/), instead of diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala index e6c7648c986a..0dd24d2d56b8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSQLContext.scala @@ -35,7 +35,10 @@ trait SharedSQLContext extends SQLTestUtils with SharedSparkSession { } protected override def afterAll(): Unit = { - super.afterAll() - doThreadPostAudit() + try { + super.afterAll() + } finally { + doThreadPostAudit() + } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala index 8968dbf36d50..e7e0ce64963a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala @@ -24,6 +24,7 @@ import org.scalatest.concurrent.Eventually import org.apache.spark.{DebugFilesystem, SparkConf} import org.apache.spark.sql.{SparkSession, SQLContext} +import org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation import org.apache.spark.sql.internal.SQLConf /** @@ -39,6 +40,11 @@ trait SharedSparkSession .set("spark.hadoop.fs.file.impl", classOf[DebugFilesystem].getName) .set("spark.unsafe.exceptionOnMemoryLeak", "true") .set(SQLConf.CODEGEN_FALLBACK.key, "false") + // Disable ConvertToLocalRelation for better test coverage. Test cases built on + // LocalRelation will exercise the optimization rules better by disabling it as + // this rule may potentially block testing of other optimization rules such as + // ConstantPropagation etc. + .set(SQLConf.OPTIMIZER_EXCLUDED_RULES.key, ConvertToLocalRelation.ruleName) } /** diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 9f247f9224c7..55e051c3ed1b 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../../pom.xml diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/UISeleniumSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/UISeleniumSuite.scala index 4c53dd8f4616..fef18f147b05 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/UISeleniumSuite.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/UISeleniumSuite.scala @@ -46,10 +46,13 @@ class UISeleniumSuite } override def afterAll(): Unit = { - if (webDriver != null) { - webDriver.quit() + try { + if (webDriver != null) { + webDriver.quit() + } + } finally { + super.afterAll() } - super.afterAll() } override protected def serverStartCommand(port: Int) = { diff --git a/sql/hive/benchmarks/OrcReadBenchmark-results.txt b/sql/hive/benchmarks/OrcReadBenchmark-results.txt new file mode 100644 index 000000000000..c77f966723d7 --- /dev/null +++ b/sql/hive/benchmarks/OrcReadBenchmark-results.txt @@ -0,0 +1,173 @@ +================================================================================================ +SQL Single Numeric Column Scan +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SQL Single TINYINT Column Scan: Best/Avg Time(ms) Rate(M/s) 
Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Native ORC MR 1630 / 1639 9.7 103.6 1.0X +Native ORC Vectorized 253 / 288 62.2 16.1 6.4X +Native ORC Vectorized with copy 227 / 244 69.2 14.5 7.2X +Hive built-in ORC 1980 / 1991 7.9 125.9 0.8X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SQL Single SMALLINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Native ORC MR 1587 / 1589 9.9 100.9 1.0X +Native ORC Vectorized 227 / 242 69.2 14.5 7.0X +Native ORC Vectorized with copy 228 / 238 69.0 14.5 7.0X +Hive built-in ORC 2323 / 2332 6.8 147.7 0.7X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SQL Single INT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Native ORC MR 1726 / 1771 9.1 109.7 1.0X +Native ORC Vectorized 309 / 333 50.9 19.7 5.6X +Native ORC Vectorized with copy 313 / 321 50.2 19.9 5.5X +Hive built-in ORC 2668 / 2672 5.9 169.6 0.6X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SQL Single BIGINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Native ORC MR 1722 / 1747 9.1 109.5 1.0X +Native ORC Vectorized 395 / 403 39.8 25.1 4.4X +Native ORC Vectorized with copy 399 / 405 39.4 25.4 4.3X +Hive built-in ORC 2767 / 2777 5.7 175.9 0.6X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SQL Single FLOAT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Native ORC MR 1797 / 1824 8.8 114.2 1.0X +Native ORC Vectorized 434 / 441 36.2 27.6 4.1X +Native ORC Vectorized with copy 437 / 447 36.0 27.8 4.1X +Hive built-in ORC 2701 / 2710 5.8 171.7 0.7X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SQL Single DOUBLE Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Native ORC MR 1931 / 2028 8.1 122.8 1.0X +Native ORC Vectorized 542 / 557 29.0 34.5 3.6X +Native ORC Vectorized with copy 550 / 564 28.6 35.0 3.5X +Hive built-in ORC 2816 / 3206 5.6 179.1 0.7X + + +================================================================================================ +Int and String Scan +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Int and String Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Native ORC MR 4012 / 4068 2.6 382.6 1.0X +Native ORC Vectorized 2337 / 2339 4.5 222.9 1.7X +Native ORC Vectorized with copy 2520 / 2540 4.2 240.3 1.6X +Hive built-in ORC 5503 / 5575 1.9 524.8 0.7X + + 
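Result files like this one are generated rather than hand-edited: a benchmark that extends `BenchmarkBase` (as in the `ColumnarBatchBenchmark` conversion earlier in this diff) writes to `benchmarks/<Name>-results.txt` when `SPARK_GENERATE_BENCHMARK_FILES=1` is set. A hedged sketch of that harness shape, with invented names:

import org.apache.spark.benchmark.{Benchmark, BenchmarkBase}

object ExampleBenchmark extends BenchmarkBase {
  override def runBenchmarkSuite(): Unit = {
    runBenchmark("Example group") {
      // `output` comes from BenchmarkBase; with SPARK_GENERATE_BENCHMARK_FILES=1 it points at
      // the results file for this benchmark, otherwise results are printed to the console.
      val benchmark = new Benchmark("Example case", 1000, output = output)
      benchmark.addCase("noop") { _ => () }
      benchmark.run()
    }
  }
}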
+================================================================================================ +Partitioned Table Scan +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Partitioned Table: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Data column - Native ORC MR 2020 / 2025 7.8 128.4 1.0X +Data column - Native ORC Vectorized 398 / 409 39.5 25.3 5.1X +Data column - Native ORC Vectorized with copy 406 / 411 38.8 25.8 5.0X +Data column - Hive built-in ORC 2967 / 2969 5.3 188.6 0.7X +Partition column - Native ORC MR 1494 / 1505 10.5 95.0 1.4X +Partition column - Native ORC Vectorized 73 / 82 216.3 4.6 27.8X +Partition column - Native ORC Vectorized with copy 71 / 80 221.4 4.5 28.4X +Partition column - Hive built-in ORC 1932 / 1937 8.1 122.8 1.0X +Both columns - Native ORC MR 2057 / 2071 7.6 130.8 1.0X +Both columns - Native ORC Vectorized 445 / 448 35.4 28.3 4.5X +Both column - Native ORC Vectorized with copy 534 / 539 29.4 34.0 3.8X +Both columns - Hive built-in ORC 2994 / 2994 5.3 190.3 0.7X + + +================================================================================================ +Repeated String Scan +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Repeated String: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Native ORC MR 1771 / 1785 5.9 168.9 1.0X +Native ORC Vectorized 372 / 375 28.2 35.5 4.8X +Native ORC Vectorized with copy 543 / 576 19.3 51.8 3.3X +Hive built-in ORC 2671 / 2671 3.9 254.7 0.7X + + +================================================================================================ +String with Nulls Scan +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +String with Nulls Scan (0.0%): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Native ORC MR 3276 / 3302 3.2 312.5 1.0X +Native ORC Vectorized 1057 / 1080 9.9 100.8 3.1X +Native ORC Vectorized with copy 1420 / 1431 7.4 135.4 2.3X +Hive built-in ORC 5377 / 5407 2.0 512.8 0.6X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +String with Nulls Scan (0.5%): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Native ORC MR 3147 / 3147 3.3 300.1 1.0X +Native ORC Vectorized 1305 / 1319 8.0 124.4 2.4X +Native ORC Vectorized with copy 1685 / 1686 6.2 160.7 1.9X +Hive built-in ORC 4077 / 4085 2.6 388.8 0.8X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +String with Nulls Scan (0.95%): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Native ORC MR 1739 / 1744 6.0 165.8 1.0X +Native ORC 
Vectorized 500 / 501 21.0 47.7 3.5X +Native ORC Vectorized with copy 618 / 631 17.0 58.9 2.8X +Hive built-in ORC 2411 / 2427 4.3 229.9 0.7X + + +================================================================================================ +Single Column Scan From Wide Columns +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Single Column Scan from 100 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Native ORC MR 1348 / 1366 0.8 1285.3 1.0X +Native ORC Vectorized 119 / 134 8.8 113.5 11.3X +Native ORC Vectorized with copy 119 / 148 8.8 113.9 11.3X +Hive built-in ORC 487 / 507 2.2 464.8 2.8X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Single Column Scan from 200 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Native ORC MR 2667 / 2837 0.4 2543.6 1.0X +Native ORC Vectorized 203 / 222 5.2 193.4 13.2X +Native ORC Vectorized with copy 217 / 255 4.8 207.0 12.3X +Hive built-in ORC 737 / 741 1.4 702.4 3.6X + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Single Column Scan from 300 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Native ORC MR 3954 / 3956 0.3 3770.4 1.0X +Native ORC Vectorized 348 / 360 3.0 331.7 11.4X +Native ORC Vectorized with copy 349 / 359 3.0 333.2 11.3X +Hive built-in ORC 1057 / 1067 1.0 1008.0 3.7X + + diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala index b9b2b7dbf38e..cebaad5b4ad9 100644 --- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala +++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala @@ -40,7 +40,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { private val originalColumnBatchSize = TestHive.conf.columnBatchSize private val originalInMemoryPartitionPruning = TestHive.conf.inMemoryPartitionPruning private val originalCrossJoinEnabled = TestHive.conf.crossJoinEnabled - private val originalLimitFlatGlobalLimit = TestHive.conf.limitFlatGlobalLimit private val originalSessionLocalTimeZone = TestHive.conf.sessionLocalTimeZone def testCases: Seq[(String, File)] = { @@ -60,8 +59,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { TestHive.setConf(SQLConf.IN_MEMORY_PARTITION_PRUNING, true) // Ensures that cross joins are enabled so that we can test them TestHive.setConf(SQLConf.CROSS_JOINS_ENABLED, true) - // Ensure that limit operation returns rows in the same order as Hive - TestHive.setConf(SQLConf.LIMIT_FLAT_GLOBAL_LIMIT, false) // Fix session local timezone to America/Los_Angeles for those timezone sensitive tests // (timestamp_*) TestHive.setConf(SQLConf.SESSION_LOCAL_TIMEZONE, "America/Los_Angeles") @@ -76,7 +73,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { 
TestHive.setConf(SQLConf.COLUMN_BATCH_SIZE, originalColumnBatchSize) TestHive.setConf(SQLConf.IN_MEMORY_PARTITION_PRUNING, originalInMemoryPartitionPruning) TestHive.setConf(SQLConf.CROSS_JOINS_ENABLED, originalCrossJoinEnabled) - TestHive.setConf(SQLConf.LIMIT_FLAT_GLOBAL_LIMIT, originalLimitFlatGlobalLimit) TestHive.setConf(SQLConf.SESSION_LOCAL_TIMEZONE, originalSessionLocalTimeZone) // For debugging dump some statistics about how much time was spent in various optimizer rules diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index c55ba32fa458..ef22e2abfb53 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../../pom.xml diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala index 5cc1047fc067..445161d5de1c 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala @@ -28,6 +28,7 @@ import scala.util.control.NonFatal import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.ql.metadata.HiveException +import org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_FORMAT import org.apache.thrift.TException import org.apache.spark.{SparkConf, SparkException} @@ -806,6 +807,8 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat updateLocationInStorageProps(table, newPath = None).copy( locationUri = tableLocation.map(CatalogUtils.stringToURI(_))) } + val storageWithoutHiveGeneratedProperties = storageWithLocation.copy( + properties = storageWithLocation.properties.filterKeys(!HIVE_GENERATED_STORAGE_PROPERTIES(_))) val partitionProvider = table.properties.get(TABLE_PARTITION_PROVIDER) val schemaFromTableProps = getSchemaFromTableProperties(table) @@ -814,7 +817,7 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat table.copy( provider = Some(provider), - storage = storageWithLocation, + storage = storageWithoutHiveGeneratedProperties, schema = reorderedSchema, partitionColumnNames = partColumnNames, bucketSpec = getBucketSpecFromTableProperties(table), @@ -865,7 +868,9 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat // and Hive will validate the column names in partition spec to make sure they are partition // columns. Here we Lowercase the column names before passing the partition spec to Hive // client, to satisfy Hive. + // scalastyle:off caselocale orderedPartitionSpec.put(colName.toLowerCase, partition(colName)) + // scalastyle:on caselocale } client.loadPartition( @@ -893,7 +898,9 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat // and Hive will validate the column names in partition spec to make sure they are partition // columns. Here we Lowercase the column names before passing the partition spec to Hive // client, to satisfy Hive. + // scalastyle:off caselocale orderedPartitionSpec.put(colName.toLowerCase, partition(colName)) + // scalastyle:on caselocale } client.loadDynamicPartitions( @@ -913,13 +920,17 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat // to lower case the column names in partition specification before calling partition related Hive // APIs, to match this behaviour. 
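The scalastyle:off caselocale markers above, and the toLowerCase(Locale.ROOT) calls elsewhere in this patch, address the same pitfall: String.toLowerCase() with no explicit locale uses the JVM default locale, and some locales change identifier characters. The usual example is Turkish, where an upper-case "I" lowers to a dotless "ı" and no longer matches the ASCII column name stored in the metastore. A short illustration ("tr" is simply the ISO language code for Turkish):

    import java.util.Locale

    object CaseLocalePitfall {
      def main(args: Array[String]): Unit = {
        val col = "ID"
        // Locale-sensitive lowering: under Turkish rules 'I' becomes dotless '\u0131'.
        println(col.toLowerCase(new Locale("tr")))  // prints "ıd", which would not match "id"
        // Locale-insensitive lowering: always produces the ASCII form.
        println(col.toLowerCase(Locale.ROOT))       // prints "id"
      }
    }

Where the patch deliberately keeps a bare toLowerCase for compatibility with Hive's own behaviour, the style rule is switched off locally instead of changing the call.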
private def lowerCasePartitionSpec(spec: TablePartitionSpec): TablePartitionSpec = { + // scalastyle:off caselocale spec.map { case (k, v) => k.toLowerCase -> v } + // scalastyle:on caselocale } // Build a map from lower-cased partition column names to exact column names for a given table private def buildLowerCasePartColNameMap(table: CatalogTable): Map[String, String] = { val actualPartColNames = table.partitionColumnNames + // scalastyle:off caselocale actualPartColNames.map(colName => (colName.toLowerCase, colName)).toMap + // scalastyle:on caselocale } // Hive metastore is not case preserving and the column names of the partition specification we @@ -928,7 +939,9 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat private def restorePartitionSpec( spec: TablePartitionSpec, partColMap: Map[String, String]): TablePartitionSpec = { + // scalastyle:off caselocale spec.map { case (k, v) => partColMap(k.toLowerCase) -> v } + // scalastyle:on caselocale } private def restorePartitionSpec( @@ -987,7 +1000,9 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat // When Hive rename partition for managed tables, it will create the partition location with // a default path generate by the new spec with lower cased partition column names. This is // unexpected and we need to rename them manually and alter the partition location. + // scalastyle:off caselocale val hasUpperCasePartitionColumn = partitionColumnNames.exists(col => col.toLowerCase != col) + // scalastyle:on caselocale if (tableMeta.tableType == MANAGED && hasUpperCasePartitionColumn) { val tablePath = new Path(tableMeta.location) val fs = tablePath.getFileSystem(hadoopConf) @@ -1028,7 +1043,9 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat // another partition to `A=1/B=3`, then we will have `A=1/B=2` and `a=1/b=3`, and we should // just move `a=1/b=3` into `A=1` with new name `B=3`. } else { + // scalastyle:off caselocale val actualPartitionString = getPartitionPathString(col.toLowerCase, partValue) + // scalastyle:on caselocale val actualPartitionPath = new Path(currentFullPath, actualPartitionString) try { fs.rename(actualPartitionPath, expectedPartitionPath) @@ -1179,7 +1196,9 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat clientPartitionNames.map { partitionPath => val partSpec = PartitioningUtils.parsePathFragmentAsSeq(partitionPath) partSpec.map { case (partName, partValue) => + // scalastyle:off caselocale partColNameMap(partName.toLowerCase) + "=" + escapePathName(partValue) + // scalastyle:on caselocale }.mkString("/") } } @@ -1309,6 +1328,8 @@ object HiveExternalCatalog { val CREATED_SPARK_VERSION = SPARK_SQL_PREFIX + "create.version" + val HIVE_GENERATED_STORAGE_PROPERTIES = Set(SERIALIZATION_FORMAT) + // When storing data source tables in hive metastore, we need to set data schema to empty if the // schema is hive-incompatible. However we need a hack to preserve existing behavior. 
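Several of the hunks above deal with the fact that the Hive metastore lower-cases partition column names, so Spark keeps a map from the lower-cased name back to the user-visible name and uses it to restore partition specs read from Hive. A toy version of that round trip (a standalone sketch, not the HiveExternalCatalog code itself):

    import java.util.Locale

    object PartitionSpecCase {
      type TablePartitionSpec = Map[String, String]

      // Lower-cased column name -> exact (user-visible) column name.
      def buildLowerCaseMap(partColNames: Seq[String]): Map[String, String] =
        partColNames.map(c => c.toLowerCase(Locale.ROOT) -> c).toMap

      // Specs coming back from the metastore use lower-cased keys; restore the originals.
      def restore(spec: TablePartitionSpec, colMap: Map[String, String]): TablePartitionSpec =
        spec.map { case (k, v) => colMap(k.toLowerCase(Locale.ROOT)) -> v }

      def main(args: Array[String]): Unit = {
        val colMap = buildLowerCaseMap(Seq("Year", "Month"))
        val fromMetastore = Map("year" -> "2018", "month" -> "10")
        println(restore(fromMetastore, colMap))  // Map(Year -> 2018, Month -> 10)
      }
    }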
Before // Spark 2.0, we do not set a default serde here (this was done in Hive), and so if the user diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index 8adfda07d29d..d04795332795 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -59,8 +59,10 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log // For testing only private[hive] def getCachedDataSourceTable(table: TableIdentifier): LogicalPlan = { val key = QualifiedTableName( + // scalastyle:off caselocale table.database.getOrElse(sessionState.catalog.getCurrentDatabase).toLowerCase, table.table.toLowerCase) + // scalastyle:on caselocale catalogProxy.getCachedTable(key) } @@ -273,6 +275,7 @@ private[hive] object HiveMetastoreCatalog { def mergeWithMetastoreSchema( metastoreSchema: StructType, inferredSchema: StructType): StructType = try { + // scalastyle:off caselocale // Find any nullable fields in mestastore schema that are missing from the inferred schema. val metastoreFields = metastoreSchema.map(f => f.name.toLowerCase -> f).toMap val missingNullables = metastoreFields @@ -282,6 +285,7 @@ private[hive] object HiveMetastoreCatalog { // Merge missing nullable fields to inferred schema and build a case-insensitive field map. val inferredFields = StructType(inferredSchema ++ missingNullables) .map(f => f.name.toLowerCase -> f).toMap + // scalastyle:on caselocale StructType(metastoreSchema.map(f => f.copy(name = inferredFields(f.name).name))) } catch { case NonFatal(_) => diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala index de41bb418181..405c0c8bfe66 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala @@ -131,14 +131,14 @@ private[sql] class HiveSessionCatalog( Try(super.lookupFunction(funcName, children)) match { case Success(expr) => expr case Failure(error) => - if (functionRegistry.functionExists(funcName)) { - // If the function actually exists in functionRegistry, it means that there is an - // error when we create the Expression using the given children. + if (super.functionExists(name)) { + // If the function exists (either in functionRegistry or externalCatalog), + // it means that there is an error when we create the Expression using the given children. // We need to throw the original exception. throw error } else { - // This function is not in functionRegistry, let's try to load it as a Hive's - // built-in function. + // This function does not exist (neither in functionRegistry or externalCatalog), + // let's try to load it as a Hive's built-in function. // Hive is case insensitive. 
val functionName = funcName.unquotedString.toLowerCase(Locale.ROOT) if (!hiveFunctions.contains(functionName)) { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index 02c1ed93eb2f..5e9b324a168e 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -467,9 +467,12 @@ private[hive] class HiveClientImpl( properties = filteredProperties, stats = readHiveStats(properties), comment = comment, - // In older versions of Spark(before 2.2.0), we expand the view original text and store - // that into `viewExpandedText`, and that should be used in view resolution. So we get - // `viewExpandedText` instead of `viewOriginalText` for viewText here. + // In older versions of Spark(before 2.2.0), we expand the view original text and + // store that into `viewExpandedText`, that should be used in view resolution. + // We get `viewExpandedText` as viewText, and also get `viewOriginalText` in order to + // display the original view text in `DESC [EXTENDED|FORMATTED] table` command for views + // that created by older versions of Spark. + viewOriginalText = Option(h.getViewOriginalText), viewText = Option(h.getViewExpandedText), unsupportedFeatures = unsupportedFeatures, ignoredProperties = ignoredProperties.toMap) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateHiveTableAsSelectCommand.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateHiveTableAsSelectCommand.scala index 0eb2f0de0acd..aa573b54a2b6 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateHiveTableAsSelectCommand.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateHiveTableAsSelectCommand.scala @@ -69,8 +69,8 @@ case class CreateHiveTableAsSelectCommand( // add the relation into catalog, just in case of failure occurs while data // processing. assert(tableDesc.schema.isEmpty) - val schema = DataWritingCommand.logicalPlanSchemaWithNames(query, outputColumnNames) - catalog.createTable(tableDesc.copy(schema = schema), ignoreIfExists = false) + catalog.createTable( + tableDesc.copy(schema = outputColumns.toStructType), ignoreIfExists = false) try { // Read back the metadata of the table which was created just now. diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveDirCommand.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveDirCommand.scala index 0a73aaa94bc7..0c694910b06d 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveDirCommand.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveDirCommand.scala @@ -34,6 +34,7 @@ import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.hive.client.HiveClientImpl +import org.apache.spark.sql.util.SchemaUtils /** * Command for writing the results of `query` to file system. 
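The InsertIntoHiveDirCommand hunk that follows adds a duplicate-column-name check whose case sensitivity tracks spark.sql.caseSensitive. As an illustration of what such a check amounts to, here is a standalone sketch (the helper name and exception type are invented; this is not Spark's actual SchemaUtils implementation):

    import java.util.Locale

    object DuplicateColumnCheck {
      // Fail if the same column name appears twice, comparing case-insensitively
      // unless caseSensitive is true.
      def checkNoDuplicates(colNames: Seq[String], caseSensitive: Boolean): Unit = {
        val normalized =
          if (caseSensitive) colNames
          else colNames.map(_.toLowerCase(Locale.ROOT))
        val dups = normalized.groupBy(identity).collect { case (name, ns) if ns.size > 1 => name }
        if (dups.nonEmpty) {
          throw new IllegalArgumentException(
            s"Found duplicate column(s) when inserting into the target: ${dups.mkString(", ")}")
        }
      }

      def main(args: Array[String]): Unit = {
        checkNoDuplicates(Seq("id", "ID"), caseSensitive = true)   // passes
        checkNoDuplicates(Seq("id", "ID"), caseSensitive = false)  // throws
      }
    }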
@@ -61,12 +62,16 @@ case class InsertIntoHiveDirCommand( override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = { assert(storage.locationUri.nonEmpty) + SchemaUtils.checkColumnNameDuplication( + outputColumnNames, + s"when inserting into ${storage.locationUri.get}", + sparkSession.sessionState.conf.caseSensitiveAnalysis) val hiveTable = HiveClientImpl.toHiveTable(CatalogTable( identifier = TableIdentifier(storage.locationUri.get.toString, Some("default")), tableType = org.apache.spark.sql.catalyst.catalog.CatalogTableType.VIEW, storage = storage, - schema = query.schema + schema = outputColumns.toStructType )) hiveTable.getMetadata.put(serdeConstants.SERIALIZATION_LIB, storage.serde.getOrElse(classOf[LazySimpleSerDe].getName)) @@ -104,8 +109,7 @@ case class InsertIntoHiveDirCommand( plan = child, hadoopConf = hadoopConf, fileSinkConf = fileSinkConf, - outputLocation = tmpPath.toString, - allColumns = outputColumns) + outputLocation = tmpPath.toString) val fs = writeToPath.getFileSystem(hadoopConf) if (overwrite && fs.exists(writeToPath)) { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala index 75a0563e72c9..0ed464dad91b 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala @@ -198,7 +198,6 @@ case class InsertIntoHiveTable( hadoopConf = hadoopConf, fileSinkConf = fileSinkConf, outputLocation = tmpLocation.toString, - allColumns = outputColumns, partitionAttributes = partitionAttributes) if (partition.nonEmpty) { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala index e0f7375387d2..078968ed0145 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala @@ -51,7 +51,6 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand { hadoopConf: Configuration, fileSinkConf: FileSinkDesc, outputLocation: String, - allColumns: Seq[Attribute], customPartitionLocations: Map[TablePartitionSpec, String] = Map.empty, partitionAttributes: Seq[Attribute] = Nil): Set[String] = { @@ -90,7 +89,7 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand { fileFormat = new HiveFileFormat(fileSinkConf), committer = committer, outputSpec = - FileFormatWriter.OutputSpec(outputLocation, customPartitionLocations, allColumns), + FileFormatWriter.OutputSpec(outputLocation, customPartitionLocations, outputColumns), hadoopConf = hadoopConf, partitionColumns = partitionAttributes, bucketSpec = None, diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala index ee3f99ab7e9b..71f15a45d162 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala @@ -36,6 +36,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.{SparkSession, SQLContext} import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation import org.apache.spark.sql.catalyst.catalog.{ExternalCatalog, ExternalCatalogWithListener} +import org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation 
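The TestHive hunk continuing below registers ConvertToLocalRelation under SQLConf.OPTIMIZER_EXCLUDED_RULES so that the rule is skipped while tests run. The same switch is an ordinary configuration key, so an application or test outside TestHive could get the equivalent behaviour like this (a sketch; the master, app name and sample query are placeholders):

    import org.apache.spark.sql.SparkSession

    object ExcludeOptimizerRuleExample {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder()
          .master("local[2]")                          // placeholder for a local run
          .appName("exclude-convert-to-local-relation")
          // Same key that SQLConf.OPTIMIZER_EXCLUDED_RULES exposes; the named rule
          // is skipped by the optimizer for this session.
          .config("spark.sql.optimizer.excludedRules",
            "org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation")
          .getOrCreate()

        import spark.implicits._
        // With the rule excluded, the Project over this small in-memory relation stays
        // in the optimized plan instead of being evaluated eagerly into a new LocalRelation.
        Seq((1, "a"), (2, "b")).toDF("id", "value").select(($"id" + 1).as("id2")).explain(true)

        spark.stop()
      }
    }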
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, OneRowRelation} import org.apache.spark.sql.execution.{QueryExecution, SQLExecution} import org.apache.spark.sql.execution.command.CacheTableCommand @@ -59,7 +60,12 @@ object TestHive .set("spark.sql.warehouse.dir", TestHiveContext.makeWarehouseDir().toURI.getPath) // SPARK-8910 .set("spark.ui.enabled", "false") - .set("spark.unsafe.exceptionOnMemoryLeak", "true"))) + .set("spark.unsafe.exceptionOnMemoryLeak", "true") + // Disable ConvertToLocalRelation for better test coverage. Test cases built on + // LocalRelation will exercise the optimization rules better by disabling it as + // this rule may potentially block testing of other optimization rules such as + // ConstantPropagation etc. + .set(SQLConf.OPTIMIZER_EXCLUDED_RULES.key, ConvertToLocalRelation.ruleName))) case class TestHiveVersion(hiveClient: HiveClient) diff --git a/sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDAFEmpty.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDAFEmpty.java new file mode 100644 index 000000000000..badc396688f5 --- /dev/null +++ b/sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDAFEmpty.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql.hive.execution; + +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; + +/** + * An empty UDAF that throws a semantic exception + */ +public class UDAFEmpty extends AbstractGenericUDAFResolver { + @Override + public GenericUDAFEvaluator getEvaluator(TypeInfo[] info) throws SemanticException { + throw new SemanticException("Can not get an evaluator of the empty UDAF"); + } +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/execution/benchmark/ObjectHashAggregateExecBenchmark.scala b/sql/hive/src/test/scala/org/apache/spark/sql/execution/benchmark/ObjectHashAggregateExecBenchmark.scala index e599d1ab1d48..3b33785cdfbb 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/execution/benchmark/ObjectHashAggregateExecBenchmark.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/execution/benchmark/ObjectHashAggregateExecBenchmark.scala @@ -21,6 +21,7 @@ import scala.concurrent.duration._ import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFPercentileApprox +import org.apache.spark.benchmark.Benchmark import org.apache.spark.sql.Column import org.apache.spark.sql.catalyst.FunctionIdentifier import org.apache.spark.sql.catalyst.catalog.CatalogFunction @@ -31,9 +32,8 @@ import org.apache.spark.sql.hive.execution.TestingTypedCount import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.LongType -import org.apache.spark.util.Benchmark -class ObjectHashAggregateExecBenchmark extends BenchmarkBase with TestHiveSingleton { +class ObjectHashAggregateExecBenchmark extends BenchmarkWithCodegen with TestHiveSingleton { ignore("Hive UDAF vs Spark AF") { val N = 2 << 15 diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CompressionCodecSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CompressionCodecSuite.scala index 4550d350f6db..1bd7e52c88ec 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CompressionCodecSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CompressionCodecSuite.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.hive import java.io.File +import java.util.Locale import scala.collection.JavaConverters._ @@ -50,23 +51,29 @@ class CompressionCodecSuite extends TestHiveSingleton with ParquetTest with Befo private val maxRecordNum = 50 - private def getConvertMetastoreConfName(format: String): String = format.toLowerCase match { - case "parquet" => HiveUtils.CONVERT_METASTORE_PARQUET.key - case "orc" => HiveUtils.CONVERT_METASTORE_ORC.key + private def getConvertMetastoreConfName(format: String): String = { + format.toLowerCase(Locale.ROOT) match { + case "parquet" => HiveUtils.CONVERT_METASTORE_PARQUET.key + case "orc" => HiveUtils.CONVERT_METASTORE_ORC.key + } } - private def getSparkCompressionConfName(format: String): String = format.toLowerCase match { - case "parquet" => SQLConf.PARQUET_COMPRESSION.key - case "orc" => SQLConf.ORC_COMPRESSION.key + private def getSparkCompressionConfName(format: String): String = { + format.toLowerCase(Locale.ROOT) match { + case "parquet" => SQLConf.PARQUET_COMPRESSION.key + case "orc" => SQLConf.ORC_COMPRESSION.key + } } - private def getHiveCompressPropName(format: String): String = format.toLowerCase match { - case "parquet" => ParquetOutputFormat.COMPRESSION - 
case "orc" => COMPRESS.getAttribute + private def getHiveCompressPropName(format: String): String = { + format.toLowerCase(Locale.ROOT) match { + case "parquet" => ParquetOutputFormat.COMPRESSION + case "orc" => COMPRESS.getAttribute + } } private def normalizeCodecName(format: String, name: String): String = { - format.toLowerCase match { + format.toLowerCase(Locale.ROOT) match { case "parquet" => ParquetOptions.getParquetCompressionCodecName(name) case "orc" => OrcOptions.getORCCompressionCodecName(name) } @@ -74,7 +81,7 @@ class CompressionCodecSuite extends TestHiveSingleton with ParquetTest with Befo private def getTableCompressionCodec(path: String, format: String): Seq[String] = { val hadoopConf = spark.sessionState.newHadoopConf() - val codecs = format.toLowerCase match { + val codecs = format.toLowerCase(Locale.ROOT) match { case "parquet" => for { footer <- readAllFootersWithoutSummaryFiles(new Path(path), hadoopConf) block <- footer.getParquetMetadata.getBlocks.asScala @@ -122,7 +129,7 @@ class CompressionCodecSuite extends TestHiveSingleton with ParquetTest with Befo """.stripMargin) } - private def writeDateToTableUsingCTAS( + private def writeDataToTableUsingCTAS( rootDir: File, tableName: String, partitionValue: Option[String], @@ -152,7 +159,7 @@ class CompressionCodecSuite extends TestHiveSingleton with ParquetTest with Befo usingCTAS: Boolean): String = { val partitionValue = if (isPartitioned) Some("test") else None if (usingCTAS) { - writeDateToTableUsingCTAS(tmpDir, tableName, partitionValue, format, compressionCodec) + writeDataToTableUsingCTAS(tmpDir, tableName, partitionValue, format, compressionCodec) } else { createTable(tmpDir, tableName, isPartitioned, format, compressionCodec) writeDataToTable(tableName, partitionValue) @@ -258,8 +265,7 @@ class CompressionCodecSuite extends TestHiveSingleton with ParquetTest with Befo def checkForTableWithCompressProp(format: String, compressCodecs: List[String]): Unit = { Seq(true, false).foreach { isPartitioned => Seq(true, false).foreach { convertMetastore => - // TODO: Also verify CTAS(usingCTAS=true) cases when the bug(SPARK-22926) is fixed. - Seq(false).foreach { usingCTAS => + Seq(true, false).foreach { usingCTAS => checkTableCompressionCodecForCodecs( format, isPartitioned, @@ -281,8 +287,7 @@ class CompressionCodecSuite extends TestHiveSingleton with ParquetTest with Befo def checkForTableWithoutCompressProp(format: String, compressCodecs: List[String]): Unit = { Seq(true, false).foreach { isPartitioned => Seq(true, false).foreach { convertMetastore => - // TODO: Also verify CTAS(usingCTAS=true) cases when the bug(SPARK-22926) is fixed. 
- Seq(false).foreach { usingCTAS => + Seq(true, false).foreach { usingCTAS => checkTableCompressionCodecForCodecs( format, isPartitioned, diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala index 25df3339e62f..fd4985d13188 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala @@ -49,10 +49,13 @@ class HiveExternalCatalogVersionsSuite extends SparkSubmitTestUtils { private val unusedJar = TestUtils.createJarWithClasses(Seq.empty) override def afterAll(): Unit = { - Utils.deleteRecursively(wareHousePath) - Utils.deleteRecursively(tmpDataDir) - Utils.deleteRecursively(sparkTestingDir) - super.afterAll() + try { + Utils.deleteRecursively(wareHousePath) + Utils.deleteRecursively(tmpDataDir) + Utils.deleteRecursively(sparkTestingDir) + } finally { + super.afterAll() + } } private def tryDownloadSpark(version: String, path: String): Unit = { @@ -203,7 +206,7 @@ class HiveExternalCatalogVersionsSuite extends SparkSubmitTestUtils { object PROCESS_TABLES extends QueryTest with SQLTestUtils { // Tests the latest version of every release line. - val testingVersions = Seq("2.1.3", "2.2.2", "2.3.1") + val testingVersions = Seq("2.1.3", "2.2.2", "2.3.2") protected var spark: SparkSession = _ diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetMetastoreSuite.scala similarity index 55% rename from sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala rename to sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetMetastoreSuite.scala index e82d457eee39..0d4f04015608 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetMetastoreSuite.scala @@ -19,44 +19,18 @@ package org.apache.spark.sql.hive import java.io.File -import org.apache.spark.sql._ +import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.catalog.HiveTableRelation import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.DataSourceScanExec -import org.apache.spark.sql.execution.datasources._ +import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, InsertIntoHadoopFsRelationCommand, LogicalRelation} import org.apache.spark.sql.hive.execution.HiveTableScanExec -import org.apache.spark.sql.hive.test.TestHiveSingleton -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.test.SQLTestUtils -import org.apache.spark.sql.types._ -import org.apache.spark.util.Utils - -// The data where the partitioning key exists only in the directory structure. 
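The afterAll change to HiveExternalCatalogVersionsSuite above is a small but useful pattern: the cleanup steps move inside try so that a failure while deleting any directory cannot prevent super.afterAll() in the finally block from running. A bare-bones sketch of the same shape (the suite, directories and delete helper are stand-ins, not the real test code):

    import java.io.File

    import org.scalatest.{BeforeAndAfterAll, FunSuite}

    class CleanupSuite extends FunSuite with BeforeAndAfterAll {
      private val tmpDirs = Seq(new File("/tmp/warehouse"), new File("/tmp/data"))

      private def deleteRecursively(f: File): Unit = {
        Option(f.listFiles()).toSeq.flatten.foreach(deleteRecursively)
        f.delete()
      }

      override def afterAll(): Unit = {
        try {
          // If any deletion throws, we still fall through to the parent teardown.
          tmpDirs.foreach(deleteRecursively)
        } finally {
          super.afterAll()
        }
      }

      test("placeholder") { assert(true) }
    }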
-case class ParquetData(intField: Int, stringField: String) -// The data that also includes the partitioning key -case class ParquetDataWithKey(p: Int, intField: Int, stringField: String) - -case class StructContainer(intStructField: Int, stringStructField: String) - -case class ParquetDataWithComplexTypes( - intField: Int, - stringField: String, - structField: StructContainer, - arrayField: Seq[Int]) - -case class ParquetDataWithKeyAndComplexTypes( - p: Int, - intField: Int, - stringField: String, - structField: StructContainer, - arrayField: Seq[Int]) /** * A suite to test the automatic conversion of metastore tables with parquet data to use the * built in parquet support. */ -class ParquetMetastoreSuite extends ParquetPartitioningTest { +class HiveParquetMetastoreSuite extends ParquetPartitioningTest { import hiveContext._ import spark.implicits._ @@ -70,78 +44,83 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { "jt", "jt_array", "test_parquet") - sql(s""" - create external table partitioned_parquet - ( - intField INT, - stringField STRING - ) - PARTITIONED BY (p int) - ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' - STORED AS - INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' - OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' - location '${partitionedTableDir.toURI}' - """) - - sql(s""" - create external table partitioned_parquet_with_key - ( - intField INT, - stringField STRING - ) - PARTITIONED BY (p int) - ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' - STORED AS - INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' - OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' - location '${partitionedTableDirWithKey.toURI}' - """) - - sql(s""" - create external table normal_parquet - ( - intField INT, - stringField STRING - ) - ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' - STORED AS - INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' - OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' - location '${new File(normalTableDir, "normal").toURI}' - """) - - sql(s""" - CREATE EXTERNAL TABLE partitioned_parquet_with_complextypes - ( - intField INT, - stringField STRING, - structField STRUCT, - arrayField ARRAY - ) - PARTITIONED BY (p int) - ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' - STORED AS - INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' - OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' - LOCATION '${partitionedTableDirWithComplexTypes.toURI}' - """) - - sql(s""" - CREATE EXTERNAL TABLE partitioned_parquet_with_key_and_complextypes - ( - intField INT, - stringField STRING, - structField STRUCT, - arrayField ARRAY - ) - PARTITIONED BY (p int) - ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' - STORED AS - INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' - OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' - LOCATION '${partitionedTableDirWithKeyAndComplexTypes.toURI}' - """) + sql( + s""" + |create external table partitioned_parquet + |( + | intField INT, + | stringField STRING + |) + |PARTITIONED BY (p int) + |ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' + | STORED AS + | INPUTFORMAT 
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' + | OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' + |location '${partitionedTableDir.toURI}' + """.stripMargin) + + sql( + s""" + |create external table partitioned_parquet_with_key + |( + | intField INT, + | stringField STRING + |) + |PARTITIONED BY (p int) + |ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' + | STORED AS + | INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' + | OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' + |location '${partitionedTableDirWithKey.toURI}' + """.stripMargin) + + sql( + s""" + |create external table normal_parquet + |( + | intField INT, + | stringField STRING + |) + |ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' + | STORED AS + | INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' + | OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' + |location '${new File(normalTableDir, "normal").toURI}' + """.stripMargin) + + sql( + s""" + |CREATE EXTERNAL TABLE partitioned_parquet_with_complextypes + |( + | intField INT, + | stringField STRING, + | structField STRUCT, + | arrayField ARRAY + |) + |PARTITIONED BY (p int) + |ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' + | STORED AS + | INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' + | OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' + |LOCATION '${partitionedTableDirWithComplexTypes.toURI}' + """.stripMargin) + + sql( + s""" + |CREATE EXTERNAL TABLE partitioned_parquet_with_key_and_complextypes + |( + | intField INT, + | stringField STRING, + | structField STRUCT, + | arrayField ARRAY + |) + |PARTITIONED BY (p int) + |ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' + | STORED AS + | INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' + | OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' + |LOCATION '${partitionedTableDirWithKeyAndComplexTypes.toURI}' + """.stripMargin) sql( """ @@ -291,7 +270,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { case LogicalRelation(_: HadoopFsRelation, _, _, _) => // OK case _ => fail( "test_parquet_ctas should be converted to " + - s"${classOf[HadoopFsRelation ].getCanonicalName }") + s"${classOf[HadoopFsRelation ].getCanonicalName }") } } } @@ -430,7 +409,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { } test("SPARK-15968: nonempty partitioned metastore Parquet table lookup should use cached " + - "relation") { + "relation") { withTable("partitioned") { sql( """ @@ -678,404 +657,3 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { sql("SELECT * FROM normal_parquet x CROSS JOIN normal_parquet y")) } } - -/** - * A suite of tests for the Parquet support through the data sources API. 
- */ -class ParquetSourceSuite extends ParquetPartitioningTest { - import testImplicits._ - import spark._ - - override def beforeAll(): Unit = { - super.beforeAll() - dropTables("partitioned_parquet", - "partitioned_parquet_with_key", - "partitioned_parquet_with_complextypes", - "partitioned_parquet_with_key_and_complextypes", - "normal_parquet") - - sql( s""" - CREATE TEMPORARY VIEW partitioned_parquet - USING org.apache.spark.sql.parquet - OPTIONS ( - path '${partitionedTableDir.toURI}' - ) - """) - - sql( s""" - CREATE TEMPORARY VIEW partitioned_parquet_with_key - USING org.apache.spark.sql.parquet - OPTIONS ( - path '${partitionedTableDirWithKey.toURI}' - ) - """) - - sql( s""" - CREATE TEMPORARY VIEW normal_parquet - USING org.apache.spark.sql.parquet - OPTIONS ( - path '${new File(partitionedTableDir, "p=1").toURI}' - ) - """) - - sql( s""" - CREATE TEMPORARY VIEW partitioned_parquet_with_key_and_complextypes - USING org.apache.spark.sql.parquet - OPTIONS ( - path '${partitionedTableDirWithKeyAndComplexTypes.toURI}' - ) - """) - - sql( s""" - CREATE TEMPORARY VIEW partitioned_parquet_with_complextypes - USING org.apache.spark.sql.parquet - OPTIONS ( - path '${partitionedTableDirWithComplexTypes.toURI}' - ) - """) - } - - test("SPARK-6016 make sure to use the latest footers") { - sql("drop table if exists spark_6016_fix") - - // Create a DataFrame with two partitions. So, the created table will have two parquet files. - val df1 = (1 to 10).map(Tuple1(_)).toDF("a").coalesce(2) - df1.write.mode(SaveMode.Overwrite).format("parquet").saveAsTable("spark_6016_fix") - checkAnswer( - sql("select * from spark_6016_fix"), - (1 to 10).map(i => Row(i)) - ) - - // Create a DataFrame with four partitions. So, the created table will have four parquet files. - val df2 = (1 to 10).map(Tuple1(_)).toDF("b").coalesce(4) - df2.write.mode(SaveMode.Overwrite).format("parquet").saveAsTable("spark_6016_fix") - // For the bug of SPARK-6016, we are caching two outdated footers for df1. Then, - // since the new table has four parquet files, we are trying to read new footers from two files - // and then merge metadata in footers of these four (two outdated ones and two latest one), - // which will cause an error. 
- checkAnswer( - sql("select * from spark_6016_fix"), - (1 to 10).map(i => Row(i)) - ) - - sql("drop table spark_6016_fix") - } - - test("SPARK-8811: compatibility with array of struct in Hive") { - withTempPath { dir => - withTable("array_of_struct") { - val conf = Seq( - HiveUtils.CONVERT_METASTORE_PARQUET.key -> "false", - SQLConf.PARQUET_BINARY_AS_STRING.key -> "true", - SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key -> "false") - - withSQLConf(conf: _*) { - sql( - s"""CREATE TABLE array_of_struct - |STORED AS PARQUET LOCATION '${dir.toURI}' - |AS SELECT - | '1st' AS a, - | '2nd' AS b, - | ARRAY(NAMED_STRUCT('a', 'val_a', 'b', 'val_b')) AS c - """.stripMargin) - - checkAnswer( - spark.read.parquet(dir.getCanonicalPath), - Row("1st", "2nd", Seq(Row("val_a", "val_b")))) - } - } - } - } - - test("Verify the PARQUET conversion parameter: CONVERT_METASTORE_PARQUET") { - withTempView("single") { - val singleRowDF = Seq((0, "foo")).toDF("key", "value") - singleRowDF.createOrReplaceTempView("single") - - Seq("true", "false").foreach { parquetConversion => - withSQLConf(HiveUtils.CONVERT_METASTORE_PARQUET.key -> parquetConversion) { - val tableName = "test_parquet_ctas" - withTable(tableName) { - sql( - s""" - |CREATE TABLE $tableName STORED AS PARQUET - |AS SELECT tmp.key, tmp.value FROM single tmp - """.stripMargin) - - val df = spark.sql(s"SELECT * FROM $tableName WHERE key=0") - checkAnswer(df, singleRowDF) - - val queryExecution = df.queryExecution - if (parquetConversion == "true") { - queryExecution.analyzed.collectFirst { - case _: LogicalRelation => - }.getOrElse { - fail(s"Expecting the query plan to convert parquet to data sources, " + - s"but got:\n$queryExecution") - } - } else { - queryExecution.analyzed.collectFirst { - case _: HiveTableRelation => - }.getOrElse { - fail(s"Expecting no conversion from parquet to data sources, " + - s"but got:\n$queryExecution") - } - } - } - } - } - } - } - - test("values in arrays and maps stored in parquet are always nullable") { - val df = createDataFrame(Tuple2(Map(2 -> 3), Seq(4, 5, 6)) :: Nil).toDF("m", "a") - val mapType1 = MapType(IntegerType, IntegerType, valueContainsNull = false) - val arrayType1 = ArrayType(IntegerType, containsNull = false) - val expectedSchema1 = - StructType( - StructField("m", mapType1, nullable = true) :: - StructField("a", arrayType1, nullable = true) :: Nil) - assert(df.schema === expectedSchema1) - - withTable("alwaysNullable") { - df.write.format("parquet").saveAsTable("alwaysNullable") - - val mapType2 = MapType(IntegerType, IntegerType, valueContainsNull = true) - val arrayType2 = ArrayType(IntegerType, containsNull = true) - val expectedSchema2 = - StructType( - StructField("m", mapType2, nullable = true) :: - StructField("a", arrayType2, nullable = true) :: Nil) - - assert(table("alwaysNullable").schema === expectedSchema2) - - checkAnswer( - sql("SELECT m, a FROM alwaysNullable"), - Row(Map(2 -> 3), Seq(4, 5, 6))) - } - } - - test("Aggregation attribute names can't contain special chars \" ,;{}()\\n\\t=\"") { - val tempDir = Utils.createTempDir() - val filePath = new File(tempDir, "testParquet").getCanonicalPath - val filePath2 = new File(tempDir, "testParquet2").getCanonicalPath - - val df = Seq(1, 2, 3).map(i => (i, i.toString)).toDF("int", "str") - val df2 = df.as('x).join(df.as('y), $"x.str" === $"y.str").groupBy("y.str").max("y.int") - intercept[Throwable](df2.write.parquet(filePath)) - - val df3 = df2.toDF("str", "max_int") - df3.write.parquet(filePath2) - val df4 = read.parquet(filePath2) - 
checkAnswer(df4, Row("1", 1) :: Row("2", 2) :: Row("3", 3) :: Nil) - assert(df4.columns === Array("str", "max_int")) - } -} - -/** - * A collection of tests for parquet data with various forms of partitioning. - */ -abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with TestHiveSingleton { - import testImplicits._ - - var partitionedTableDir: File = null - var normalTableDir: File = null - var partitionedTableDirWithKey: File = null - var partitionedTableDirWithComplexTypes: File = null - var partitionedTableDirWithKeyAndComplexTypes: File = null - - override def beforeAll(): Unit = { - super.beforeAll() - partitionedTableDir = Utils.createTempDir() - normalTableDir = Utils.createTempDir() - - (1 to 10).foreach { p => - val partDir = new File(partitionedTableDir, s"p=$p") - sparkContext.makeRDD(1 to 10) - .map(i => ParquetData(i, s"part-$p")) - .toDF() - .write.parquet(partDir.getCanonicalPath) - } - - sparkContext - .makeRDD(1 to 10) - .map(i => ParquetData(i, s"part-1")) - .toDF() - .write.parquet(new File(normalTableDir, "normal").getCanonicalPath) - - partitionedTableDirWithKey = Utils.createTempDir() - - (1 to 10).foreach { p => - val partDir = new File(partitionedTableDirWithKey, s"p=$p") - sparkContext.makeRDD(1 to 10) - .map(i => ParquetDataWithKey(p, i, s"part-$p")) - .toDF() - .write.parquet(partDir.getCanonicalPath) - } - - partitionedTableDirWithKeyAndComplexTypes = Utils.createTempDir() - - (1 to 10).foreach { p => - val partDir = new File(partitionedTableDirWithKeyAndComplexTypes, s"p=$p") - sparkContext.makeRDD(1 to 10).map { i => - ParquetDataWithKeyAndComplexTypes( - p, i, s"part-$p", StructContainer(i, f"${i}_string"), 1 to i) - }.toDF().write.parquet(partDir.getCanonicalPath) - } - - partitionedTableDirWithComplexTypes = Utils.createTempDir() - - (1 to 10).foreach { p => - val partDir = new File(partitionedTableDirWithComplexTypes, s"p=$p") - sparkContext.makeRDD(1 to 10).map { i => - ParquetDataWithComplexTypes(i, s"part-$p", StructContainer(i, f"${i}_string"), 1 to i) - }.toDF().write.parquet(partDir.getCanonicalPath) - } - } - - override protected def afterAll(): Unit = { - try { - partitionedTableDir.delete() - normalTableDir.delete() - partitionedTableDirWithKey.delete() - partitionedTableDirWithComplexTypes.delete() - partitionedTableDirWithKeyAndComplexTypes.delete() - } finally { - super.afterAll() - } - } - - /** - * Drop named tables if they exist - * - * @param tableNames tables to drop - */ - def dropTables(tableNames: String*): Unit = { - tableNames.foreach { name => - sql(s"DROP TABLE IF EXISTS $name") - } - } - - Seq( - "partitioned_parquet", - "partitioned_parquet_with_key", - "partitioned_parquet_with_complextypes", - "partitioned_parquet_with_key_and_complextypes").foreach { table => - - test(s"ordering of the partitioning columns $table") { - checkAnswer( - sql(s"SELECT p, stringField FROM $table WHERE p = 1"), - Seq.fill(10)(Row(1, "part-1")) - ) - - checkAnswer( - sql(s"SELECT stringField, p FROM $table WHERE p = 1"), - Seq.fill(10)(Row("part-1", 1)) - ) - } - - test(s"project the partitioning column $table") { - checkAnswer( - sql(s"SELECT p, count(*) FROM $table group by p"), - Row(1, 10) :: - Row(2, 10) :: - Row(3, 10) :: - Row(4, 10) :: - Row(5, 10) :: - Row(6, 10) :: - Row(7, 10) :: - Row(8, 10) :: - Row(9, 10) :: - Row(10, 10) :: Nil - ) - } - - test(s"project partitioning and non-partitioning columns $table") { - checkAnswer( - sql(s"SELECT stringField, p, count(intField) FROM $table GROUP BY p, stringField"), - 
Row("part-1", 1, 10) :: - Row("part-2", 2, 10) :: - Row("part-3", 3, 10) :: - Row("part-4", 4, 10) :: - Row("part-5", 5, 10) :: - Row("part-6", 6, 10) :: - Row("part-7", 7, 10) :: - Row("part-8", 8, 10) :: - Row("part-9", 9, 10) :: - Row("part-10", 10, 10) :: Nil - ) - } - - test(s"simple count $table") { - checkAnswer( - sql(s"SELECT COUNT(*) FROM $table"), - Row(100)) - } - - test(s"pruned count $table") { - checkAnswer( - sql(s"SELECT COUNT(*) FROM $table WHERE p = 1"), - Row(10)) - } - - test(s"non-existent partition $table") { - checkAnswer( - sql(s"SELECT COUNT(*) FROM $table WHERE p = 1000"), - Row(0)) - } - - test(s"multi-partition pruned count $table") { - checkAnswer( - sql(s"SELECT COUNT(*) FROM $table WHERE p IN (1,2,3)"), - Row(30)) - } - - test(s"non-partition predicates $table") { - checkAnswer( - sql(s"SELECT COUNT(*) FROM $table WHERE intField IN (1,2,3)"), - Row(30)) - } - - test(s"sum $table") { - checkAnswer( - sql(s"SELECT SUM(intField) FROM $table WHERE intField IN (1,2,3) AND p = 1"), - Row(1 + 2 + 3)) - } - - test(s"hive udfs $table") { - checkAnswer( - sql(s"SELECT concat(stringField, stringField) FROM $table"), - sql(s"SELECT stringField FROM $table").rdd.map { - case Row(s: String) => Row(s + s) - }.collect().toSeq) - } - } - - Seq( - "partitioned_parquet_with_key_and_complextypes", - "partitioned_parquet_with_complextypes").foreach { table => - - test(s"SPARK-5775 read struct from $table") { - checkAnswer( - sql( - s""" - |SELECT p, structField.intStructField, structField.stringStructField - |FROM $table WHERE p = 1 - """.stripMargin), - (1 to 10).map(i => Row(1, i, f"${i}_string"))) - } - - test(s"SPARK-5775 read array from $table") { - checkAnswer( - sql(s"SELECT arrayField, p FROM $table WHERE p = 1"), - (1 to 10).map(i => Row((1 to i).toArray, 1))) - } - } - - - test("non-part select(*)") { - checkAnswer( - sql("SELECT COUNT(*) FROM normal_parquet"), - Row(10)) - } -} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSourceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSourceSuite.scala new file mode 100644 index 000000000000..de588768cfde --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSourceSuite.scala @@ -0,0 +1,225 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive + +import java.io.File + +import org.apache.spark.sql.{Row, SaveMode} +import org.apache.spark.sql.catalyst.catalog.HiveTableRelation +import org.apache.spark.sql.execution.datasources.LogicalRelation +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types._ +import org.apache.spark.util.Utils + +/** + * A suite of tests for the Parquet support through the data sources API. 
+ */ +class HiveParquetSourceSuite extends ParquetPartitioningTest { + import testImplicits._ + import spark._ + + override def beforeAll(): Unit = { + super.beforeAll() + dropTables("partitioned_parquet", + "partitioned_parquet_with_key", + "partitioned_parquet_with_complextypes", + "partitioned_parquet_with_key_and_complextypes", + "normal_parquet") + + sql( + s""" + |CREATE TEMPORARY VIEW partitioned_parquet + |USING org.apache.spark.sql.parquet + |OPTIONS ( + | path '${partitionedTableDir.toURI}' + |) + """.stripMargin) + + sql( + s""" + |CREATE TEMPORARY VIEW partitioned_parquet_with_key + |USING org.apache.spark.sql.parquet + |OPTIONS ( + | path '${partitionedTableDirWithKey.toURI}' + |) + """.stripMargin) + + sql( + s""" + |CREATE TEMPORARY VIEW normal_parquet + |USING org.apache.spark.sql.parquet + |OPTIONS ( + | path '${new File(partitionedTableDir, "p=1").toURI}' + |) + """.stripMargin) + + sql( + s""" + |CREATE TEMPORARY VIEW partitioned_parquet_with_key_and_complextypes + |USING org.apache.spark.sql.parquet + |OPTIONS ( + | path '${partitionedTableDirWithKeyAndComplexTypes.toURI}' + |) + """.stripMargin) + + sql( + s""" + |CREATE TEMPORARY VIEW partitioned_parquet_with_complextypes + |USING org.apache.spark.sql.parquet + |OPTIONS ( + | path '${partitionedTableDirWithComplexTypes.toURI}' + |) + """.stripMargin) + } + + test("SPARK-6016 make sure to use the latest footers") { + val tableName = "spark_6016_fix" + withTable(tableName) { + // Create a DataFrame with two partitions. So, the created table will have two parquet files. + val df1 = (1 to 10).map(Tuple1(_)).toDF("a").coalesce(2) + df1.write.mode(SaveMode.Overwrite).format("parquet").saveAsTable(tableName) + checkAnswer( + sql(s"select * from $tableName"), + (1 to 10).map(i => Row(i)) + ) + + // Create a DataFrame with four partitions. So the created table will have four parquet files. + val df2 = (1 to 10).map(Tuple1(_)).toDF("b").coalesce(4) + df2.write.mode(SaveMode.Overwrite).format("parquet").saveAsTable(tableName) + // For the bug of SPARK-6016, we are caching two outdated footers for df1. Then, + // since the new table has four parquet files, we are trying to read new footers from two + // files and then merge metadata in footers of these four + // (two outdated ones and two latest one), which will cause an error. 
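The temporary views this new suite registers over raw parquet directories use the SQL form CREATE TEMPORARY VIEW ... USING ... OPTIONS (path ...). For readers more familiar with the DataFrame API, roughly the same thing can be written programmatically, as in this spark-shell style sketch (the path is a placeholder and `spark` is the session provided by the shell or test fixture):

    // Assuming an active SparkSession named `spark` and a directory of parquet files.
    val partitionedTablePath = "/path/to/partitioned_parquet"  // placeholder path

    // Comparable in effect to:
    //   CREATE TEMPORARY VIEW partitioned_parquet
    //   USING org.apache.spark.sql.parquet
    //   OPTIONS (path '<dir>')
    spark.read
      .format("parquet")
      .load(partitionedTablePath)
      .createOrReplaceTempView("partitioned_parquet")

    // The view can then be queried with SQL just as the suite does:
    spark.sql("SELECT COUNT(*) FROM partitioned_parquet").show()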
+ checkAnswer( + sql(s"select * from $tableName"), + (1 to 10).map(i => Row(i)) + ) + } + } + + test("SPARK-8811: compatibility with array of struct in Hive") { + withTempPath { dir => + withTable("array_of_struct") { + val conf = Seq( + HiveUtils.CONVERT_METASTORE_PARQUET.key -> "false", + SQLConf.PARQUET_BINARY_AS_STRING.key -> "true", + SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key -> "false") + + withSQLConf(conf: _*) { + sql( + s"""CREATE TABLE array_of_struct + |STORED AS PARQUET LOCATION '${dir.toURI}' + |AS SELECT + | '1st' AS a, + | '2nd' AS b, + | ARRAY(NAMED_STRUCT('a', 'val_a', 'b', 'val_b')) AS c + """.stripMargin) + + checkAnswer( + spark.read.parquet(dir.getCanonicalPath), + Row("1st", "2nd", Seq(Row("val_a", "val_b")))) + } + } + } + } + + test("Verify the PARQUET conversion parameter: CONVERT_METASTORE_PARQUET") { + withTempView("single") { + val singleRowDF = Seq((0, "foo")).toDF("key", "value") + singleRowDF.createOrReplaceTempView("single") + + Seq("true", "false").foreach { parquetConversion => + withSQLConf(HiveUtils.CONVERT_METASTORE_PARQUET.key -> parquetConversion) { + val tableName = "test_parquet_ctas" + withTable(tableName) { + sql( + s""" + |CREATE TABLE $tableName STORED AS PARQUET + |AS SELECT tmp.key, tmp.value FROM single tmp + """.stripMargin) + + val df = spark.sql(s"SELECT * FROM $tableName WHERE key=0") + checkAnswer(df, singleRowDF) + + val queryExecution = df.queryExecution + if (parquetConversion == "true") { + queryExecution.analyzed.collectFirst { + case _: LogicalRelation => + }.getOrElse { + fail(s"Expecting the query plan to convert parquet to data sources, " + + s"but got:\n$queryExecution") + } + } else { + queryExecution.analyzed.collectFirst { + case _: HiveTableRelation => + }.getOrElse { + fail(s"Expecting no conversion from parquet to data sources, " + + s"but got:\n$queryExecution") + } + } + } + } + } + } + } + + test("values in arrays and maps stored in parquet are always nullable") { + val df = createDataFrame(Tuple2(Map(2 -> 3), Seq(4, 5, 6)) :: Nil).toDF("m", "a") + val mapType1 = MapType(IntegerType, IntegerType, valueContainsNull = false) + val arrayType1 = ArrayType(IntegerType, containsNull = false) + val expectedSchema1 = + StructType( + StructField("m", mapType1, nullable = true) :: + StructField("a", arrayType1, nullable = true) :: Nil) + assert(df.schema === expectedSchema1) + + withTable("alwaysNullable") { + df.write.format("parquet").saveAsTable("alwaysNullable") + + val mapType2 = MapType(IntegerType, IntegerType, valueContainsNull = true) + val arrayType2 = ArrayType(IntegerType, containsNull = true) + val expectedSchema2 = + StructType( + StructField("m", mapType2, nullable = true) :: + StructField("a", arrayType2, nullable = true) :: Nil) + + assert(table("alwaysNullable").schema === expectedSchema2) + + checkAnswer( + sql("SELECT m, a FROM alwaysNullable"), + Row(Map(2 -> 3), Seq(4, 5, 6))) + } + } + + test("Aggregation attribute names can't contain special chars \" ,;{}()\\n\\t=\"") { + withTempDir { tempDir => + val filePath = new File(tempDir, "testParquet").getCanonicalPath + val filePath2 = new File(tempDir, "testParquet2").getCanonicalPath + + val df = Seq(1, 2, 3).map(i => (i, i.toString)).toDF("int", "str") + val df2 = df.as('x).join(df.as('y), $"x.str" === $"y.str").groupBy("y.str").max("y.int") + intercept[Throwable](df2.write.parquet(filePath)) + + val df3 = df2.toDF("str", "max_int") + df3.write.parquet(filePath2) + val df4 = read.parquet(filePath2) + checkAnswer(df4, Row("1", 1) :: Row("2", 2) :: Row("3", 3) 
:: Nil) + assert(df4.columns === Array("str", "max_int")) + } + } +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSchemaInferenceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSchemaInferenceSuite.scala index 51a48a20daaa..aa4fc13333c4 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSchemaInferenceSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSchemaInferenceSuite.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.hive import java.io.File +import java.util.Locale import scala.util.Random @@ -56,7 +57,7 @@ class HiveSchemaInferenceSuite // Return a copy of the given schema with all field names converted to lower case. private def lowerCaseSchema(schema: StructType): StructType = { - StructType(schema.map(f => f.copy(name = f.name.toLowerCase))) + StructType(schema.map(f => f.copy(name = f.name.toLowerCase(Locale.ROOT)))) } // Create a Hive external test table containing the given field and partition column names. @@ -78,7 +79,7 @@ class HiveSchemaInferenceSuite val partitionStructFields = partitionCols.map { field => StructField( // Partition column case isn't preserved - name = field.toLowerCase, + name = field.toLowerCase(Locale.ROOT), dataType = IntegerType, nullable = true, metadata = Metadata.empty) @@ -113,7 +114,7 @@ class HiveSchemaInferenceSuite properties = Map("serialization.format" -> "1")), schema = schema, provider = Option("hive"), - partitionColumnNames = partitionCols.map(_.toLowerCase), + partitionColumnNames = partitionCols.map(_.toLowerCase(Locale.ROOT)), properties = Map.empty), true) @@ -180,7 +181,7 @@ class HiveSchemaInferenceSuite val catalogTable = externalCatalog.getTable(DATABASE, TEST_TABLE_NAME) assert(catalogTable.schemaPreservesCase) assert(catalogTable.schema == schema) - assert(catalogTable.partitionColumnNames == partCols.map(_.toLowerCase)) + assert(catalogTable.partitionColumnNames == partCols.map(_.toLowerCase(Locale.ROOT))) } } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala index a676cf6ce692..f839e8979d35 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala @@ -33,11 +33,13 @@ import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.expressions.Window import org.apache.spark.sql.hive.test.{TestHive, TestHiveContext} import org.apache.spark.sql.types.{DecimalType, StructType} +import org.apache.spark.tags.ExtendedHiveTest import org.apache.spark.util.{ResetSystemProperties, Utils} /** * This suite tests spark-submit with applications using HiveContext. 
*/ +@ExtendedHiveTest class HiveSparkSubmitSuite extends SparkSubmitTestUtils with Matchers @@ -46,8 +48,6 @@ class HiveSparkSubmitSuite override protected val enableAutoThreadAudit = false - // TODO: rewrite these or mark them as slow tests to be run sparingly - override def beforeEach() { super.beforeEach() } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala index ab91727049ff..5879748d05b2 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala @@ -26,6 +26,7 @@ import org.apache.spark.sql.{QueryTest, _} import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.catalyst.plans.logical.InsertIntoTable import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types._ import org.apache.spark.util.Utils @@ -750,4 +751,27 @@ class InsertSuite extends QueryTest with TestHiveSingleton with BeforeAndAfter } } } + + Seq("LOCAL", "").foreach { local => + Seq(true, false).foreach { caseSensitivity => + Seq("orc", "parquet").foreach { format => + test(s"SPARK-25389 INSERT OVERWRITE $local DIRECTORY ... STORED AS with duplicated names" + + s"(caseSensitivity=$caseSensitivity, format=$format)") { + withTempDir { dir => + withSQLConf(SQLConf.CASE_SENSITIVE.key -> s"$caseSensitivity") { + val m = intercept[AnalysisException] { + sql( + s""" + |INSERT OVERWRITE $local DIRECTORY '${dir.toURI}' + |STORED AS $format + |SELECT 'id', 'id2' ${if (caseSensitivity) "id" else "ID"} + """.stripMargin) + }.getMessage + assert(m.contains("Found duplicate column(s) when inserting into")) + } + } + } + } + } + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/ParquetPartitioningTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ParquetPartitioningTest.scala new file mode 100644 index 000000000000..2ae3cf4b38f0 --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ParquetPartitioningTest.scala @@ -0,0 +1,252 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive + +import java.io.File + +import org.apache.spark.sql._ +import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.sql.test.SQLTestUtils +import org.apache.spark.util.Utils + +// The data where the partitioning key exists only in the directory structure. 
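The SPARK-25389 tests added to InsertSuite above are generated by nesting Seq(...).foreach loops around test(...), which is the usual ScalaTest way of registering one named test per combination of parameters at suite construction time. Stripped of the Hive specifics, the pattern looks like this (suite name and test body are placeholders):

    import org.scalatest.FunSuite

    class CombinationSuite extends FunSuite {
      // One registered test per (mode, caseSensitive, format) combination,
      // each with a descriptive name so a failure pinpoints the exact parameters.
      Seq("LOCAL", "").foreach { mode =>
        Seq(true, false).foreach { caseSensitive =>
          Seq("orc", "parquet").foreach { format =>
            test(s"insert overwrite (mode=$mode, caseSensitive=$caseSensitive, format=$format)") {
              // Placeholder body; the real suite issues the INSERT OVERWRITE DIRECTORY
              // statement and asserts on the AnalysisException message.
              assert(format.nonEmpty)
            }
          }
        }
      }
    }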
+case class ParquetData(intField: Int, stringField: String) +// The data that also includes the partitioning key +case class ParquetDataWithKey(p: Int, intField: Int, stringField: String) + +case class StructContainer(intStructField: Int, stringStructField: String) + +case class ParquetDataWithComplexTypes( + intField: Int, + stringField: String, + structField: StructContainer, + arrayField: Seq[Int]) + +case class ParquetDataWithKeyAndComplexTypes( + p: Int, + intField: Int, + stringField: String, + structField: StructContainer, + arrayField: Seq[Int]) + +/** + * A collection of tests for parquet data with various forms of partitioning. + */ +abstract class ParquetPartitioningTest extends QueryTest with SQLTestUtils with TestHiveSingleton { + import testImplicits._ + + var partitionedTableDir: File = null + var normalTableDir: File = null + var partitionedTableDirWithKey: File = null + var partitionedTableDirWithComplexTypes: File = null + var partitionedTableDirWithKeyAndComplexTypes: File = null + + override def beforeAll(): Unit = { + super.beforeAll() + partitionedTableDir = Utils.createTempDir() + normalTableDir = Utils.createTempDir() + + (1 to 10).foreach { p => + val partDir = new File(partitionedTableDir, s"p=$p") + sparkContext.makeRDD(1 to 10) + .map(i => ParquetData(i, s"part-$p")) + .toDF() + .write.parquet(partDir.getCanonicalPath) + } + + sparkContext + .makeRDD(1 to 10) + .map(i => ParquetData(i, s"part-1")) + .toDF() + .write.parquet(new File(normalTableDir, "normal").getCanonicalPath) + + partitionedTableDirWithKey = Utils.createTempDir() + + (1 to 10).foreach { p => + val partDir = new File(partitionedTableDirWithKey, s"p=$p") + sparkContext.makeRDD(1 to 10) + .map(i => ParquetDataWithKey(p, i, s"part-$p")) + .toDF() + .write.parquet(partDir.getCanonicalPath) + } + + partitionedTableDirWithKeyAndComplexTypes = Utils.createTempDir() + + (1 to 10).foreach { p => + val partDir = new File(partitionedTableDirWithKeyAndComplexTypes, s"p=$p") + sparkContext.makeRDD(1 to 10).map { i => + ParquetDataWithKeyAndComplexTypes( + p, i, s"part-$p", StructContainer(i, f"${i}_string"), 1 to i) + }.toDF().write.parquet(partDir.getCanonicalPath) + } + + partitionedTableDirWithComplexTypes = Utils.createTempDir() + + (1 to 10).foreach { p => + val partDir = new File(partitionedTableDirWithComplexTypes, s"p=$p") + sparkContext.makeRDD(1 to 10).map { i => + ParquetDataWithComplexTypes(i, s"part-$p", StructContainer(i, f"${i}_string"), 1 to i) + }.toDF().write.parquet(partDir.getCanonicalPath) + } + } + + override protected def afterAll(): Unit = { + try { + partitionedTableDir.delete() + normalTableDir.delete() + partitionedTableDirWithKey.delete() + partitionedTableDirWithComplexTypes.delete() + partitionedTableDirWithKeyAndComplexTypes.delete() + } finally { + super.afterAll() + } + } + + /** + * Drop named tables if they exist + * + * @param tableNames tables to drop + */ + def dropTables(tableNames: String*): Unit = { + tableNames.foreach { name => + sql(s"DROP TABLE IF EXISTS $name") + } + } + + Seq( + "partitioned_parquet", + "partitioned_parquet_with_key", + "partitioned_parquet_with_complextypes", + "partitioned_parquet_with_key_and_complextypes").foreach { table => + + test(s"ordering of the partitioning columns $table") { + checkAnswer( + sql(s"SELECT p, stringField FROM $table WHERE p = 1"), + Seq.fill(10)(Row(1, "part-1")) + ) + + checkAnswer( + sql(s"SELECT stringField, p FROM $table WHERE p = 1"), + Seq.fill(10)(Row("part-1", 1)) + ) + } + + test(s"project the partitioning 
column $table") { + checkAnswer( + sql(s"SELECT p, count(*) FROM $table group by p"), + Row(1, 10) :: + Row(2, 10) :: + Row(3, 10) :: + Row(4, 10) :: + Row(5, 10) :: + Row(6, 10) :: + Row(7, 10) :: + Row(8, 10) :: + Row(9, 10) :: + Row(10, 10) :: Nil + ) + } + + test(s"project partitioning and non-partitioning columns $table") { + checkAnswer( + sql(s"SELECT stringField, p, count(intField) FROM $table GROUP BY p, stringField"), + Row("part-1", 1, 10) :: + Row("part-2", 2, 10) :: + Row("part-3", 3, 10) :: + Row("part-4", 4, 10) :: + Row("part-5", 5, 10) :: + Row("part-6", 6, 10) :: + Row("part-7", 7, 10) :: + Row("part-8", 8, 10) :: + Row("part-9", 9, 10) :: + Row("part-10", 10, 10) :: Nil + ) + } + + test(s"simple count $table") { + checkAnswer( + sql(s"SELECT COUNT(*) FROM $table"), + Row(100)) + } + + test(s"pruned count $table") { + checkAnswer( + sql(s"SELECT COUNT(*) FROM $table WHERE p = 1"), + Row(10)) + } + + test(s"non-existent partition $table") { + checkAnswer( + sql(s"SELECT COUNT(*) FROM $table WHERE p = 1000"), + Row(0)) + } + + test(s"multi-partition pruned count $table") { + checkAnswer( + sql(s"SELECT COUNT(*) FROM $table WHERE p IN (1,2,3)"), + Row(30)) + } + + test(s"non-partition predicates $table") { + checkAnswer( + sql(s"SELECT COUNT(*) FROM $table WHERE intField IN (1,2,3)"), + Row(30)) + } + + test(s"sum $table") { + checkAnswer( + sql(s"SELECT SUM(intField) FROM $table WHERE intField IN (1,2,3) AND p = 1"), + Row(1 + 2 + 3)) + } + + test(s"hive udfs $table") { + checkAnswer( + sql(s"SELECT concat(stringField, stringField) FROM $table"), + sql(s"SELECT stringField FROM $table").rdd.map { + case Row(s: String) => Row(s + s) + }.collect().toSeq) + } + } + + Seq( + "partitioned_parquet_with_key_and_complextypes", + "partitioned_parquet_with_complextypes").foreach { table => + + test(s"SPARK-5775 read struct from $table") { + checkAnswer( + sql( + s""" + |SELECT p, structField.intStructField, structField.stringStructField + |FROM $table WHERE p = 1 + """.stripMargin), + (1 to 10).map(i => Row(1, i, f"${i}_string"))) + } + + test(s"SPARK-5775 read array from $table") { + checkAnswer( + sql(s"SELECT arrayField, p FROM $table WHERE p = 1"), + (1 to 10).map(i => Row((1 to i).toArray, 1))) + } + } + + test("non-part select(*)") { + checkAnswer( + sql("SELECT COUNT(*) FROM normal_parquet"), + Row(10)) + } +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala index d8ffb29a5931..db2024e8b5d1 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.hive import java.io.{File, PrintWriter} import java.sql.Timestamp +import java.util.Locale import scala.reflect.ClassTag import scala.util.matching.Regex @@ -32,7 +33,7 @@ import org.apache.spark.sql.catalyst.analysis.NoSuchPartitionException import org.apache.spark.sql.catalyst.catalog.{CatalogColumnStat, CatalogStatistics, HiveTableRelation} import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, HistogramBin, HistogramSerializer} import org.apache.spark.sql.catalyst.util.{DateTimeUtils, StringUtils} -import org.apache.spark.sql.execution.command.{CommandUtils, DDLUtils} +import org.apache.spark.sql.execution.command.{AnalyzeColumnCommand, CommandUtils, DDLUtils} import org.apache.spark.sql.execution.datasources.LogicalRelation import 
org.apache.spark.sql.execution.joins._ import org.apache.spark.sql.hive.HiveExternalCatalog._ @@ -489,7 +490,8 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto sql(s"ANALYZE TABLE $tableName PARTITION (DS='2010-01-01') COMPUTE STATISTICS") }.getMessage assert(message.contains( - s"DS is not a valid partition column in table `default`.`${tableName.toLowerCase}`")) + "DS is not a valid partition column in table " + + s"`default`.`${tableName.toLowerCase(Locale.ROOT)}`")) } } } @@ -503,8 +505,9 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto sql(s"ANALYZE TABLE $tableName $partitionSpec COMPUTE STATISTICS") }.getMessage assert(message.contains("The list of partition columns with values " + - s"in partition specification for table '${tableName.toLowerCase}' in database 'default' " + - "is not a prefix of the list of partition columns defined in the table schema")) + s"in partition specification for table '${tableName.toLowerCase(Locale.ROOT)}' in " + + "database 'default' is not a prefix of the list of partition columns defined in " + + "the table schema")) } withTable(tableName) { @@ -550,12 +553,14 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto assertAnalysisException( s"ANALYZE TABLE $tableName PARTITION (hour=20) COMPUTE STATISTICS", - s"hour is not a valid partition column in table `default`.`${tableName.toLowerCase}`" + "hour is not a valid partition column in table " + + s"`default`.`${tableName.toLowerCase(Locale.ROOT)}`" ) assertAnalysisException( s"ANALYZE TABLE $tableName PARTITION (hour) COMPUTE STATISTICS", - s"hour is not a valid partition column in table `default`.`${tableName.toLowerCase}`" + "hour is not a valid partition column in table " + + s"`default`.`${tableName.toLowerCase(Locale.ROOT)}`" ) intercept[NoSuchPartitionException] { @@ -653,6 +658,51 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto } } + test("collecting statistics for all columns") { + val table = "update_col_stats_table" + withTable(table) { + sql(s"CREATE TABLE $table (c1 INT, c2 STRING, c3 DOUBLE)") + sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR ALL COLUMNS") + val fetchedStats0 = + checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = Some(0)) + assert(fetchedStats0.get.colStats == Map( + "c1" -> CatalogColumnStat(distinctCount = Some(0), min = None, max = None, + nullCount = Some(0), avgLen = Some(4), maxLen = Some(4)), + "c3" -> CatalogColumnStat(distinctCount = Some(0), min = None, max = None, + nullCount = Some(0), avgLen = Some(8), maxLen = Some(8)), + "c2" -> CatalogColumnStat(distinctCount = Some(0), min = None, max = None, + nullCount = Some(0), avgLen = Some(20), maxLen = Some(20)))) + + // Insert new data and analyze: have the latest column stats. 
+ sql(s"INSERT INTO TABLE $table SELECT 1, 'a', 10.0") + sql(s"INSERT INTO TABLE $table SELECT 1, 'b', null") + + sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR ALL COLUMNS") + val fetchedStats1 = + checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = Some(2)) + assert(fetchedStats1.get.colStats == Map( + "c1" -> CatalogColumnStat(distinctCount = Some(1), min = Some("1"), max = Some("1"), + nullCount = Some(0), avgLen = Some(4), maxLen = Some(4)), + "c3" -> CatalogColumnStat(distinctCount = Some(1), min = Some("10.0"), max = Some("10.0"), + nullCount = Some(1), avgLen = Some(8), maxLen = Some(8)), + "c2" -> CatalogColumnStat(distinctCount = Some(2), min = None, max = None, + nullCount = Some(0), avgLen = Some(1), maxLen = Some(1)))) + } + } + + test("analyze column command paramaters validation") { + val e1 = intercept[IllegalArgumentException] { + AnalyzeColumnCommand(TableIdentifier("test"), Option(Seq("c1")), true).run(spark) + } + assert(e1.getMessage.contains("Parameter `columnNames` or `allColumns` are" + + " mutually exclusive")) + val e2 = intercept[IllegalArgumentException] { + AnalyzeColumnCommand(TableIdentifier("test"), None, false).run(spark) + } + assert(e1.getMessage.contains("Parameter `columnNames` or `allColumns` are" + + " mutually exclusive")) + } + private def createNonPartitionedTable( tabName: String, analyzedBySpark: Boolean = true, diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/UDFSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/UDFSuite.scala index 88cc42efd0fe..d567128e1a32 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/UDFSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/UDFSuite.scala @@ -141,11 +141,10 @@ class UDFSuite withTempDatabase { dbName => withUserDefinedFunction(functionName -> false) { sql(s"CREATE FUNCTION $dbName.$functionName AS '$functionClass'") - // TODO: Re-enable it after can distinguish qualified and unqualified function name - // checkAnswer( - // sql(s"SELECT $dbName.myuPPer(value) from $testTableName"), - // expectedDF - // ) + checkAnswer( + sql(s"SELECT $dbName.$functionName(value) from $testTableName"), + expectedDF + ) checkAnswer( sql(s"SHOW FUNCTIONS like $dbName.$functionNameUpper"), @@ -174,11 +173,10 @@ class UDFSuite // For this block, drop function command uses default.functionName as the function name. 
withUserDefinedFunction(s"$dbName.$functionNameUpper" -> false) { sql(s"CREATE FUNCTION $dbName.$functionName AS '$functionClass'") - // TODO: Re-enable it after can distinguish qualified and unqualified function name - // checkAnswer( - // sql(s"SELECT $dbName.myupper(value) from $testTableName"), - // expectedDF - // ) + checkAnswer( + sql(s"SELECT $dbName.$functionName(value) from $testTableName"), + expectedDF + ) sql(s"USE $dbName") @@ -195,4 +193,20 @@ class UDFSuite } } } + + test("SPARK-21318: The correct exception message should be thrown " + + "if a UDF/UDAF has already been registered") { + val functionName = "empty" + val functionClass = classOf[org.apache.spark.sql.hive.execution.UDAFEmpty].getCanonicalName + + withUserDefinedFunction(functionName -> false) { + sql(s"CREATE FUNCTION $functionName AS '$functionClass'") + + val e = intercept[AnalysisException] { + sql(s"SELECT $functionName(value) from $testTableName") + } + + assert(e.getMessage.contains("Can not get an evaluator of the empty UDAF")) + } + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientSuite.scala index fa9f753795f6..7a325bf26b4c 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientSuite.scala @@ -32,7 +32,7 @@ class HiveClientSuite(version: String) private val tryDirectSqlKey = HiveConf.ConfVars.METASTORE_TRY_DIRECT_SQL.varname - private val testPartitionCount = 3 * 24 * 4 + private val testPartitionCount = 3 * 5 * 4 private def init(tryDirectSql: Boolean): HiveClient = { val storageFormat = CatalogStorageFormat( @@ -51,7 +51,7 @@ class HiveClientSuite(version: String) val partitions = for { ds <- 20170101 to 20170103 - h <- 0 to 23 + h <- 0 to 4 chunk <- Seq("aa", "ab", "ba", "bb") } yield CatalogTablePartition(Map( "ds" -> ds.toString, @@ -92,7 +92,7 @@ class HiveClientSuite(version: String) testMetastorePartitionFiltering( attr("ds") <=> 20170101, 20170101 to 20170103, - 0 to 23, + 0 to 4, "aa" :: "ab" :: "ba" :: "bb" :: Nil) } @@ -100,7 +100,7 @@ class HiveClientSuite(version: String) testMetastorePartitionFiltering( attr("ds") === 20170101, 20170101 to 20170101, - 0 to 23, + 0 to 4, "aa" :: "ab" :: "ba" :: "bb" :: Nil) } @@ -118,7 +118,7 @@ class HiveClientSuite(version: String) testMetastorePartitionFiltering( attr("chunk") === "aa", 20170101 to 20170103, - 0 to 23, + 0 to 4, "aa" :: Nil) } @@ -126,7 +126,7 @@ class HiveClientSuite(version: String) testMetastorePartitionFiltering( attr("chunk").cast(IntegerType) === 1, 20170101 to 20170103, - 0 to 23, + 0 to 4, "aa" :: "ab" :: "ba" :: "bb" :: Nil) } @@ -134,7 +134,7 @@ class HiveClientSuite(version: String) testMetastorePartitionFiltering( attr("chunk").cast(BooleanType) === true, 20170101 to 20170103, - 0 to 23, + 0 to 4, "aa" :: "ab" :: "ba" :: "bb" :: Nil) } @@ -142,23 +142,23 @@ class HiveClientSuite(version: String) testMetastorePartitionFiltering( Literal(20170101) === attr("ds"), 20170101 to 20170101, - 0 to 23, + 0 to 4, "aa" :: "ab" :: "ba" :: "bb" :: Nil) } - test("getPartitionsByFilter: ds=20170101 and h=10") { + test("getPartitionsByFilter: ds=20170101 and h=2") { testMetastorePartitionFiltering( - attr("ds") === 20170101 && attr("h") === 10, + attr("ds") === 20170101 && attr("h") === 2, 20170101 to 20170101, - 10 to 10, + 2 to 2, "aa" :: "ab" :: "ba" :: "bb" :: Nil) } - test("getPartitionsByFilter: cast(ds as 
long)=20170101L and h=10") { + test("getPartitionsByFilter: cast(ds as long)=20170101L and h=2") { testMetastorePartitionFiltering( - attr("ds").cast(LongType) === 20170101L && attr("h") === 10, + attr("ds").cast(LongType) === 20170101L && attr("h") === 2, 20170101 to 20170101, - 10 to 10, + 2 to 2, "aa" :: "ab" :: "ba" :: "bb" :: Nil) } @@ -166,7 +166,7 @@ class HiveClientSuite(version: String) testMetastorePartitionFiltering( attr("ds") === 20170101 || attr("ds") === 20170102, 20170101 to 20170102, - 0 to 23, + 0 to 4, "aa" :: "ab" :: "ba" :: "bb" :: Nil) } @@ -174,7 +174,7 @@ class HiveClientSuite(version: String) testMetastorePartitionFiltering( attr("ds").in(20170102, 20170103), 20170102 to 20170103, - 0 to 23, + 0 to 4, "aa" :: "ab" :: "ba" :: "bb" :: Nil) } @@ -182,7 +182,7 @@ class HiveClientSuite(version: String) testMetastorePartitionFiltering( attr("ds").cast(LongType).in(20170102L, 20170103L), 20170102 to 20170103, - 0 to 23, + 0 to 4, "aa" :: "ab" :: "ba" :: "bb" :: Nil) } @@ -190,7 +190,7 @@ class HiveClientSuite(version: String) testMetastorePartitionFiltering( attr("ds").in(20170102, 20170103), 20170102 to 20170103, - 0 to 23, + 0 to 4, "aa" :: "ab" :: "ba" :: "bb" :: Nil, { case expr @ In(v, list) if expr.inSetConvertible => InSet(v, list.map(_.eval(EmptyRow)).toSet) @@ -202,7 +202,7 @@ class HiveClientSuite(version: String) testMetastorePartitionFiltering( attr("ds").cast(LongType).in(20170102L, 20170103L), 20170102 to 20170103, - 0 to 23, + 0 to 4, "aa" :: "ab" :: "ba" :: "bb" :: Nil, { case expr @ In(v, list) if expr.inSetConvertible => InSet(v, list.map(_.eval(EmptyRow)).toSet) @@ -213,7 +213,7 @@ class HiveClientSuite(version: String) testMetastorePartitionFiltering( attr("chunk").in("ab", "ba"), 20170101 to 20170103, - 0 to 23, + 0 to 4, "ab" :: "ba" :: Nil) } @@ -221,34 +221,34 @@ class HiveClientSuite(version: String) testMetastorePartitionFiltering( attr("chunk").in("ab", "ba"), 20170101 to 20170103, - 0 to 23, + 0 to 4, "ab" :: "ba" :: Nil, { case expr @ In(v, list) if expr.inSetConvertible => InSet(v, list.map(_.eval(EmptyRow)).toSet) }) } - test("getPartitionsByFilter: (ds=20170101 and h>=8) or (ds=20170102 and h<8)") { - val day1 = (20170101 to 20170101, 8 to 23, Seq("aa", "ab", "ba", "bb")) - val day2 = (20170102 to 20170102, 0 to 7, Seq("aa", "ab", "ba", "bb")) - testMetastorePartitionFiltering((attr("ds") === 20170101 && attr("h") >= 8) || - (attr("ds") === 20170102 && attr("h") < 8), day1 :: day2 :: Nil) + test("getPartitionsByFilter: (ds=20170101 and h>=2) or (ds=20170102 and h<2)") { + val day1 = (20170101 to 20170101, 2 to 4, Seq("aa", "ab", "ba", "bb")) + val day2 = (20170102 to 20170102, 0 to 1, Seq("aa", "ab", "ba", "bb")) + testMetastorePartitionFiltering((attr("ds") === 20170101 && attr("h") >= 2) || + (attr("ds") === 20170102 && attr("h") < 2), day1 :: day2 :: Nil) } - test("getPartitionsByFilter: (ds=20170101 and h>=8) or (ds=20170102 and h<(7+1))") { - val day1 = (20170101 to 20170101, 8 to 23, Seq("aa", "ab", "ba", "bb")) + test("getPartitionsByFilter: (ds=20170101 and h>=2) or (ds=20170102 and h<(1+1))") { + val day1 = (20170101 to 20170101, 2 to 4, Seq("aa", "ab", "ba", "bb")) // Day 2 should include all hours because we can't build a filter for h<(7+1) - val day2 = (20170102 to 20170102, 0 to 23, Seq("aa", "ab", "ba", "bb")) - testMetastorePartitionFiltering((attr("ds") === 20170101 && attr("h") >= 8) || - (attr("ds") === 20170102 && attr("h") < (Literal(7) + 1)), day1 :: day2 :: Nil) + val day2 = (20170102 to 20170102, 0 to 4, Seq("aa", 
"ab", "ba", "bb")) + testMetastorePartitionFiltering((attr("ds") === 20170101 && attr("h") >= 2) || + (attr("ds") === 20170102 && attr("h") < (Literal(1) + 1)), day1 :: day2 :: Nil) } test("getPartitionsByFilter: " + - "chunk in ('ab', 'ba') and ((ds=20170101 and h>=8) or (ds=20170102 and h<8))") { - val day1 = (20170101 to 20170101, 8 to 23, Seq("ab", "ba")) - val day2 = (20170102 to 20170102, 0 to 7, Seq("ab", "ba")) + "chunk in ('ab', 'ba') and ((ds=20170101 and h>=2) or (ds=20170102 and h<2))") { + val day1 = (20170101 to 20170101, 2 to 4, Seq("ab", "ba")) + val day2 = (20170102 to 20170102, 0 to 1, Seq("ab", "ba")) testMetastorePartitionFiltering(attr("chunk").in("ab", "ba") && - ((attr("ds") === 20170101 && attr("h") >= 8) || (attr("ds") === 20170102 && attr("h") < 8)), + ((attr("ds") === 20170101 && attr("h") >= 2) || (attr("ds") === 20170102 && attr("h") < 2)), day1 :: day2 :: Nil) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala index 9acd5e1c248e..fd38944a5dd2 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala @@ -72,7 +72,7 @@ class HiveCatalogedDDLSuite extends DDLSuite with TestHiveSingleton with BeforeA outputFormat = serde.get.outputFormat, serde = serde.get.serde, compressed = false, - properties = Map("serialization.format" -> "1")) + properties = Map.empty) } else { CatalogStorageFormat( locationUri = Some(catalog.defaultTablePath(name)), @@ -803,6 +803,25 @@ class HiveDDLSuite } } + test("SPARK-25313 Insert overwrite directory should output correct schema") { + withSQLConf(CONVERT_METASTORE_PARQUET.key -> "false") { + withTable("tbl") { + withView("view1") { + spark.sql("CREATE TABLE tbl(id long)") + spark.sql("INSERT OVERWRITE TABLE tbl VALUES 4") + spark.sql("CREATE VIEW view1 AS SELECT id FROM tbl") + withTempPath { path => + spark.sql(s"INSERT OVERWRITE LOCAL DIRECTORY '${path.getCanonicalPath}' " + + "STORED AS PARQUET SELECT ID FROM view1") + val expectedSchema = StructType(Seq(StructField("ID", LongType, true))) + assert(spark.read.parquet(path.toString).schema == expectedSchema) + checkAnswer(spark.read.parquet(path.toString), Seq(Row(4))) + } + } + } + } + } + test("alter table partition - storage information") { sql("CREATE TABLE boxes (height INT, length INT) PARTITIONED BY (width INT)") sql("INSERT OVERWRITE TABLE boxes PARTITION (width=4) SELECT 4, 4") @@ -2329,4 +2348,26 @@ class HiveDDLSuite } } } + + test("desc formatted table should also show viewOriginalText for views") { + withView("v1", "v2") { + sql("CREATE VIEW v1 AS SELECT 1 AS value") + assert(sql("DESC FORMATTED v1").collect().containsSlice( + Seq( + Row("Type", "VIEW", ""), + Row("View Text", "SELECT 1 AS value", ""), + Row("View Original Text", "SELECT 1 AS value", "") + ) + )) + + hiveClient.runSqlHive("CREATE VIEW v2 AS SELECT * FROM (SELECT 1) T") + assert(sql("DESC FORMATTED v2").collect().containsSlice( + Seq( + Row("Type", "VIEW", ""), + Row("View Text", "SELECT `t`.`_c0` FROM (SELECT 1) `T`", ""), + Row("View Original Text", "SELECT * FROM (SELECT 1) T", "") + ) + )) + } + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDAFSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDAFSuite.scala index 7402c9626873..fe3deceb0806 100644 --- 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDAFSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDAFSuite.scala @@ -37,6 +37,7 @@ class HiveUDAFSuite extends QueryTest with TestHiveSingleton with SQLTestUtils { import testImplicits._ protected override def beforeAll(): Unit = { + super.beforeAll() sql(s"CREATE TEMPORARY FUNCTION mock AS '${classOf[MockUDAF].getName}'") sql(s"CREATE TEMPORARY FUNCTION hive_max AS '${classOf[GenericUDAFMax].getName}'") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ObjectHashAggregateSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ObjectHashAggregateSuite.scala index 8dbcd24cd78d..0ef630bbd367 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ObjectHashAggregateSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ObjectHashAggregateSuite.scala @@ -43,6 +43,7 @@ class ObjectHashAggregateSuite import testImplicits._ protected override def beforeAll(): Unit = { + super.beforeAll() sql(s"CREATE TEMPORARY FUNCTION hive_max AS '${classOf[GenericUDAFMax].getName}'") } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruningSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruningSuite.scala index 16541295eb45..cc592cf6ca62 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruningSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruningSuite.scala @@ -22,29 +22,21 @@ import scala.collection.JavaConverters._ import org.scalatest.BeforeAndAfter import org.apache.spark.sql.hive.test.{TestHive, TestHiveQueryExecution} -import org.apache.spark.sql.internal.SQLConf /** * A set of test cases that validate partition and column pruning. */ class PruningSuite extends HiveComparisonTest with BeforeAndAfter { - private val originalLimitFlatGlobalLimit = TestHive.conf.limitFlatGlobalLimit - override def beforeAll(): Unit = { super.beforeAll() TestHive.setCacheTables(false) - TestHive.setConf(SQLConf.LIMIT_FLAT_GLOBAL_LIMIT, false) // Column/partition pruning is not implemented for `InMemoryColumnarTableScan` yet, // need to reset the environment to ensure all referenced tables in this suites are // not cached in-memory. Refer to https://issues.apache.org/jira/browse/SPARK-2283 // for details. 
TestHive.reset() } - override def afterAll() { - TestHive.setConf(SQLConf.LIMIT_FLAT_GLOBAL_LIMIT, originalLimitFlatGlobalLimit) - super.afterAll() - } // Column pruning tests diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 20c4c36c0509..e49aea267026 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -1916,6 +1916,21 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } } + test("SPARK-23425 Test LOAD DATA LOCAL INPATH with space in file name") { + withTempDir { dir => + val path = dir.toURI.toString.stripSuffix("/") + val dirPath = dir.getAbsoluteFile + for (i <- 1 to 3) { + Files.write(s"$i", new File(dirPath, s"part-r-0000 $i"), StandardCharsets.UTF_8) + } + withTable("load_t") { + sql("CREATE TABLE load_t (a STRING)") + sql(s"LOAD DATA LOCAL INPATH '$path/part-r-0000 1' INTO TABLE load_t") + checkAnswer(sql("SELECT * FROM load_t"), Seq(Row("1"))) + } + } + } + test("Support wildcard character in folderlevel for LOAD DATA LOCAL INPATH") { withTempDir { dir => val path = dir.toURI.toString.stripSuffix("/") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala index d84f9a382820..7fefaf53939b 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala @@ -23,6 +23,7 @@ import org.apache.spark.sql.{AnalysisException, Row} import org.apache.spark.sql.TestingUDT.{IntervalData, IntervalUDT} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.execution.datasources.orc.OrcSuite +import org.apache.spark.sql.hive.HiveUtils import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.HiveSerDe import org.apache.spark.sql.types._ @@ -173,4 +174,20 @@ class HiveOrcSourceSuite extends OrcSuite with TestHiveSingleton { assert(msg.contains("ORC data source does not support calendarinterval data type.")) } } + + test("Check BloomFilter creation") { + Seq(true, false).foreach { convertMetastore => + withSQLConf(HiveUtils.CONVERT_METASTORE_ORC.key -> s"$convertMetastore") { + testBloomFilterCreation(org.apache.orc.OrcProto.Stream.Kind.BLOOM_FILTER) // Before ORC-101 + } + } + } + + test("Enforce direct encoding column-wise selectively") { + Seq(true, false).foreach { convertMetastore => + withSQLConf(HiveUtils.CONVERT_METASTORE_ORC.key -> s"$convertMetastore") { + testSelectiveDictionaryEncoding(isSelective = false) + } + } + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala index bf6efa7c4c08..870ad4818eb2 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala @@ -19,22 +19,29 @@ package org.apache.spark.sql.hive.orc import java.io.File -import scala.util.{Random, Try} +import scala.util.Random import org.apache.spark.SparkConf +import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} import org.apache.spark.sql.{DataFrame, SparkSession} +import 
org.apache.spark.sql.catalyst.plans.SQLHelper import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ -import org.apache.spark.util.{Benchmark, Utils} - /** * Benchmark to measure ORC read performance. + * {{{ + * To run this benchmark: + * 1. without sbt: bin/spark-submit --class + * 2. build/sbt "sql/test:runMain " + * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain " + * Results will be written to "benchmarks/OrcReadBenchmark-results.txt". + * }}} * * This is in `sql/hive` module in order to compare `sql/core` and `sql/hive` ORC data sources. */ // scalastyle:off line.size.limit -object OrcReadBenchmark { +object OrcReadBenchmark extends BenchmarkBase with SQLHelper { val conf = new SparkConf() conf.set("orc.compression", "snappy") @@ -47,28 +54,10 @@ object OrcReadBenchmark { // Set default configs. Individual cases will change them if necessary. spark.conf.set(SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key, "true") - def withTempPath(f: File => Unit): Unit = { - val path = Utils.createTempDir() - path.delete() - try f(path) finally Utils.deleteRecursively(path) - } - def withTempTable(tableNames: String*)(f: => Unit): Unit = { try f finally tableNames.foreach(spark.catalog.dropTempView) } - def withSQLConf(pairs: (String, String)*)(f: => Unit): Unit = { - val (keys, values) = pairs.unzip - val currentValues = keys.map(key => Try(spark.conf.get(key)).toOption) - (keys, values).zipped.foreach(spark.conf.set) - try f finally { - keys.zip(currentValues).foreach { - case (key, Some(value)) => spark.conf.set(key, value) - case (key, None) => spark.conf.unset(key) - } - } - } - private val NATIVE_ORC_FORMAT = classOf[org.apache.spark.sql.execution.datasources.orc.OrcFileFormat].getCanonicalName private val HIVE_ORC_FORMAT = classOf[org.apache.spark.sql.hive.orc.OrcFileFormat].getCanonicalName @@ -86,7 +75,7 @@ object OrcReadBenchmark { } def numericScanBenchmark(values: Int, dataType: DataType): Unit = { - val benchmark = new Benchmark(s"SQL Single ${dataType.sql} Column Scan", values) + val benchmark = new Benchmark(s"SQL Single ${dataType.sql} Column Scan", values, output = output) withTempPath { dir => withTempTable("t1", "nativeOrcTable", "hiveOrcTable") { @@ -115,59 +104,13 @@ object OrcReadBenchmark { spark.sql("SELECT sum(id) FROM hiveOrcTable").collect() } - /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1 - Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz - - SQL Single TINYINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Native ORC MR 1135 / 1171 13.9 72.2 1.0X - Native ORC Vectorized 152 / 163 103.4 9.7 7.5X - Native ORC Vectorized with copy 149 / 162 105.4 9.5 7.6X - Hive built-in ORC 1380 / 1384 11.4 87.7 0.8X - - SQL Single SMALLINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Native ORC MR 1182 / 1244 13.3 75.2 1.0X - Native ORC Vectorized 145 / 156 108.7 9.2 8.2X - Native ORC Vectorized with copy 148 / 158 106.4 9.4 8.0X - Hive built-in ORC 1591 / 1636 9.9 101.2 0.7X - - SQL Single INT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Native ORC MR 1271 / 1271 12.4 80.8 1.0X - Native ORC Vectorized 206 / 212 76.3 13.1 6.2X - Native ORC Vectorized with copy 200 / 213 
78.8 12.7 6.4X - Hive built-in ORC 1776 / 1787 8.9 112.9 0.7X - - SQL Single BIGINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Native ORC MR 1344 / 1355 11.7 85.4 1.0X - Native ORC Vectorized 258 / 268 61.0 16.4 5.2X - Native ORC Vectorized with copy 252 / 257 62.4 16.0 5.3X - Hive built-in ORC 1818 / 1823 8.7 115.6 0.7X - - SQL Single FLOAT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Native ORC MR 1333 / 1352 11.8 84.8 1.0X - Native ORC Vectorized 310 / 324 50.7 19.7 4.3X - Native ORC Vectorized with copy 312 / 320 50.4 19.9 4.3X - Hive built-in ORC 1904 / 1918 8.3 121.0 0.7X - - SQL Single DOUBLE Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Native ORC MR 1408 / 1585 11.2 89.5 1.0X - Native ORC Vectorized 359 / 368 43.8 22.8 3.9X - Native ORC Vectorized with copy 364 / 371 43.2 23.2 3.9X - Hive built-in ORC 1881 / 1954 8.4 119.6 0.7X - */ benchmark.run() } } } def intStringScanBenchmark(values: Int): Unit = { - val benchmark = new Benchmark("Int and String Scan", values) + val benchmark = new Benchmark("Int and String Scan", values, output = output) withTempPath { dir => withTempTable("t1", "nativeOrcTable", "hiveOrcTable") { @@ -198,24 +141,13 @@ object OrcReadBenchmark { spark.sql("SELECT sum(c1), sum(length(c2)) FROM hiveOrcTable").collect() } - /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1 - Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz - - Int and String Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Native ORC MR 2566 / 2592 4.1 244.7 1.0X - Native ORC Vectorized 1098 / 1113 9.6 104.7 2.3X - Native ORC Vectorized with copy 1527 / 1593 6.9 145.6 1.7X - Hive built-in ORC 3561 / 3705 2.9 339.6 0.7X - */ benchmark.run() } } } def partitionTableScanBenchmark(values: Int): Unit = { - val benchmark = new Benchmark("Partitioned Table", values) + val benchmark = new Benchmark("Partitioned Table", values, output = output) withTempPath { dir => withTempTable("t1", "nativeOrcTable", "hiveOrcTable") { @@ -284,32 +216,13 @@ object OrcReadBenchmark { spark.sql("SELECT sum(p), sum(id) FROM hiveOrcTable").collect() } - /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1 - Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz - - Partitioned Table: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Data only - Native ORC MR 1447 / 1457 10.9 92.0 1.0X - Data only - Native ORC Vectorized 256 / 266 61.4 16.3 5.6X - Data only - Native ORC Vectorized with copy 263 / 273 59.8 16.7 5.5X - Data only - Hive built-in ORC 1960 / 1988 8.0 124.6 0.7X - Partition only - Native ORC MR 1039 / 1043 15.1 66.0 1.4X - Partition only - Native ORC Vectorized 48 / 53 326.6 3.1 30.1X - Partition only - Native ORC Vectorized with copy 48 / 53 328.4 3.0 30.2X - Partition only - Hive built-in ORC 1234 / 1242 12.7 78.4 1.2X - Both columns - Native ORC MR 1465 / 1475 10.7 93.1 1.0X - Both columns - Native ORC Vectorized 292 / 301 53.9 18.6 5.0X - Both column - Native ORC Vectorized with copy 348 / 354 45.1 22.2 4.2X - Both columns - Hive 
built-in ORC 2051 / 2060 7.7 130.4 0.7X - */ benchmark.run() } } } def repeatedStringScanBenchmark(values: Int): Unit = { - val benchmark = new Benchmark("Repeated String", values) + val benchmark = new Benchmark("Repeated String", values, output = output) withTempPath { dir => withTempTable("t1", "nativeOrcTable", "hiveOrcTable") { @@ -337,17 +250,6 @@ object OrcReadBenchmark { spark.sql("SELECT sum(length(c1)) FROM hiveOrcTable").collect() } - /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1 - Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz - - Repeated String: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Native ORC MR 1271 / 1278 8.3 121.2 1.0X - Native ORC Vectorized 200 / 212 52.4 19.1 6.4X - Native ORC Vectorized with copy 342 / 347 30.7 32.6 3.7X - Hive built-in ORC 1874 / 2105 5.6 178.7 0.7X - */ benchmark.run() } } @@ -364,7 +266,8 @@ object OrcReadBenchmark { s"SELECT IF(RAND(1) < $fractionOfNulls, NULL, CAST(id as STRING)) AS c1, " + s"IF(RAND(2) < $fractionOfNulls, NULL, CAST(id as STRING)) AS c2 FROM t1")) - val benchmark = new Benchmark(s"String with Nulls Scan ($fractionOfNulls%)", values) + val benchmark = + new Benchmark(s"String with Nulls Scan ($fractionOfNulls%)", values, output = output) benchmark.addCase("Native ORC MR") { _ => withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { @@ -390,38 +293,13 @@ object OrcReadBenchmark { "WHERE c1 IS NOT NULL AND c2 IS NOT NULL").collect() } - /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1 - Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz - - String with Nulls Scan (0.0%): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Native ORC MR 2394 / 2886 4.4 228.3 1.0X - Native ORC Vectorized 699 / 729 15.0 66.7 3.4X - Native ORC Vectorized with copy 959 / 1025 10.9 91.5 2.5X - Hive built-in ORC 3899 / 3901 2.7 371.9 0.6X - - String with Nulls Scan (0.5%): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Native ORC MR 2234 / 2255 4.7 213.1 1.0X - Native ORC Vectorized 854 / 869 12.3 81.4 2.6X - Native ORC Vectorized with copy 1099 / 1128 9.5 104.8 2.0X - Hive built-in ORC 2767 / 2793 3.8 263.9 0.8X - - String with Nulls Scan (0.95%): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Native ORC MR 1166 / 1202 9.0 111.2 1.0X - Native ORC Vectorized 338 / 345 31.1 32.2 3.5X - Native ORC Vectorized with copy 418 / 428 25.1 39.9 2.8X - Hive built-in ORC 1730 / 1761 6.1 164.9 0.7X - */ benchmark.run() } } } def columnsBenchmark(values: Int, width: Int): Unit = { - val benchmark = new Benchmark(s"Single Column Scan from $width columns", values) + val benchmark = new Benchmark(s"Single Column Scan from $width columns", values, output = output) withTempPath { dir => withTempTable("t1", "nativeOrcTable", "hiveOrcTable") { @@ -453,49 +331,36 @@ object OrcReadBenchmark { spark.sql(s"SELECT sum(c$middle) FROM hiveOrcTable").collect() } - /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1 - Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz - - Single Column Scan from 100 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - 
------------------------------------------------------------------------------------------------ - Native ORC MR 1050 / 1053 1.0 1001.1 1.0X - Native ORC Vectorized 95 / 101 11.0 90.9 11.0X - Native ORC Vectorized with copy 95 / 102 11.0 90.9 11.0X - Hive built-in ORC 348 / 358 3.0 331.8 3.0X - - Single Column Scan from 200 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Native ORC MR 2099 / 2108 0.5 2002.1 1.0X - Native ORC Vectorized 179 / 187 5.8 171.1 11.7X - Native ORC Vectorized with copy 176 / 188 6.0 167.6 11.9X - Hive built-in ORC 562 / 581 1.9 535.9 3.7X - - Single Column Scan from 300 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Native ORC MR 3221 / 3246 0.3 3071.4 1.0X - Native ORC Vectorized 312 / 322 3.4 298.0 10.3X - Native ORC Vectorized with copy 306 / 320 3.4 291.6 10.5X - Hive built-in ORC 815 / 824 1.3 777.3 4.0X - */ benchmark.run() } } } - def main(args: Array[String]): Unit = { - Seq(ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType).foreach { dataType => - numericScanBenchmark(1024 * 1024 * 15, dataType) + override def runBenchmarkSuite(): Unit = { + runBenchmark("SQL Single Numeric Column Scan") { + Seq(ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType).foreach { dataType => + numericScanBenchmark(1024 * 1024 * 15, dataType) + } + } + runBenchmark("Int and String Scan") { + intStringScanBenchmark(1024 * 1024 * 10) + } + runBenchmark("Partitioned Table Scan") { + partitionTableScanBenchmark(1024 * 1024 * 15) + } + runBenchmark("Repeated String Scan") { + repeatedStringScanBenchmark(1024 * 1024 * 10) + } + runBenchmark("String with Nulls Scan") { + for (fractionOfNulls <- List(0.0, 0.50, 0.95)) { + stringWithNullsScanBenchmark(1024 * 1024 * 10, fractionOfNulls) + } } - intStringScanBenchmark(1024 * 1024 * 10) - partitionTableScanBenchmark(1024 * 1024 * 15) - repeatedStringScanBenchmark(1024 * 1024 * 10) - for (fractionOfNulls <- List(0.0, 0.50, 0.95)) { - stringWithNullsScanBenchmark(1024 * 1024 * 10, fractionOfNulls) + runBenchmark("Single Column Scan From Wide Columns") { + columnsBenchmark(1024 * 1024 * 1, 100) + columnsBenchmark(1024 * 1024 * 1, 200) + columnsBenchmark(1024 * 1024 * 1, 300) } - columnsBenchmark(1024 * 1024 * 1, 100) - columnsBenchmark(1024 * 1024 * 1, 200) - columnsBenchmark(1024 * 1024 * 1, 300) } } // scalastyle:on line.size.limit diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala index b9ec940ac492..6bd59fde550d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala @@ -38,6 +38,10 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils with Tes val dataSourceName: String + protected val parquetDataSourceName: String = "parquet" + + private def isParquetDataSource: Boolean = dataSourceName == parquetDataSourceName + protected def supportsDataType(dataType: DataType): Boolean = true val dataSchema = @@ -114,10 +118,21 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils with Tes new UDT.MyDenseVectorUDT() ).filter(supportsDataType) - for (dataType <- supportedDataTypes) { - for 
(parquetDictionaryEncodingEnabled <- Seq(true, false)) { - test(s"test all data types - $dataType with parquet.enable.dictionary = " + - s"$parquetDictionaryEncodingEnabled") { + test(s"test all data types") { + val parquetDictionaryEncodingEnabledConfs = if (isParquetDataSource) { + // Run with/without Parquet dictionary encoding enabled for Parquet data source. + Seq(true, false) + } else { + Seq(false) + } + for (dataType <- supportedDataTypes) { + for (parquetDictionaryEncodingEnabled <- parquetDictionaryEncodingEnabledConfs) { + val extraMessage = if (isParquetDataSource) { + s" with parquet.enable.dictionary = $parquetDictionaryEncodingEnabled" + } else { + "" + } + logInfo(s"Testing $dataType data type$extraMessage") val extraOptions = Map[String, String]( "parquet.enable.dictionary" -> parquetDictionaryEncodingEnabled.toString @@ -754,33 +769,6 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils with Tes } } - // NOTE: This test suite is not super deterministic. On nodes with only relatively few cores - // (4 or even 1), it's hard to reproduce the data loss issue. But on nodes with for example 8 or - // more cores, the issue can be reproduced steadily. Fortunately our Jenkins builder meets this - // requirement. We probably want to move this test case to spark-integration-tests or spark-perf - // later. - test("SPARK-8406: Avoids name collision while writing files") { - withTempPath { dir => - val path = dir.getCanonicalPath - spark - .range(10000) - .repartition(250) - .write - .mode(SaveMode.Overwrite) - .format(dataSourceName) - .save(path) - - assertResult(10000) { - spark - .read - .format(dataSourceName) - .option("dataSchema", StructType(StructField("id", LongType) :: Nil).json) - .load(path) - .count() - } - } - } - test("SPARK-8887: Explicitly define which data types can be used as dynamic partition columns") { val df = Seq( (1, "v1", Array(1, 2, 3), Map("k1" -> "v1"), Tuple2(1, "4")), diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/ParquetHadoopFsRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/ParquetHadoopFsRelationSuite.scala index 6858bbc44172..6ebc1d145848 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/ParquetHadoopFsRelationSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/ParquetHadoopFsRelationSuite.scala @@ -33,7 +33,7 @@ import org.apache.spark.sql.types._ class ParquetHadoopFsRelationSuite extends HadoopFsRelationTest { import testImplicits._ - override val dataSourceName: String = "parquet" + override val dataSourceName: String = parquetDataSourceName // Parquet does not play well with NullType. override protected def supportsDataType(dataType: DataType): Boolean = dataType match { @@ -232,4 +232,33 @@ class ParquetHadoopFsRelationSuite extends HadoopFsRelationTest { } } } + + // NOTE: This test suite is not super deterministic. On nodes with only relatively few cores + // (4 or even 1), it's hard to reproduce the data loss issue. But on nodes with for example 8 or + // more cores, the issue can be reproduced steadily. Fortunately our Jenkins builder meets this + // requirement. We probably want to move this test case to spark-integration-tests or spark-perf + // later. + // Also, this test is slow. As now all the file format data source are using common code + // for creating result files, we can test Parquet only to reduce test time. 
+ test("SPARK-8406: Avoids name collision while writing files") { + withTempPath { dir => + val path = dir.getCanonicalPath + spark + .range(10000) + .repartition(250) + .write + .mode(SaveMode.Overwrite) + .format(dataSourceName) + .save(path) + + assertResult(10000) { + spark + .read + .format(dataSourceName) + .option("dataSchema", StructType(StructField("id", LongType) :: Nil).json) + .load(path) + .count() + } + } + } } diff --git a/streaming/pom.xml b/streaming/pom.xml index 4497e53b6598..f9a5029a8e81 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../pom.xml diff --git a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala index 3703a87cdb9a..135430f1ef62 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala @@ -54,9 +54,13 @@ class Checkpoint(ssc: StreamingContext, val checkpointTime: Time) "spark.driver.bindAddress", "spark.driver.port", "spark.master", + "spark.kubernetes.driver.pod.name", + "spark.kubernetes.executor.podNamePrefix", "spark.yarn.jars", "spark.yarn.keytab", "spark.yarn.principal", + "spark.kerberos.keytab", + "spark.kerberos.principal", "spark.ui.filters", "spark.mesos.driver.frameworkId") @@ -64,6 +68,8 @@ class Checkpoint(ssc: StreamingContext, val checkpointTime: Time) .remove("spark.driver.host") .remove("spark.driver.bindAddress") .remove("spark.driver.port") + .remove("spark.kubernetes.driver.pod.name") + .remove("spark.kubernetes.executor.podNamePrefix") val newReloadConf = new SparkConf(loadDefaults = true) propertiesToReload.foreach { prop => newReloadConf.getOption(prop).foreach { value => diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala index b8a5a96faf15..438847caf0c3 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala @@ -17,19 +17,19 @@ package org.apache.spark.streaming.dstream -import java.io.{IOException, ObjectInputStream} +import java.io.{FileNotFoundException, IOException, ObjectInputStream} import scala.collection.mutable import scala.reflect.ClassTag import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.{FileSystem, Path, PathFilter} +import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat} import org.apache.spark.rdd.{RDD, UnionRDD} import org.apache.spark.streaming._ import org.apache.spark.streaming.scheduler.StreamInputInfo -import org.apache.spark.util.{SerializableConfiguration, TimeStampedHashMap, Utils} +import org.apache.spark.util.{SerializableConfiguration, Utils} /** * This class represents an input stream that monitors a Hadoop-compatible filesystem for new @@ -122,9 +122,6 @@ class FileInputDStream[K, V, F <: NewInputFormat[K, V]]( // Set of files that were selected in the remembered batches @transient private var recentlySelectedFiles = new mutable.HashSet[String]() - // Read-through cache of file mod times, used to speed up mod time lookups - @transient private var fileToModTime = new TimeStampedHashMap[String, Long](true) - // Timestamp of the last round of finding files @transient private var 
lastNewFileFindingTime = 0L @@ -140,7 +137,7 @@ class FileInputDStream[K, V, F <: NewInputFormat[K, V]]( * a union RDD out of them. Note that this maintains the list of files that were processed * in the latest modification time in the previous call to this method. This is because the * modification time returned by the FileStatus API seems to return times only at the - * granularity of seconds. And new files may have the same modification time as the + * granularity of seconds in HDFS. And new files may have the same modification time as the * latest modification time in the previous call to this method yet was not reported in * the previous call. */ @@ -174,8 +171,6 @@ class FileInputDStream[K, V, F <: NewInputFormat[K, V]]( logDebug("Cleared files are:\n" + oldFiles.map(p => (p._1, p._2.mkString(", "))).mkString("\n")) } - // Delete file mod times that weren't accessed in the last round of getting new files - fileToModTime.clearOldValues(lastNewFileFindingTime - 1) } /** @@ -197,29 +192,29 @@ class FileInputDStream[K, V, F <: NewInputFormat[K, V]]( logDebug(s"Getting new files for time $currentTime, " + s"ignoring files older than $modTimeIgnoreThreshold") - val newFileFilter = new PathFilter { - def accept(path: Path): Boolean = isNewFile(path, currentTime, modTimeIgnoreThreshold) - } - val directoryFilter = new PathFilter { - override def accept(path: Path): Boolean = fs.getFileStatus(path).isDirectory - } - val directories = fs.globStatus(directoryPath, directoryFilter).map(_.getPath) + val directories = Option(fs.globStatus(directoryPath)).getOrElse(Array.empty[FileStatus]) + .filter(_.isDirectory) + .map(_.getPath) val newFiles = directories.flatMap(dir => - fs.listStatus(dir, newFileFilter).map(_.getPath.toString)) + fs.listStatus(dir) + .filter(isNewFile(_, currentTime, modTimeIgnoreThreshold)) + .map(_.getPath.toString)) val timeTaken = clock.getTimeMillis() - lastNewFileFindingTime - logInfo("Finding new files took " + timeTaken + " ms") - logDebug("# cached file times = " + fileToModTime.size) + logDebug(s"Finding new files took $timeTaken ms") if (timeTaken > slideDuration.milliseconds) { logWarning( - "Time taken to find new files exceeds the batch size. " + + s"Time taken to find new files $timeTaken exceeds the batch size. " + "Consider increasing the batch size or reducing the number of " + - "files in the monitored directory." + "files in the monitored directories." ) } newFiles } catch { + case e: FileNotFoundException => + logWarning(s"No directory to scan: $directoryPath: $e") + Array.empty case e: Exception => - logWarning("Error finding new files", e) + logWarning(s"Error finding new files under $directoryPath", e) reset() Array.empty } @@ -242,8 +237,16 @@ class FileInputDStream[K, V, F <: NewInputFormat[K, V]]( * The files with mod time T+5 are not remembered and cannot be ignored (since, t+5 > t+1). * Hence they can get selected as new files again. To prevent this, files whose mod time is more * than current batch time are not considered. 
+ * @param fileStatus file status + * @param currentTime time of the batch + * @param modTimeIgnoreThreshold the ignore threshold + * @return true if the file has been modified within the batch window */ - private def isNewFile(path: Path, currentTime: Long, modTimeIgnoreThreshold: Long): Boolean = { + private def isNewFile( + fileStatus: FileStatus, + currentTime: Long, + modTimeIgnoreThreshold: Long): Boolean = { + val path = fileStatus.getPath val pathStr = path.toString // Reject file if it does not satisfy filter if (!filter(path)) { @@ -251,7 +254,7 @@ class FileInputDStream[K, V, F <: NewInputFormat[K, V]]( return false } // Reject file if it was created before the ignore time - val modTime = getFileModTime(path) + val modTime = fileStatus.getModificationTime() if (modTime <= modTimeIgnoreThreshold) { // Use <= instead of < to avoid SPARK-4518 logDebug(s"$pathStr ignored as mod time $modTime <= ignore time $modTimeIgnoreThreshold") @@ -293,11 +296,6 @@ class FileInputDStream[K, V, F <: NewInputFormat[K, V]]( new UnionRDD(context.sparkContext, fileRDDs) } - /** Get file mod time from cache or fetch it from the file system */ - private def getFileModTime(path: Path) = { - fileToModTime.getOrElseUpdate(path.toString, fs.getFileStatus(path).getModificationTime()) - } - private def directoryPath: Path = { if (_path == null) _path = new Path(directory) _path @@ -319,7 +317,6 @@ class FileInputDStream[K, V, F <: NewInputFormat[K, V]]( generatedRDDs = new mutable.HashMap[Time, RDD[(K, V)]]() batchTimeToSelectedFiles = new mutable.HashMap[Time, Array[String]] recentlySelectedFiles = new mutable.HashSet[String]() - fileToModTime = new TimeStampedHashMap[String, Long](true) } /** diff --git a/streaming/src/test/scala/org/apache/spark/streaming/InputStreamsSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/InputStreamsSuite.scala index b5d36a36513a..1cf21e8a2803 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/InputStreamsSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/InputStreamsSuite.scala @@ -27,7 +27,8 @@ import scala.collection.JavaConverters._ import scala.collection.mutable import com.google.common.io.Files -import org.apache.hadoop.fs.Path +import org.apache.commons.io.IOUtils +import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.{LongWritable, Text} import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.scalatest.BeforeAndAfter @@ -130,10 +131,8 @@ class InputStreamsSuite extends TestSuiteBase with BeforeAndAfter { } test("binary records stream") { - var testDir: File = null - try { + withTempDir { testDir => val batchDuration = Seconds(2) - testDir = Utils.createTempDir() // Create a file that exists before the StreamingContext is created: val existingFile = new File(testDir, "0") Files.write("0\n", existingFile, StandardCharsets.UTF_8) @@ -176,8 +175,6 @@ class InputStreamsSuite extends TestSuiteBase with BeforeAndAfter { assert(obtainedOutput(i) === input.map(b => (b + i).toByte)) } } - } finally { - if (testDir != null) Utils.deleteRecursively(testDir) } } @@ -190,10 +187,8 @@ class InputStreamsSuite extends TestSuiteBase with BeforeAndAfter { } test("file input stream - wildcard") { - var testDir: File = null - try { + withTempDir { testDir => val batchDuration = Seconds(2) - testDir = Utils.createTempDir() val testSubDir1 = Utils.createDirectory(testDir.toString, "tmp1") val testSubDir2 = Utils.createDirectory(testDir.toString, "tmp2") @@ -221,12 +216,12 @@ class InputStreamsSuite extends 
TestSuiteBase with BeforeAndAfter { // not enough to trigger a batch clock.advance(batchDuration.milliseconds / 2) - def createFileAndAdvenceTime(data: Int, dir: File): Unit = { + def createFileAndAdvanceTime(data: Int, dir: File): Unit = { val file = new File(testSubDir1, data.toString) Files.write(data + "\n", file, StandardCharsets.UTF_8) assert(file.setLastModified(clock.getTimeMillis())) assert(file.lastModified === clock.getTimeMillis()) - logInfo("Created file " + file) + logInfo(s"Created file $file") // Advance the clock after creating the file to avoid a race when // setting its modification time clock.advance(batchDuration.milliseconds) @@ -236,18 +231,85 @@ class InputStreamsSuite extends TestSuiteBase with BeforeAndAfter { } // Over time, create files in the temp directory 1 val input1 = Seq(1, 2, 3, 4, 5) - input1.foreach(i => createFileAndAdvenceTime(i, testSubDir1)) + input1.foreach(i => createFileAndAdvanceTime(i, testSubDir1)) // Over time, create files in the temp directory 1 val input2 = Seq(6, 7, 8, 9, 10) - input2.foreach(i => createFileAndAdvenceTime(i, testSubDir2)) + input2.foreach(i => createFileAndAdvanceTime(i, testSubDir2)) // Verify that all the files have been read val expectedOutput = (input1 ++ input2).map(_.toString).toSet assert(outputQueue.asScala.flatten.toSet === expectedOutput) } - } finally { - if (testDir != null) Utils.deleteRecursively(testDir) + } + } + + test("Modified files are correctly detected.") { + withTempDir { testDir => + val batchDuration = Seconds(2) + val durationMs = batchDuration.milliseconds + val testPath = new Path(testDir.toURI) + val streamDir = new Path(testPath, "streaming") + val streamGlobPath = new Path(streamDir, "sub*") + val generatedDir = new Path(testPath, "generated") + val generatedSubDir = new Path(generatedDir, "subdir") + val renamedSubDir = new Path(streamDir, "subdir") + + withStreamingContext(new StreamingContext(conf, batchDuration)) { ssc => + val sparkContext = ssc.sparkContext + val hc = sparkContext.hadoopConfiguration + val fs = FileSystem.get(testPath.toUri, hc) + + fs.delete(testPath, true) + fs.mkdirs(testPath) + fs.mkdirs(streamDir) + fs.mkdirs(generatedSubDir) + + def write(path: Path, text: String): Unit = { + val out = fs.create(path, true) + IOUtils.write(text, out) + out.close() + } + + val clock = ssc.scheduler.clock.asInstanceOf[ManualClock] + val existingFile = new Path(generatedSubDir, "existing") + write(existingFile, "existing\n") + val status = fs.getFileStatus(existingFile) + clock.setTime(status.getModificationTime + durationMs) + val batchCounter = new BatchCounter(ssc) + val fileStream = ssc.textFileStream(streamGlobPath.toUri.toString) + val outputQueue = new ConcurrentLinkedQueue[Seq[String]] + val outputStream = new TestOutputStream(fileStream, outputQueue) + outputStream.register() + + ssc.start() + clock.advance(durationMs) + eventually(eventuallyTimeout) { + assert(1 === batchCounter.getNumCompletedBatches) + } + // create and rename the file + // put a file into the generated directory + val textPath = new Path(generatedSubDir, "renamed.txt") + write(textPath, "renamed\n") + val now = clock.getTimeMillis() + val modTime = now + durationMs / 2 + fs.setTimes(textPath, modTime, modTime) + val textFilestatus = fs.getFileStatus(existingFile) + assert(textFilestatus.getModificationTime < now + durationMs) + + // rename the directory under the path being scanned + fs.rename(generatedSubDir, renamedSubDir) + + // move forward one window + clock.advance(durationMs) + // await the next 
scan completing + eventually(eventuallyTimeout) { + assert(2 === batchCounter.getNumCompletedBatches) + } + // verify that the "renamed" file is found, but not the "existing" one which is out of + // the window + assert(Set("renamed") === outputQueue.asScala.flatten.toSet) + } } } @@ -416,10 +478,8 @@ class InputStreamsSuite extends TestSuiteBase with BeforeAndAfter { } def testFileStream(newFilesOnly: Boolean) { - var testDir: File = null - try { + withTempDir { testDir => val batchDuration = Seconds(2) - testDir = Utils.createTempDir() // Create a file that exists before the StreamingContext is created: val existingFile = new File(testDir, "0") Files.write("0\n", existingFile, StandardCharsets.UTF_8) @@ -466,8 +526,6 @@ class InputStreamsSuite extends TestSuiteBase with BeforeAndAfter { } assert(outputQueue.asScala.flatten.toSet === expectedOutput) } - } finally { - if (testDir != null) Utils.deleteRecursively(testDir) } } } diff --git a/streaming/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala b/streaming/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala index dbab70886102..ada494eb897f 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala @@ -17,7 +17,7 @@ package org.apache.spark.streaming -import java.io.{IOException, ObjectInputStream} +import java.io.{File, IOException, ObjectInputStream} import java.util.concurrent.ConcurrentLinkedQueue import scala.collection.JavaConverters._ @@ -557,4 +557,16 @@ trait TestSuiteBase extends SparkFunSuite with BeforeAndAfter with Logging { verifyOutput[W](output.toSeq, expectedOutput, useSet) } } + + /** + * Creates a temporary directory, which is then passed to `f` and will be deleted after `f` + * returns. + * (originally from `SqlTestUtils`.) + * @todo Probably this method should be moved to a more general place + */ + protected def withTempDir(f: File => Unit): Unit = { + val dir = Utils.createTempDir().getCanonicalFile + try f(dir) finally Utils.deleteRecursively(dir) + } + } diff --git a/tools/pom.xml b/tools/pom.xml index 242219e29f50..247f5a6df4b0 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.4.0-SNAPSHOT + 3.0.0-SNAPSHOT ../pom.xml
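For readers skimming the test changes above: the refactoring replaces the hand-rolled `try`/`finally` temp-directory cleanup with the loan-pattern `withTempDir` helper added to `TestSuiteBase`. A minimal, standalone sketch of that pattern follows; it uses plain `java.nio.file` plus a hypothetical `deleteRecursively` helper instead of Spark's `Utils`, so treat it as an illustration rather than the suite's actual code.

    import java.io.File
    import java.nio.file.Files

    object WithTempDirSketch {
      // Create a temp directory, lend it to the test body, and always clean it up,
      // even when the body throws. This mirrors the helper added to TestSuiteBase.
      def withTempDir(f: File => Unit): Unit = {
        val dir = Files.createTempDirectory("spark-streaming-test").toFile.getCanonicalFile
        try f(dir) finally deleteRecursively(dir)
      }

      // Hypothetical stand-in for Spark's Utils.deleteRecursively.
      private def deleteRecursively(file: File): Unit = {
        Option(file.listFiles()).foreach(_.foreach(deleteRecursively))
        file.delete()
      }

      def main(args: Array[String]): Unit = {
        withTempDir { dir =>
          // The body can create files freely; cleanup happens even if it throws.
          val marker = new File(dir, "0")
          Files.write(marker.toPath, "0\n".getBytes("UTF-8"))
          println(s"wrote ${marker.getAbsolutePath}")
        }
      }
    }

The net effect of the change is that every test body that used to track a `testDir` variable and delete it in a `finally` block now simply nests inside `withTempDir { testDir => ... }`.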
    diff --git a/docs/structured-streaming-programming-guide.md b/docs/structured-streaming-programming-guide.md index 73de1892977a..b6e427735e74 100644 --- a/docs/structured-streaming-programming-guide.md +++ b/docs/structured-streaming-programming-guide.md @@ -1560,6 +1560,35 @@ streamingDf <- dropDuplicates(streamingDf, "guid", "eventTime") +### Policy for handling multiple watermarks +A streaming query can have multiple input streams that are unioned or joined together. +Each of the input streams can have a different threshold of late data that needs to +be tolerated for stateful operations. You specify these thresholds using +``withWatermark("eventTime", delay)`` on each of the input streams. For example, consider +a query with a stream-stream join between `inputStream1` and `inputStream2`. + + inputStream1.withWatermark("eventTime1", "1 hour") + .join( + inputStream2.withWatermark("eventTime2", "2 hours"), + joinCondition) + +While executing the query, Structured Streaming individually tracks the maximum +event time seen in each input stream, calculates watermarks based on the corresponding delay, +and chooses a single global watermark from them to be used for stateful operations. By default, +the minimum is chosen as the global watermark because it ensures that no data is +accidentally dropped as too late if one of the streams falls behind the others +(for example, one of the streams stops receiving data due to upstream failures). In other words, +the global watermark will safely move at the pace of the slowest stream and the query output will +be delayed accordingly. + +However, in some cases, you may want to get faster results even if it means dropping data from the +slowest stream. Since Spark 2.4, you can set the multiple watermark policy to choose +the maximum value as the global watermark by setting the SQL configuration +``spark.sql.streaming.multipleWatermarkPolicy`` to ``max`` (default is ``min``). +This lets the global watermark move at the pace of the fastest stream. +As a side effect, however, data from the slower streams will be aggressively dropped. Hence, use +this configuration judiciously. + ### Arbitrary Stateful Operations Many usecases require more advanced stateful operations than aggregations. For example, in many usecases, you have to track sessions from data streams of events. For doing such sessionization, you will have to save arbitrary types of data as state, and perform arbitrary operations on the state using the data stream events in every trigger. Since Spark 2.2, this can be done using the operation `mapGroupsWithState` and the more powerful operation `flatMapGroupsWithState`. Both operations allow you to apply user-defined code on grouped Datasets to update user-defined state. For more concrete details, take a look at the API documentation ([Scala](api/scala/index.html#org.apache.spark.sql.streaming.GroupState)/[Java](api/java/org/apache/spark/sql/streaming/GroupState.html)) and the examples ([Scala]({{site.SPARK_GITHUB_URL}}/blob/v{{site.SPARK_VERSION_SHORT}}/examples/src/main/scala/org/apache/spark/examples/sql/streaming/StructuredSessionization.scala)/[Java]({{site.SPARK_GITHUB_URL}}/blob/v{{site.SPARK_VERSION_SHORT}}/examples/src/main/java/org/apache/spark/examples/sql/streaming/JavaStructuredSessionization.java)). @@ -1799,8 +1828,16 @@ Here are the details of all the sinks in Spark. Append, Update, Complete None Depends on ForeachWriter implementation More details in the next section More details in the next section
    ForeachBatch Sink Append, Update, Complete None Depends on the implementation More details in the next section
    Console Sink Append, Update, Complete
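Stepping back to the multiple-watermark policy documented above: a rough, self-contained Scala sketch of two differently-watermarked inputs feeding one stateful aggregation, with the ``spark.sql.streaming.multipleWatermarkPolicy`` setting applied, might look like the following. The rate sources, column names, window size, and output sink are illustrative assumptions, not part of this diff.

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.functions.{col, window}

    object MultipleWatermarkSketch {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder()
          .appName("multiple-watermarks-sketch")
          .master("local[2]")
          // Opt into the "max" policy; the default "min" follows the slowest stream.
          .config("spark.sql.streaming.multipleWatermarkPolicy", "max")
          .getOrCreate()

        // Two illustrative rate sources standing in for real input streams.
        val inputStream1 = spark.readStream.format("rate").option("rowsPerSecond", "1").load()
        val inputStream2 = spark.readStream.format("rate").option("rowsPerSecond", "5").load()

        // Each input declares its own tolerated lateness; the engine derives one global
        // watermark from them according to the configured policy.
        val unioned = inputStream1.withWatermark("timestamp", "1 hour")
          .union(inputStream2.withWatermark("timestamp", "2 hours"))

        // A stateful aggregation whose late-data cutoff is driven by the global watermark.
        val counts = unioned.groupBy(window(col("timestamp"), "10 minutes")).count()

        val query = counts.writeStream
          .outputMode("update")
          .format("console")
          .start()
        query.awaitTermination()
      }
    }

With ``min`` (the default), the aggregation's cutoff tracks whichever input is furthest behind; switching to ``max`` trades that safety for lower latency, as the section above explains.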