diff --git a/.github/PULL_REQUEST_TEMPLATE b/.github/PULL_REQUEST_TEMPLATE
index 0e41cf182645..5af45d6fa798 100644
--- a/.github/PULL_REQUEST_TEMPLATE
+++ b/.github/PULL_REQUEST_TEMPLATE
@@ -7,4 +7,4 @@
 (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests)
 (If this patch involves UI changes, please attach a screenshot; otherwise, remove this)
 
-Please review https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark before opening a pull request.
+Please review http://spark.apache.org/contributing.html before opening a pull request.
diff --git a/.gitignore b/.gitignore
index 39d17e1793f7..5634a434db0c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -57,6 +57,8 @@ project/plugins/project/build.properties
 project/plugins/src_managed/
 project/plugins/target/
 python/lib/pyspark.zip
+python/deps
+python/pyspark/python
 reports/
 scalastyle-on-compile.generated.xml
 scalastyle-output.xml
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 1a8206abe383..8fdd5aa9e7df 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,12 +1,12 @@
 ## Contributing to Spark
 
 *Before opening a pull request*, review the
-[Contributing to Spark wiki](https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark).
+[Contributing to Spark guide](http://spark.apache.org/contributing.html).
 It lists steps that are required before creating a PR. In particular, consider:
 
 - Is the change important and ready enough to ask the community to spend time reviewing?
 - Have you searched for existing, related JIRAs and pull requests?
-- Is this a new feature that can stand alone as a [third party project](https://cwiki.apache.org/confluence/display/SPARK/Third+Party+Projects) ?
+- Is this a new feature that can stand alone as a [third party project](http://spark.apache.org/third-party-projects.html)?
 - Is the change being proposed clearly explained and motivated?
 
 When you contribute code, you affirm that the contribution is your original work and that you
diff --git a/LICENSE b/LICENSE
index 7950dd6ceb6d..119ecbe15ab7 100644
--- a/LICENSE
+++ b/LICENSE
@@ -249,11 +249,11 @@ The text of each license is also included at licenses/LICENSE-[project].txt.
(Interpreter classes (all .scala files in repl/src/main/scala except for Main.Scala, SparkHelper.scala and ExecutorClassLoader.scala), and for SerializableMapWrapper in JavaUtils.scala)

-     (BSD-like) Scala Actors library (org.scala-lang:scala-actors:2.11.7 - http://www.scala-lang.org/)
-     (BSD-like) Scala Compiler (org.scala-lang:scala-compiler:2.11.7 - http://www.scala-lang.org/)
-     (BSD-like) Scala Compiler (org.scala-lang:scala-reflect:2.11.7 - http://www.scala-lang.org/)
-     (BSD-like) Scala Library (org.scala-lang:scala-library:2.11.7 - http://www.scala-lang.org/)
-     (BSD-like) Scalap (org.scala-lang:scalap:2.11.7 - http://www.scala-lang.org/)
+     (BSD-like) Scala Actors library (org.scala-lang:scala-actors:2.11.8 - http://www.scala-lang.org/)
+     (BSD-like) Scala Compiler (org.scala-lang:scala-compiler:2.11.8 - http://www.scala-lang.org/)
+     (BSD-like) Scala Compiler (org.scala-lang:scala-reflect:2.11.8 - http://www.scala-lang.org/)
+     (BSD-like) Scala Library (org.scala-lang:scala-library:2.11.8 - http://www.scala-lang.org/)
+     (BSD-like) Scalap (org.scala-lang:scalap:2.11.8 - http://www.scala-lang.org/)
      (BSD-style) scalacheck (org.scalacheck:scalacheck_2.11:1.10.0 - http://www.scalacheck.org)
      (BSD-style) spire (org.spire-math:spire_2.11:0.7.1 - http://spire-math.org)
      (BSD-style) spire-macros (org.spire-math:spire-macros_2.11:0.7.1 - http://spire-math.org)
diff --git a/NOTICE b/NOTICE
index 69b513ea3ba3..f4b64b5c3f47 100644
--- a/NOTICE
+++ b/NOTICE
@@ -421,9 +421,6 @@ Copyright (c) 2011, Terrence Parr.
 This product includes/uses ASM (http://asm.ow2.org/),
 Copyright (c) 2000-2007 INRIA, France Telecom.
 
-This product includes/uses org.json (http://www.json.org/java/index.html),
-Copyright (c) 2002 JSON.org
-
 This product includes/uses JLine (http://jline.sourceforge.net/),
 Copyright (c) 2002-2006, Marc Prud'hommeaux .
 
diff --git a/R/CRAN_RELEASE.md b/R/CRAN_RELEASE.md
new file mode 100644
index 000000000000..d6084c7a7cc9
--- /dev/null
+++ b/R/CRAN_RELEASE.md
@@ -0,0 +1,91 @@
+# SparkR CRAN Release
+
+To release SparkR as a package to CRAN, we use the `devtools` package. Please work with the
+`dev@spark.apache.org` community and the R package maintainer on this.
+
+### Release
+
+First, check that the `Version:` field in the `pkg/DESCRIPTION` file is updated. Also, check for stale files not under source control.
+
+Note that while `run-tests.sh` runs `check-cran.sh` (which runs `R CMD check`), it does so with `--no-manual --no-vignettes`, which skips a few vignette and PDF checks - it is therefore preferable to run `R CMD check` on a manually built source package before uploading a release. Also note that for the CRAN PDF vignette checks to succeed, the `qpdf` tool must be installed (e.g. `yum -q -y install qpdf`).
+
+To upload a release, we need to update `cran-comments.md`. This should generally contain the results from running the `check-cran.sh` script, along with comments on the status of any `WARNING` (there should not be any) or `NOTE` entries. As part of `check-cran.sh` and the release process, the vignettes are built - make sure `SPARK_HOME` is set and the Spark jars are accessible.
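+
+For example, a manual check might look like the following sketch (the tarball name comes from the
+`Version:` field in `pkg/DESCRIPTION`; adjust it to the version being released):
+
+```sh
+# from the SPARK_HOME/R directory, with SPARK_HOME set and the Spark jars built
+R CMD build pkg
+R CMD check --as-cran SparkR_2.1.2.tar.gz
+```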
+
+Once everything is in place, run in R under the `SPARK_HOME/R` directory:
+
+```R
+paths <- .libPaths(); .libPaths(c("lib", paths)); Sys.setenv(SPARK_HOME=tools::file_path_as_absolute("..")); devtools::release(); .libPaths(paths)
+```
+
+For more information, please refer to http://r-pkgs.had.co.nz/release.html#release-check
+
+### Testing: build package manually
+
+To build the package manually, for example to inspect the resulting `.tar.gz` file content, we also use the `devtools` package.
+
+The source package is what gets released to CRAN. CRAN then builds platform-specific binary packages from the source package.
+
+#### Build source package
+
+To build the source package locally without releasing to CRAN, run in R under the `SPARK_HOME/R` directory:
+
+```R
+paths <- .libPaths(); .libPaths(c("lib", paths)); Sys.setenv(SPARK_HOME=tools::file_path_as_absolute("..")); devtools::build("pkg"); .libPaths(paths)
+```
+
+(http://r-pkgs.had.co.nz/vignettes.html#vignette-workflow-2)
+
+Similarly, the source package is also created by `check-cran.sh` with `R CMD build pkg`.
+
+For example, this should be the content of the source package:
+
+```sh
+DESCRIPTION     R       inst    tests
+NAMESPACE       build   man     vignettes
+
+inst/doc/
+sparkr-vignettes.html
+sparkr-vignettes.Rmd
+sparkr-vignettes.Rman
+
+build/
+vignette.rds
+
+man/
+ *.Rd files...
+
+vignettes/
+sparkr-vignettes.Rmd
+```
+
+#### Test source package
+
+To install it, run:
+
+```sh
+R CMD INSTALL SparkR_2.1.0.tar.gz
+```
+
+with "2.1.0" replaced by the version of SparkR being released.
+
+This command installs SparkR to the default libPaths. Once that is done, you should be able to start R and run:
+
+```R
+library(SparkR)
+vignette("sparkr-vignettes", package="SparkR")
+```
+
+#### Build binary package
+
+To build the binary package locally, run in R under the `SPARK_HOME/R` directory:
+
+```R
+paths <- .libPaths(); .libPaths(c("lib", paths)); Sys.setenv(SPARK_HOME=tools::file_path_as_absolute("..")); devtools::build("pkg", binary = TRUE); .libPaths(paths)
+```
+
+For example, this should be the content of the binary package:
+
+```sh
+DESCRIPTION     Meta    R       html    tests
+INDEX           NAMESPACE       help    profile worker
+```
diff --git a/R/README.md b/R/README.md
index 932d5272d0b4..4c40c5963db7 100644
--- a/R/README.md
+++ b/R/README.md
@@ -6,7 +6,7 @@ SparkR is an R package that provides a light-weight frontend to use Spark from R
 Libraries of sparkR need to be created in `$SPARK_HOME/R/lib`. This can be done by running the script `$SPARK_HOME/R/install-dev.sh`. By default the above script uses the system-wide installation of R. However, this can be changed to any user-installed location of R by setting the environment variable `R_HOME` to the full path of the base directory where R is installed, before running the install-dev.sh script.
 
-Example: 
+Example:
 ```bash
 # where /home/username/R is where R is installed and /home/username/R/bin contains the files R and RScript
 export R_HOME=/home/username/R
@@ -46,19 +46,19 @@
 Sys.setenv(SPARK_HOME="/Users/username/spark")
 # This line loads SparkR from the installed directory
 .libPaths(c(file.path(Sys.getenv("SPARK_HOME"), "R", "lib"), .libPaths()))
 library(SparkR)
-sc <- sparkR.init(master="local")
+sparkR.session()
 ```
 
 #### Making changes to SparkR
 
-The [instructions](https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark) for making contributions to Spark also apply to SparkR.
+The [instructions](http://spark.apache.org/contributing.html) for making contributions to Spark also apply to SparkR.
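+
+For R-only changes, the quick iteration loop (described in more detail below) might look like this
+sketch, run from the Spark root directory:
+
+```bash
+# re-install the SparkR package, then run the SparkR unit tests
+./R/install-dev.sh
+./R/run-tests.sh
+```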
 If you only make R file changes (i.e. no Scala changes), then you can just re-install the R package using `R/install-dev.sh` and test your changes.
 Once you have made your changes, please include unit tests for them and run existing unit tests using the `R/run-tests.sh` script as described below.
- 
+
 #### Generating documentation
 
 The SparkR documentation (Rd files and HTML files) is not a part of the source repository. To generate them you can run the script `R/create-docs.sh`. This script uses `devtools` and `knitr` to generate the docs and these packages need to be installed on the machine before using the script. Also, you may need to install these [prerequisites](https://github.com/apache/spark/tree/master/docs#prerequisites). See also, `R/DOCUMENTATION.md`
- 
+
 ### Examples, Unit tests
 
 SparkR comes with several sample programs in the `examples/src/main/r` directory.
diff --git a/R/check-cran.sh b/R/check-cran.sh
index bb331466ae93..1288e7fc9fb4 100755
--- a/R/check-cran.sh
+++ b/R/check-cran.sh
@@ -34,13 +34,30 @@ if [ ! -z "$R_HOME" ]
   fi
   R_SCRIPT_PATH="$(dirname $(which R))"
 fi
-echo "USING R_HOME = $R_HOME"
+echo "Using R_SCRIPT_PATH = ${R_SCRIPT_PATH}"
 
-# Build the latest docs
+# Install the package (this is required for code in vignettes to run when building it later)
+# Build the latest docs, but not vignettes, which are built with the package next
 $FWDIR/create-docs.sh
 
-# Build a zip file containing the source package
-"$R_SCRIPT_PATH/"R CMD build $FWDIR/pkg
+# Build the source package with vignettes
+SPARK_HOME="$(cd "${FWDIR}"/..; pwd)"
+. "${SPARK_HOME}"/bin/load-spark-env.sh
+if [ -f "${SPARK_HOME}/RELEASE" ]; then
+  SPARK_JARS_DIR="${SPARK_HOME}/jars"
+else
+  SPARK_JARS_DIR="${SPARK_HOME}/assembly/target/scala-$SPARK_SCALA_VERSION/jars"
+fi
+
+if [ -d "$SPARK_JARS_DIR" ]; then
+  # Build a zip file containing the source package with vignettes
+  SPARK_HOME="${SPARK_HOME}" "$R_SCRIPT_PATH/"R CMD build $FWDIR/pkg
+
+  find pkg/vignettes/. -not -name '.' -not -name '*.Rmd' -not -name '*.md' -not -name '*.pdf' -not -name '*.html' -delete
+else
+  echo "Error: Spark JARs not found in $SPARK_HOME"
+  exit 1
+fi
 
 # Run check as-cran.
 VERSION=`grep Version $FWDIR/pkg/DESCRIPTION | awk '{print $NF}'`
@@ -54,11 +71,32 @@ fi
 
 if [ -n "$NO_MANUAL" ]
 then
-  CRAN_CHECK_OPTIONS=$CRAN_CHECK_OPTIONS" --no-manual"
+  CRAN_CHECK_OPTIONS=$CRAN_CHECK_OPTIONS" --no-manual --no-vignettes"
 fi
 
 echo "Running CRAN check with $CRAN_CHECK_OPTIONS options"
 
-"$R_SCRIPT_PATH/"R CMD check $CRAN_CHECK_OPTIONS SparkR_"$VERSION".tar.gz
+if [ -n "$NO_TESTS" ] && [ -n "$NO_MANUAL" ]
+then
+  "$R_SCRIPT_PATH/"R CMD check $CRAN_CHECK_OPTIONS SparkR_"$VERSION".tar.gz
+else
+  # This will run tests and/or build vignettes, and requires SPARK_HOME
+  SPARK_HOME="${SPARK_HOME}" "$R_SCRIPT_PATH/"R CMD check $CRAN_CHECK_OPTIONS SparkR_"$VERSION".tar.gz
+fi
+
+# Install the source package so that it generates the vignettes rds files, etc.
+if [ -n "$CLEAN_INSTALL" ] +then + echo "Removing lib path and installing from source package" + LIB_DIR="$FWDIR/lib" + rm -rf $LIB_DIR + mkdir -p $LIB_DIR + "$R_SCRIPT_PATH/"R CMD INSTALL SparkR_"$VERSION".tar.gz --library=$LIB_DIR + + # Zip the SparkR package so that it can be distributed to worker nodes on YARN + pushd $LIB_DIR > /dev/null + jar cfM "$LIB_DIR/sparkr.zip" SparkR + popd > /dev/null +fi popd > /dev/null diff --git a/R/create-docs.sh b/R/create-docs.sh index 69ffc5f678c3..84e6aa928cb0 100755 --- a/R/create-docs.sh +++ b/R/create-docs.sh @@ -20,7 +20,7 @@ # Script to create API docs and vignettes for SparkR # This requires `devtools`, `knitr` and `rmarkdown` to be installed on the machine. -# After running this script the html docs can be found in +# After running this script the html docs can be found in # $SPARK_HOME/R/pkg/html # The vignettes can be found in # $SPARK_HOME/R/pkg/vignettes/sparkr_vignettes.html @@ -52,21 +52,4 @@ Rscript -e 'libDir <- "../../lib"; library(SparkR, lib.loc=libDir); library(knit popd -# Find Spark jars. -if [ -f "${SPARK_HOME}/RELEASE" ]; then - SPARK_JARS_DIR="${SPARK_HOME}/jars" -else - SPARK_JARS_DIR="${SPARK_HOME}/assembly/target/scala-$SPARK_SCALA_VERSION/jars" -fi - -# Only create vignettes if Spark JARs exist -if [ -d "$SPARK_JARS_DIR" ]; then - # render creates SparkR vignettes - Rscript -e 'library(rmarkdown); paths <- .libPaths(); .libPaths(c("lib", paths)); Sys.setenv(SPARK_HOME=tools::file_path_as_absolute("..")); render("pkg/vignettes/sparkr-vignettes.Rmd"); .libPaths(paths)' - - find pkg/vignettes/. -not -name '.' -not -name '*.Rmd' -not -name '*.md' -not -name '*.pdf' -not -name '*.html' -delete -else - echo "Skipping R vignettes as Spark JARs not found in $SPARK_HOME" -fi - popd diff --git a/R/install-dev.sh b/R/install-dev.sh index ada6303a722b..0f881208bcad 100755 --- a/R/install-dev.sh +++ b/R/install-dev.sh @@ -46,7 +46,7 @@ if [ ! -z "$R_HOME" ] fi R_SCRIPT_PATH="$(dirname $(which R))" fi -echo "USING R_HOME = $R_HOME" +echo "Using R_SCRIPT_PATH = ${R_SCRIPT_PATH}" # Generate Rd files if devtools is installed "$R_SCRIPT_PATH/"Rscript -e ' if("devtools" %in% rownames(installed.packages())) { library(devtools); devtools::document(pkg="./pkg", roclets=c("rd")) }' diff --git a/R/pkg/.Rbuildignore b/R/pkg/.Rbuildignore index 544d203a6dce..f12f8c275a98 100644 --- a/R/pkg/.Rbuildignore +++ b/R/pkg/.Rbuildignore @@ -1,5 +1,8 @@ ^.*\.Rproj$ ^\.Rproj\.user$ ^\.lintr$ +^cran-comments\.md$ +^NEWS\.md$ +^README\.Rmd$ ^src-native$ ^html$ diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 5a83883089e0..2d461ca68920 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,8 +1,8 @@ Package: SparkR Type: Package +Version: 2.1.2 Title: R Frontend for Apache Spark -Version: 2.0.0 -Date: 2016-08-27 +Description: The SparkR package provides an R Frontend for Apache Spark. 
Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), email = "shivaram@cs.berkeley.edu"), person("Xiangrui", "Meng", role = "aut", @@ -10,17 +10,18 @@ Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), person("Felix", "Cheung", role = "aut", email = "felixcheung@apache.org"), person(family = "The Apache Software Foundation", role = c("aut", "cph"))) +License: Apache License (== 2.0) URL: http://www.apache.org/ http://spark.apache.org/ -BugReports: https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark#ContributingtoSpark-ContributingBugReports +BugReports: http://spark.apache.org/contributing.html Depends: R (>= 3.0), methods Suggests: + knitr, + rmarkdown, testthat, e1071, survival -Description: The SparkR package provides an R frontend for Apache Spark. -License: Apache License (== 2.0) Collate: 'schema.R' 'generics.R' @@ -48,3 +49,4 @@ Collate: 'utils.R' 'window.R' RoxygenNote: 5.0.1 +VignetteBuilder: knitr diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 9cd6269f9a8f..6f96b969d149 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -1,9 +1,26 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+
 # Imports from base R
 # Do not include stats:: "rpois", "runif" - causes error at runtime
 importFrom("methods", "setGeneric", "setMethod", "setOldClass")
 importFrom("methods", "is", "new", "signature", "show")
 importFrom("stats", "gaussian", "setNames")
-importFrom("utils", "download.file", "object.size", "packageVersion", "untar")
+importFrom("utils", "download.file", "object.size", "packageVersion", "tail", "untar")
 
 # Disable native libraries till we figure out how to package it
 # See SPARKR-7839
@@ -16,6 +33,7 @@ export("sparkR.stop")
 export("sparkR.session.stop")
 export("sparkR.conf")
 export("sparkR.version")
+export("sparkR.uiWebUrl")
 
 export("print.jobj")
 
 export("sparkR.newJObject")
@@ -45,7 +63,8 @@ exportMethods("glm",
               "spark.als",
               "spark.kstest",
               "spark.logit",
-              "spark.randomForest")
+              "spark.randomForest",
+              "spark.gbt")
 
 # Job group lifecycle management methods
 export("setJobGroup",
@@ -61,6 +80,7 @@ exportMethods("arrange",
               "as.data.frame",
               "attach",
               "cache",
+              "coalesce",
              "collect",
              "colnames",
              "colnames<-",
@@ -92,6 +112,7 @@ exportMethods("arrange",
              "freqItems",
              "gapply",
              "gapplyCollect",
+              "getNumPartitions",
              "group_by",
              "groupBy",
              "head",
@@ -353,7 +374,9 @@ export("as.DataFrame",
       "read.ml",
       "print.summary.KSTest",
       "print.summary.RandomForestRegressionModel",
-       "print.summary.RandomForestClassificationModel")
+       "print.summary.RandomForestClassificationModel",
+       "print.summary.GBTRegressionModel",
+       "print.summary.GBTClassificationModel")
 
 export("structField",
        "structField.jobj",
@@ -380,6 +403,8 @@ S3method(print, summary.GeneralizedLinearRegressionModel)
 S3method(print, summary.KSTest)
 S3method(print, summary.RandomForestRegressionModel)
 S3method(print, summary.RandomForestClassificationModel)
+S3method(print, summary.GBTRegressionModel)
+S3method(print, summary.GBTClassificationModel)
 S3method(structField, character)
 S3method(structField, jobj)
 S3method(structType, jobj)
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 1cf9b38ea648..d0f097925a2c 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -417,7 +417,7 @@ setMethod("coltypes",
                 type <- PRIMITIVE_TYPES[[specialtype]]
               }
             }
-            type
+            type[[1]]
           })
 
 # Find which types don't have mapping to R
@@ -680,14 +680,53 @@ setMethod("storageLevel",
           storageLevelToString(callJMethod(x@sdf, "storageLevel"))
         })
 
+#' Coalesce
+#'
+#' Returns a new SparkDataFrame that has exactly \code{numPartitions} partitions.
+#' This operation results in a narrow dependency, e.g. if you go from 1000 partitions to 100
+#' partitions, there will not be a shuffle; instead, each of the 100 new partitions will claim 10 of
+#' the current partitions. If a larger number of partitions is requested, it will stay at the
+#' current number of partitions.
+#'
+#' However, if you're doing a drastic coalesce on a SparkDataFrame, e.g. to numPartitions = 1,
+#' this may result in your computation taking place on fewer nodes than
+#' you like (e.g. one node in the case of numPartitions = 1). To avoid this,
+#' call \code{repartition}. This will add a shuffle step, but means the
+#' current upstream partitions will be executed in parallel (per whatever
+#' the current partitioning is).
+#'
+#' @param numPartitions the number of partitions to use.
+#'
+#' @family SparkDataFrame functions
+#' @rdname coalesce
+#' @name coalesce
+#' @aliases coalesce,SparkDataFrame-method
+#' @seealso \link{repartition}
+#' @export
+#' @examples
+#'\dontrun{
+#' sparkR.session()
+#' path <- "path/to/file.json"
+#' df <- read.json(path)
+#' newDF <- coalesce(df, 1L)
+#'}
+#' @note coalesce(SparkDataFrame) since 2.1.1
+setMethod("coalesce",
+          signature(x = "SparkDataFrame"),
+          function(x, numPartitions) {
+            stopifnot(is.numeric(numPartitions))
+            sdf <- callJMethod(x@sdf, "coalesce", numToInt(numPartitions))
+            dataFrame(sdf)
+          })
+
 #' Repartition
 #'
 #' The following options for repartition are possible:
 #' \itemize{
-#'  \item{1.} {Return a new SparkDataFrame partitioned by
+#'  \item{1.} {Return a new SparkDataFrame that has exactly \code{numPartitions}.}
+#'  \item{2.} {Return a new SparkDataFrame hash partitioned by
 #'             the given columns into \code{numPartitions}.}
-#'  \item{2.} {Return a new SparkDataFrame that has exactly \code{numPartitions}.}
-#'  \item{3.} {Return a new SparkDataFrame partitioned by the given column(s),
+#'  \item{3.} {Return a new SparkDataFrame hash partitioned by the given column(s),
 #'             using \code{spark.sql.shuffle.partitions} as number of partitions.}
 #'}
 #' @param x a SparkDataFrame.
@@ -699,6 +738,7 @@ setMethod("storageLevel",
 #' @rdname repartition
 #' @name repartition
 #' @aliases repartition,SparkDataFrame-method
+#' @seealso \link{coalesce}
 #' @export
 #' @examples
 #'\dontrun{
@@ -937,6 +977,8 @@ setMethod("unique",
 #' Sample
 #'
 #' Return a sampled subset of this SparkDataFrame using a random seed.
+#' Note: this is not guaranteed to provide exactly the fraction specified
+#' of the total count of the given SparkDataFrame.
 #'
 #' @param x A SparkDataFrame
 #' @param withReplacement Sampling with replacement or not
@@ -1130,6 +1172,7 @@ setMethod("collect",
           if (!is.null(PRIMITIVE_TYPES[[colType]]) && colType != "binary") {
             vec <- do.call(c, col)
             stopifnot(class(vec) != "list")
+            class(vec) <- PRIMITIVE_TYPES[[colType]]
             df[[colIndex]] <- vec
           } else {
             df[[colIndex]] <- col
@@ -1709,6 +1752,23 @@ getColumn <- function(x, c) {
   column(callJMethod(x@sdf, "col", c))
 }
 
+setColumn <- function(x, c, value) {
+  if (class(value) != "Column" && !is.null(value)) {
+    if (isAtomicLengthOne(value)) {
+      value <- lit(value)
+    } else {
+      stop("value must be a Column, a literal value (atomic vector of length 1), or NULL")
+    }
+  }
+
+  if (is.null(value)) {
+    nx <- drop(x, c)
+  } else {
+    nx <- withColumn(x, c, value)
+  }
+  nx
+}
+
 #' @param name name of a Column (without being wrapped by \code{""}).
 #' @rdname select
 #' @name $
@@ -1719,20 +1779,15 @@ setMethod("$", signature(x = "SparkDataFrame"),
             getColumn(x, name)
           })
 
-#' @param value a Column or \code{NULL}. If \code{NULL}, the specified Column is dropped.
+#' @param value a Column, an atomic vector of length 1 as a literal value, or \code{NULL}.
+#' If \code{NULL}, the specified Column is dropped.
 #' @rdname select
 #' @name $<-
 #' @aliases $<-,SparkDataFrame-method
 #' @note $<- since 1.4.0
 setMethod("$<-", signature(x = "SparkDataFrame"),
           function(x, name, value) {
-            stopifnot(class(value) == "Column" || is.null(value))
-
-            if (is.null(value)) {
-              nx <- drop(x, name)
-            } else {
-              nx <- withColumn(x, name, value)
-            }
+            nx <- setColumn(x, name, value)
             x@sdf <- nx@sdf
             x
           })
@@ -1745,6 +1800,10 @@ setClassUnion("numericOrcharacter", c("numeric", "character"))
 #' @note [[ since 1.4.0
 setMethod("[[", signature(x = "SparkDataFrame", i = "numericOrcharacter"),
           function(x, i) {
+            if (length(i) > 1) {
+              warning("Subset index has length > 1. Only the first index is used.")
+              i <- i[1]
+            }
             if (is.numeric(i)) {
               cols <- columns(x)
               i <- cols[[i]]
@@ -1752,6 +1811,25 @@ setMethod("[[", signature(x = "SparkDataFrame", i = "numericOrcharacter"),
             getColumn(x, i)
           })
 
+#' @rdname subset
+#' @name [[<-
+#' @aliases [[<-,SparkDataFrame,numericOrcharacter-method
+#' @note [[<- since 2.1.1
+setMethod("[[<-", signature(x = "SparkDataFrame", i = "numericOrcharacter"),
+          function(x, i, value) {
+            if (length(i) > 1) {
+              warning("Subset index has length > 1. Only the first index is used.")
+              i <- i[1]
+            }
+            if (is.numeric(i)) {
+              cols <- columns(x)
+              i <- cols[[i]]
+            }
+            nx <- setColumn(x, i, value)
+            x@sdf <- nx@sdf
+            x
+          })
+
 #' @rdname subset
 #' @name [
 #' @aliases [,SparkDataFrame-method
@@ -1796,14 +1874,19 @@ setMethod("[", signature(x = "SparkDataFrame"),
 #' Return subsets of SparkDataFrame according to given conditions
 #' @param x a SparkDataFrame.
 #' @param i,subset (Optional) a logical expression to filter on rows.
+#' For the extract operator [[ and replacement operator [[<-, the index of a single Column.
 #' @param j,select expression for the single Column or a list of columns to select from the SparkDataFrame.
 #' @param drop if TRUE, a Column will be returned if the resulting dataset has only one column.
 #' Otherwise, a SparkDataFrame will always be returned.
+#' @param value a Column, an atomic vector of length 1 as a literal value, or \code{NULL}.
+#' If \code{NULL}, the specified Column is dropped.
 #' @param ... currently not used.
 #' @return A new SparkDataFrame containing only the rows that meet the condition with selected columns.
 #' @export
 #' @family SparkDataFrame functions
 #' @aliases subset,SparkDataFrame-method
+#' @seealso \link{withColumn}
 #' @rdname subset
 #' @name subset
 #' @family subsetting functions
@@ -1821,6 +1904,10 @@ setMethod("[", signature(x = "SparkDataFrame"),
 #' subset(df, df$age %in% c(19, 30), 1:2)
 #' subset(df, df$age %in% c(19), select = c(1,2))
 #' subset(df, select = c(1,2))
+#' # Columns can be selected and set
+#' df[["age"]] <- 23
+#' df[[1]] <- df$age
+#' df[[2]] <- NULL # drop column
 #' }
 #' @note subset since 1.5.0
 setMethod("subset", signature(x = "SparkDataFrame"),
@@ -1939,13 +2026,13 @@ setMethod("selectExpr",
 #'
 #' @param x a SparkDataFrame.
 #' @param colName a column name.
-#' @param col a Column expression.
+#' @param col a Column expression, or an atomic vector of length 1 as a literal value.
 #' @return A SparkDataFrame with the new column added or the existing column replaced.
 #' @family SparkDataFrame functions
-#' @aliases withColumn,SparkDataFrame,character,Column-method
+#' @aliases withColumn,SparkDataFrame,character-method
 #' @rdname withColumn
 #' @name withColumn
-#' @seealso \link{rename} \link{mutate}
+#' @seealso \link{rename} \link{mutate} \link{subset}
 #' @export
 #' @examples
 #'\dontrun{
@@ -1955,11 +2042,20 @@ setMethod("selectExpr",
 #' newDF <- withColumn(df, "newCol", df$col1 * 5)
 #' # Replace an existing column
 #' newDF2 <- withColumn(newDF, "newCol", newDF$col1)
+#' newDF3 <- withColumn(newDF, "newCol", 42)
+#' # Use extract operator to set an existing or new column
+#' df[["age"]] <- 23
+#' df[[2]] <- df$col1
+#' df[[2]] <- NULL # drop column
 #' }
 #' @note withColumn since 1.4.0
 setMethod("withColumn",
-          signature(x = "SparkDataFrame", colName = "character", col = "Column"),
+          signature(x = "SparkDataFrame", colName = "character"),
           function(x, colName, col) {
+            if (class(col) != "Column") {
+              if (!isAtomicLengthOne(col)) stop("Literal value must be an atomic vector of length 1")
+              col <- lit(col)
+            }
             sdf <- callJMethod(x@sdf, "withColumn", colName, col@jc)
             dataFrame(sdf)
           })
@@ -2305,9 +2401,9 @@ setMethod("dropDuplicates",
 #' @param joinExpr (Optional) The expression used to perform the join. joinExpr must be a
 #' Column expression. If joinExpr is omitted, the default, inner join is attempted and an error is
 #' thrown if it would be a Cartesian Product. For Cartesian join, use crossJoin instead.
-#' @param joinType The type of join to perform. The following join types are available:
-#' 'inner', 'outer', 'full', 'fullouter', leftouter', 'left_outer', 'left',
-#' 'right_outer', 'rightouter', 'right', and 'leftsemi'. The default joinType is "inner".
+#' @param joinType The type of join to perform, default 'inner'.
+#' Must be one of: 'inner', 'cross', 'outer', 'full', 'full_outer',
+#' 'left', 'left_outer', 'right', 'right_outer', 'left_semi', or 'left_anti'.
 #' @return A SparkDataFrame containing the result of the join operation.
 #' @family SparkDataFrame functions
 #' @aliases join,SparkDataFrame,SparkDataFrame-method
@@ -2336,15 +2432,18 @@ setMethod("join",
             if (is.null(joinType)) {
               sdf <- callJMethod(x@sdf, "join", y@sdf, joinExpr@jc)
             } else {
-              if (joinType %in% c("inner", "outer", "full", "fullouter",
-                                  "leftouter", "left_outer", "left",
-                                  "rightouter", "right_outer", "right", "leftsemi")) {
+              if (joinType %in% c("inner", "cross",
+                                  "outer", "full", "fullouter", "full_outer",
+                                  "left", "leftouter", "left_outer",
+                                  "right", "rightouter", "right_outer",
+                                  "left_semi", "leftsemi", "left_anti", "leftanti")) {
                 joinType <- gsub("_", "", joinType)
                 sdf <- callJMethod(x@sdf, "join", y@sdf, joinExpr@jc, joinType)
               } else {
                 stop("joinType must be one of the following types: ",
-                     "'inner', 'outer', 'full', 'fullouter', 'leftouter', 'left_outer', 'left',
-                     'rightouter', 'right_outer', 'right', 'leftsemi'")
+                     "'inner', 'cross', 'outer', 'full', 'full_outer', ",
+                     "'left', 'left_outer', 'right', 'right_outer', ",
+                     "'left_semi', or 'left_anti'.")
               }
             }
           }
@@ -2539,7 +2638,8 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) {
 #'
 #' Return a new SparkDataFrame containing the union of rows in this SparkDataFrame
 #' and another SparkDataFrame. This is equivalent to \code{UNION ALL} in SQL.
-#' Note that this does not remove duplicate rows across the two SparkDataFrames.
+#'
+#' Note: This does not remove duplicate rows across the two SparkDataFrames.
 #'
 #' @param x A SparkDataFrame
 #' @param y A SparkDataFrame
@@ -2582,7 +2682,8 @@ setMethod("unionAll",
 #' Union two or more SparkDataFrames
 #'
 #' Union two or more SparkDataFrames. This is equivalent to \code{UNION ALL} in SQL.
-#' Note that this does not remove duplicate rows across the two SparkDataFrames.
+#'
+#' Note: This does not remove duplicate rows across the two SparkDataFrames.
 #'
 #' @param x a SparkDataFrame.
 #' @param ... additional SparkDataFrame(s).
@@ -3381,3 +3482,26 @@ setMethod("randomSplit",
             }
             sapply(sdfs, dataFrame)
           })
+
+#' getNumPartitions
+#'
+#' Return the number of partitions
+#'
+#' @param x A SparkDataFrame
+#' @family SparkDataFrame functions
+#' @aliases getNumPartitions,SparkDataFrame-method
+#' @rdname getNumPartitions
+#' @name getNumPartitions
+#' @export
+#' @examples
+#'\dontrun{
+#' sparkR.session()
+#' df <- createDataFrame(cars, numPartitions = 2)
+#' getNumPartitions(df)
+#' }
+#' @note getNumPartitions since 2.1.1
+setMethod("getNumPartitions",
+          signature(x = "SparkDataFrame"),
+          function(x) {
+            callJMethod(callJMethod(x@sdf, "rdd"), "getNumPartitions")
+          })
diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R
index 0f1162fec1df..5667b9d78882 100644
--- a/R/pkg/R/RDD.R
+++ b/R/pkg/R/RDD.R
@@ -313,7 +313,7 @@ setMethod("checkpoint",
 #' @rdname getNumPartitions
 #' @aliases getNumPartitions,RDD-method
 #' @noRd
-setMethod("getNumPartitions",
+setMethod("getNumPartitionsRDD",
           signature(x = "RDD"),
           function(x) {
             callJMethod(getJRDD(x), "getNumPartitions")
@@ -329,7 +329,7 @@ setMethod("numPartitions",
           signature(x = "RDD"),
           function(x) {
             .Deprecated("getNumPartitions")
-            getNumPartitions(x)
+            getNumPartitionsRDD(x)
           })
 
 #' Collect elements of an RDD
@@ -460,7 +460,7 @@ setMethod("countByValue",
           signature(x = "RDD"),
           function(x) {
             ones <- lapply(x, function(item) { list(item, 1L) })
-            collectRDD(reduceByKey(ones, `+`, getNumPartitions(x)))
+            collectRDD(reduceByKey(ones, `+`, getNumPartitionsRDD(x)))
           })
 
 #' Apply a function to all elements
@@ -780,7 +780,7 @@ setMethod("takeRDD",
             resList <- list()
             index <- -1
             jrdd <- getJRDD(x)
-            numPartitions <- getNumPartitions(x)
+            numPartitions <- getNumPartitionsRDD(x)
             serializedModeRDD <- getSerializedMode(x)
 
             # TODO(shivaram): Collect more than one partition based on size
@@ -846,7 +846,7 @@ setMethod("firstRDD",
 #' @noRd
 setMethod("distinctRDD",
           signature(x = "RDD"),
-          function(x, numPartitions = SparkR:::getNumPartitions(x)) {
+          function(x, numPartitions = SparkR:::getNumPartitionsRDD(x)) {
             identical.mapped <- lapply(x, function(x) { list(x, NULL) })
             reduced <- reduceByKey(identical.mapped,
                                    function(x, y) { x },
@@ -1028,7 +1028,7 @@ setMethod("repartitionRDD",
           signature(x = "RDD"),
           function(x, numPartitions) {
             if (!is.null(numPartitions) && is.numeric(numPartitions)) {
-              coalesce(x, numPartitions, TRUE)
+              coalesceRDD(x, numPartitions, TRUE)
             } else {
               stop("Please, specify the number of partitions")
             }
@@ -1049,11 +1049,11 @@ setMethod("repartitionRDD",
 #' @rdname coalesce
 #' @aliases coalesce,RDD
 #' @noRd
-setMethod("coalesce",
+setMethod("coalesceRDD",
           signature(x = "RDD", numPartitions = "numeric"),
           function(x, numPartitions, shuffle = FALSE) {
             numPartitions <- numToInt(numPartitions)
-            if (shuffle || numPartitions > SparkR:::getNumPartitions(x)) {
+            if (shuffle || numPartitions > SparkR:::getNumPartitionsRDD(x)) {
               func <- function(partIndex, part) {
                 set.seed(partIndex)  # partIndex as seed
                 start <- as.integer(base::sample(numPartitions, 1) - 1)
@@ -1143,7 +1143,7 @@ setMethod("saveAsTextFile",
 #' @noRd
 setMethod("sortBy",
signature(x = "RDD", func = "function"), - function(x, func, ascending = TRUE, numPartitions = SparkR:::getNumPartitions(x)) { + function(x, func, ascending = TRUE, numPartitions = SparkR:::getNumPartitionsRDD(x)) { values(sortByKey(keyBy(x, func), ascending, numPartitions)) }) @@ -1175,7 +1175,7 @@ takeOrderedElem <- function(x, num, ascending = TRUE) { resList <- list() index <- -1 jrdd <- getJRDD(newRdd) - numPartitions <- getNumPartitions(newRdd) + numPartitions <- getNumPartitionsRDD(newRdd) serializedModeRDD <- getSerializedMode(newRdd) while (TRUE) { @@ -1407,7 +1407,7 @@ setMethod("setName", setMethod("zipWithUniqueId", signature(x = "RDD"), function(x) { - n <- getNumPartitions(x) + n <- getNumPartitionsRDD(x) partitionFunc <- function(partIndex, part) { mapply( @@ -1450,7 +1450,7 @@ setMethod("zipWithUniqueId", setMethod("zipWithIndex", signature(x = "RDD"), function(x) { - n <- getNumPartitions(x) + n <- getNumPartitionsRDD(x) if (n > 1) { nums <- collectRDD(lapplyPartition(x, function(part) { @@ -1566,8 +1566,8 @@ setMethod("unionRDD", setMethod("zipRDD", signature(x = "RDD", other = "RDD"), function(x, other) { - n1 <- getNumPartitions(x) - n2 <- getNumPartitions(other) + n1 <- getNumPartitionsRDD(x) + n2 <- getNumPartitionsRDD(other) if (n1 != n2) { stop("Can only zip RDDs which have the same number of partitions.") } @@ -1637,7 +1637,7 @@ setMethod("cartesian", #' @noRd setMethod("subtract", signature(x = "RDD", other = "RDD"), - function(x, other, numPartitions = SparkR:::getNumPartitions(x)) { + function(x, other, numPartitions = SparkR:::getNumPartitionsRDD(x)) { mapFunction <- function(e) { list(e, NA) } rdd1 <- map(x, mapFunction) rdd2 <- map(other, mapFunction) @@ -1671,7 +1671,7 @@ setMethod("subtract", #' @noRd setMethod("intersection", signature(x = "RDD", other = "RDD"), - function(x, other, numPartitions = SparkR:::getNumPartitions(x)) { + function(x, other, numPartitions = SparkR:::getNumPartitionsRDD(x)) { rdd1 <- map(x, function(v) { list(v, NA) }) rdd2 <- map(other, function(v) { list(v, NA) }) @@ -1714,7 +1714,7 @@ setMethod("zipPartitions", if (length(rrdds) == 1) { return(rrdds[[1]]) } - nPart <- sapply(rrdds, getNumPartitions) + nPart <- sapply(rrdds, getNumPartitionsRDD) if (length(unique(nPart)) != 1) { stop("Can only zipPartitions RDDs which have the same number of partitions.") } diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index 38d83c6e5c52..e771a057e244 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -184,8 +184,11 @@ getDefaultSqlSource <- function() { #' #' Converts R data.frame or list into SparkDataFrame. #' -#' @param data an RDD or list or data.frame. +#' @param data a list or data.frame. #' @param schema a list of column names or named list (StructType), optional. +#' @param samplingRatio Currently not used. +#' @param numPartitions the number of partitions of the SparkDataFrame. Defaults to 1, this is +#' limited by length of the list or number of rows of the data.frame #' @return A SparkDataFrame. 
 #' @rdname createDataFrame
 #' @export
 #' @examples
 #'\dontrun{
 #' sparkR.session()
 #' df1 <- as.DataFrame(iris)
 #' df2 <- as.DataFrame(list(3,4,5,6))
 #' df3 <- createDataFrame(iris)
+#' df4 <- createDataFrame(cars, numPartitions = 2)
 #' }
 #' @name createDataFrame
 #' @method createDataFrame default
 #' @note createDataFrame since 1.4.0
 # TODO(davies): support sampling and infer type from NA
-createDataFrame.default <- function(data, schema = NULL, samplingRatio = 1.0) {
+createDataFrame.default <- function(data, schema = NULL, samplingRatio = 1.0,
+                                    numPartitions = NULL) {
   sparkSession <- getSparkSession()
 
   if (is.data.frame(data)) {
@@ -233,7 +238,11 @@ createDataFrame.default <- function(data, schema = NULL, samplingRatio = 1.0) {
   if (is.list(data)) {
     sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
-    rdd <- parallelize(sc, data)
+    if (!is.null(numPartitions)) {
+      rdd <- parallelize(sc, data, numSlices = numToInt(numPartitions))
+    } else {
+      rdd <- parallelize(sc, data, numSlices = 1)
+    }
   } else if (inherits(data, "RDD")) {
     rdd <- data
   } else {
@@ -283,14 +292,13 @@ createDataFrame <- function(x, ...) {
   dispatchFunc("createDataFrame(data, schema = NULL)", x, ...)
 }
 
-#' @param samplingRatio Currently not used.
 #' @rdname createDataFrame
 #' @aliases createDataFrame
 #' @export
 #' @method as.DataFrame default
 #' @note as.DataFrame since 1.6.0
-as.DataFrame.default <- function(data, schema = NULL, samplingRatio = 1.0) {
-  createDataFrame(data, schema)
+as.DataFrame.default <- function(data, schema = NULL, samplingRatio = 1.0, numPartitions = NULL) {
+  createDataFrame(data, schema, samplingRatio, numPartitions)
 }
 
 #' @param ... additional argument(s).
@@ -634,7 +642,7 @@ tableNames <- function(x, ...) {
 cacheTable.default <- function(tableName) {
   sparkSession <- getSparkSession()
   catalog <- callJMethod(sparkSession, "catalog")
-  callJMethod(catalog, "cacheTable", tableName)
+  invisible(callJMethod(catalog, "cacheTable", tableName))
 }
 
 cacheTable <- function(x, ...) {
@@ -663,7 +671,7 @@ cacheTable <- function(x, ...) {
 uncacheTable.default <- function(tableName) {
   sparkSession <- getSparkSession()
   catalog <- callJMethod(sparkSession, "catalog")
-  callJMethod(catalog, "uncacheTable", tableName)
+  invisible(callJMethod(catalog, "uncacheTable", tableName))
 }
 
 uncacheTable <- function(x, ...) {
@@ -686,7 +694,7 @@ uncacheTable <- function(x, ...) {
 clearCache.default <- function() {
   sparkSession <- getSparkSession()
   catalog <- callJMethod(sparkSession, "catalog")
-  callJMethod(catalog, "clearCache")
+  invisible(callJMethod(catalog, "clearCache"))
 }
 
 clearCache <- function() {
@@ -730,6 +738,7 @@ dropTempTable <- function(x, ...) {
 #' If the view has been cached before, then it will also be uncached.
 #'
 #' @param viewName the name of the view to be dropped.
+#' @return TRUE if the view is dropped successfully, FALSE otherwise.
 #' @rdname dropTempView
 #' @name dropTempView
 #' @export
diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R
index 438d77a388f0..634bdcb52363 100644
--- a/R/pkg/R/context.R
+++ b/R/pkg/R/context.R
@@ -87,10 +87,20 @@ objectFile <- function(sc, path, minPartitions = NULL) {
 #' in the list are split into \code{numSlices} slices and distributed to nodes
 #' in the cluster.
 #'
-#' If size of serialized slices is larger than spark.r.maxAllocationLimit or (200MB), the function
-#' will write it to disk and send the file name to JVM. Also to make sure each slice is not
+#' If size of serialized slices is larger than spark.r.maxAllocationLimit or (200MB), the function
+#' will write it to disk and send the file name to JVM. Also to make sure each slice is not
 #' larger than that limit, number of slices may be increased.
 #'
+#' In 2.2.0 we are changing how numSlices is used/computed, to handle
+#' 1 < (length(coll) / numSlices) << length(coll) better, and to get the exact number of slices.
+#' This change affects both createDataFrame and spark.lapply.
+#' In the specific case where it is used to convert an R native object into a SparkDataFrame, it
+#' has always been kept at the default of 1. In the case the object is large, we explicitly set
+#' the parallelism to numSlices (which is still 1).
+#'
+#' Specifically, we are changing the split positions to match the calculation in positions() of
+#' ParallelCollectionRDD in Spark.
+#'
 #' @param sc SparkContext to use
 #' @param coll collection to parallelize
 #' @param numSlices number of partitions to create in the RDD
@@ -107,6 +117,8 @@ parallelize <- function(sc, coll, numSlices = 1) {
   # TODO: bound/safeguard numSlices
   # TODO: unit tests for if the split works for all primitives
   # TODO: support matrix, data frame, etc
+
+  # Note: for data.frame, createDataFrame turns it into a list before it calls here.
   # nolint start
   # suppress lintr warning: Place a space before left parenthesis, except in a function call.
   if ((!is.list(coll) && !is.vector(coll)) || is.data.frame(coll)) {
@@ -128,12 +140,29 @@ parallelize <- function(sc, coll, numSlices = 1) {
   objectSize <- object.size(coll)
 
   # For large objects we make sure the size of each slice is also smaller than sizeLimit
-  numSlices <- max(numSlices, ceiling(objectSize / sizeLimit))
-  if (numSlices > length(coll))
-    numSlices <- length(coll)
+  numSerializedSlices <- max(numSlices, ceiling(objectSize / sizeLimit))
+  if (numSerializedSlices > length(coll))
+    numSerializedSlices <- length(coll)
+
+  # Generate the slice ids to put each row
+  # For instance, for numSerializedSlices of 22, length of 50
+  #  [1]  0  0  2  2  4  4  6  6  6  9  9 11 11 13 13 15 15 15 18 18 20 20 22 22 22
+  # [26] 25 25 27 27 29 29 31 31 31 34 34 36 36 38 38 40 40 40 43 43 45 45 47 47 47
+  # Notice the slice groups with 3 slices (i.e. 6, 15, 22) are roughly evenly spaced.
+  # We are trying to reimplement the calculation in the positions method in ParallelCollectionRDD
+  splits <- if (numSerializedSlices > 0) {
+    unlist(lapply(0: (numSerializedSlices - 1), function(x) {
+      # nolint start
+      start <- trunc((x * length(coll)) / numSerializedSlices)
+      end <- trunc(((x + 1) * length(coll)) / numSerializedSlices)
+      # nolint end
+      rep(start, end - start)
+    }))
+  } else {
+    1
+  }
 
-  sliceLen <- ceiling(length(coll) / numSlices)
-  slices <- split(coll, rep(1: (numSlices + 1), each = sliceLen)[1:length(coll)])
+  slices <- split(coll, splits)
 
   # Serialize each slice: obtain a list of raws, or a list of lists (slices) of
   # 2-tuples of raws
@@ -301,7 +330,13 @@ spark.addFile <- function(path, recursive = FALSE) {
 #'}
 #' @note spark.getSparkFilesRootDirectory since 2.1.0
 spark.getSparkFilesRootDirectory <- function() {
-  callJStatic("org.apache.spark.SparkFiles", "getRootDirectory")
+  if (Sys.getenv("SPARKR_IS_RUNNING_ON_WORKER") == "") {
+    # Running on driver.
+    callJStatic("org.apache.spark.SparkFiles", "getRootDirectory")
+  } else {
+    # Running on worker.
+    Sys.getenv("SPARKR_SPARKFILES_ROOT_DIR")
+  }
 }
 
 #' Get the absolute path of a file added through spark.addFile.
@@ -316,7 +351,13 @@ spark.getSparkFilesRootDirectory <- function() {
 #'}
 #' @note spark.getSparkFiles since 2.1.0
 spark.getSparkFiles <- function(fileName) {
-  callJStatic("org.apache.spark.SparkFiles", "get", as.character(fileName))
+  if (Sys.getenv("SPARKR_IS_RUNNING_ON_WORKER") == "") {
+    # Running on driver.
+    callJStatic("org.apache.spark.SparkFiles", "get", as.character(fileName))
+  } else {
+    # Running on worker.
+    file.path(spark.getSparkFilesRootDirectory(), as.character(fileName))
+  }
 }
 
 #' Run a function over a list of elements, distributing the computations with Spark
@@ -379,5 +420,5 @@ spark.lapply <- function(list, func) {
 #' @note setLogLevel since 2.0.0
 setLogLevel <- function(level) {
   sc <- getSparkContext()
-  callJMethod(sc, "setLogLevel", level)
+  invisible(callJMethod(sc, "setLogLevel", level))
 }
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 4d94b4cd05d4..2992baee6809 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -286,6 +286,28 @@ setMethod("ceil",
             column(jc)
           })
 
+#' Returns the first column that is not NA
+#'
+#' Returns the first column that is not NA, or NA if all inputs are.
+#'
+#' @rdname coalesce
+#' @name coalesce
+#' @family normal_funcs
+#' @export
+#' @aliases coalesce,Column-method
+#' @examples \dontrun{coalesce(df$c, df$d, df$e)}
+#' @note coalesce(Column) since 2.1.1
+setMethod("coalesce",
+          signature(x = "Column"),
+          function(x, ...) {
+            jcols <- lapply(list(x, ...), function (x) {
+              stopifnot(class(x) == "Column")
+              x@jc
+            })
+            jc <- callJStatic("org.apache.spark.sql.functions", "coalesce", jcols)
+            column(jc)
+          })
+
 #' Though scala functions has "col" function, we don't expose it in SparkR
 #' because we don't want to conflict with the "col" function in the R base
 #' package and we also have "column" function exported which is an alias of "col".
 col <- function(x) {
 #' Returns a Column based on the given column name
 #'
 #' Returns a Column based on the given column name.
-#
+#'
 #' @param x Character column name.
 #'
 #' @rdname column
 #' @name column
 #' @family normal_funcs
 #' @export
 #' @aliases column,character-method
-#' @examples \dontrun{column(df)}
+#' @examples \dontrun{column("name")}
 #' @note column since 1.6.0
 setMethod("column",
           signature(x = "character"),
@@ -1485,7 +1507,7 @@ setMethod("soundex",
 
 #' Return the partition ID as a column
 #'
-#' Return the partition ID of the Spark task as a SparkDataFrame column.
+#' Return the partition ID as a SparkDataFrame column.
 #' Note that this is nondeterministic because it depends on data partitioning and
 #' task scheduling.
 #'
@@ -2296,7 +2318,7 @@ setMethod("n", signature(x = "Column"),
 #' A pattern could be for instance \preformatted{dd.MM.yyyy} and could return a string like '18.03.1993'. All
 #' pattern letters of \code{java.text.SimpleDateFormat} can be used.
 #'
-#' NOTE: Use when ever possible specialized functions like \code{year}. These benefit from a
+#' Note: Whenever possible, use specialized functions like \code{year}. These benefit from a
 #' specialized implementation.
 #'
 #' @param y Column to compute on.
@@ -2317,7 +2339,8 @@ setMethod("date_format", signature(y = "Column", x = "character"),
 
 #' from_utc_timestamp
 #'
-#' Assumes given timestamp is UTC and converts to given timezone.
+#' Given a timestamp, which corresponds to a certain time of day in UTC, returns another timestamp
+#' that corresponds to the same time of day in the given timezone.
 #'
 #' @param y Column to compute on.
 #' @param x time zone to use.
@@ -2340,7 +2363,7 @@ setMethod("from_utc_timestamp", signature(y = "Column", x = "character"),
 #' Locate the position of the first occurrence of substr column in the given string.
 #' Returns null if either of the arguments are null.
 #'
-#' NOTE: The position is not zero based, but 1 based index, returns 0 if substr
+#' Note: The position is not zero based, but 1 based index. Returns 0 if substr
 #' could not be found in str.
 #'
 #' @param y column to check
@@ -2391,7 +2414,8 @@ setMethod("next_day", signature(y = "Column", x = "character"),
 
 #' to_utc_timestamp
 #'
-#' Assumes given timestamp is in given timezone and converts to UTC.
+#' Given a timestamp, which corresponds to a certain time of day in the given timezone, returns
+#' another timestamp that corresponds to the same time of day in UTC.
 #'
 #' @param y Column to compute on
 #' @param x timezone to use
@@ -2539,7 +2563,7 @@ setMethod("shiftLeft", signature(y = "Column", x = "numeric"),
 
 #' shiftRight
 #'
-#' Shift the given value numBits right. If the given value is a long value, it will return
+#' (Signed) shift the given value numBits right. If the given value is a long value, it will return
 #' a long value else it will return an integer value.
 #'
 #' @param y column to compute on.
@@ -2777,7 +2801,8 @@ setMethod("window", signature(x = "Column"),
 #' locate
 #'
 #' Locate the position of the first occurrence of substr.
-#' NOTE: The position is not zero based, but 1 based index, returns 0 if substr
+#'
+#' Note: The position is not zero based, but 1 based index. Returns 0 if substr
 #' could not be found in str.
 #'
 #' @param substr a character string to be matched.
@@ -2823,7 +2848,7 @@ setMethod("lpad", signature(x = "Column", len = "numeric", pad = "character"),
 
 #' rand
 #'
-#' Generate a random column with i.i.d. samples from U[0.0, 1.0].
+#' Generate a random column with independent and identically distributed (i.i.d.) samples
+#' from U[0.0, 1.0].
 #'
 #' @param seed a random seed. Can be missing.
 #' @family normal_funcs
@@ -2852,7 +2878,8 @@ setMethod("rand", signature(seed = "numeric"),
 
 #' randn
 #'
-#' Generate a column with i.i.d. samples from the standard normal distribution.
+#' Generate a column with independent and identically distributed (i.i.d.) samples from
+#' the standard normal distribution.
 #'
 #' @param seed a random seed. Can be missing.
 #' @family normal_funcs
@@ -3145,7 +3172,8 @@ setMethod("cume_dist",
 #' The difference between rank and dense_rank is that dense_rank leaves no gaps in ranking
 #' sequence when there are ties. That is, if you were ranking a competition using dense_rank
 #' and had three people tie for second place, you would say that all three were in second
-#' place and that the next person came in third.
+#' place and that the next person came in third. Rank would give sequential numbers, so
+#' the person that came in third place (after the ties) would register as coming in fifth.
 #'
 #' This is equivalent to the \code{DENSE_RANK} function in SQL.
 #'
@@ -3316,10 +3344,11 @@ setMethod("percent_rank",
 #'
 #' Window function: returns the rank of rows within a window partition.
 #'
-#' The difference between rank and denseRank is that denseRank leaves no gaps in ranking
-#' sequence when there are ties. That is, if you were ranking a competition using denseRank
+#' The difference between rank and dense_rank is that dense_rank leaves no gaps in ranking
+#' sequence when there are ties. That is, if you were ranking a competition using dense_rank
 #' and had three people tie for second place, you would say that all three were in second
-#' place and that the next person came in third.
+#' place and that the next person came in third. Rank would give sequential numbers, so
+#' the person that came in third place (after the ties) would register as coming in fifth.
 #'
 #' This is equivalent to the RANK function in SQL.
 #'
@@ -3442,8 +3471,8 @@ setMethod("size",
 
 #' sort_array
 #'
-#' Sorts the input array for the given column in ascending order,
-#' according to the natural ordering of the array elements.
+#' Sorts the input array in ascending or descending order according
+#' to the natural ordering of the array elements.
 #'
 #' @param x A Column to sort
 #' @param asc A logical flag indicating the sorting order.
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 0271b26a10a9..f018ec9e46a6 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -28,7 +28,7 @@ setGeneric("cacheRDD", function(x) { standardGeneric("cacheRDD") })
 # @rdname coalesce
 # @seealso repartition
 # @export
-setGeneric("coalesce", function(x, numPartitions, ...) { standardGeneric("coalesce") })
+setGeneric("coalesceRDD", function(x, numPartitions, ...) { standardGeneric("coalesceRDD") })
 
 # @rdname checkpoint-methods
 # @export
@@ -138,9 +138,9 @@ setGeneric("sumRDD", function(x) { standardGeneric("sumRDD") })
 # @export
 setGeneric("name", function(x) { standardGeneric("name") })
 
-# @rdname getNumPartitions
+# @rdname getNumPartitionsRDD
 # @export
-setGeneric("getNumPartitions", function(x) { standardGeneric("getNumPartitions") })
+setGeneric("getNumPartitionsRDD", function(x) { standardGeneric("getNumPartitionsRDD") })
 
 # @rdname getNumPartitions
 # @export
@@ -406,6 +406,13 @@ setGeneric("attach")
 #' @export
 setGeneric("cache", function(x) { standardGeneric("cache") })
 
+#' @rdname coalesce
+#' @param x a Column or a SparkDataFrame.
+#' @param ... additional argument(s). If \code{x} is a Column, additional Columns can be optionally
+#' provided.
+#' @export
+setGeneric("coalesce", function(x, ...) { standardGeneric("coalesce") })
+
 #' @rdname collect
 #' @export
 setGeneric("collect", function(x, ...) { standardGeneric("collect") })
@@ -492,6 +499,10 @@ setGeneric("gapply", function(x, ...) { standardGeneric("gapply") })
 #' @export
 setGeneric("gapplyCollect", function(x, ...) { standardGeneric("gapplyCollect") })
 
+# @rdname getNumPartitions
+# @export
+setGeneric("getNumPartitions", function(x) { standardGeneric("getNumPartitions") })
+
 #' @rdname summary
 #' @export
 setGeneric("describe", function(x, col, ...) { standardGeneric("describe") })
@@ -1343,6 +1354,10 @@ setGeneric("spark.als", function(data, ...) { standardGeneric("spark.als") })
 setGeneric("spark.gaussianMixture",
            function(data, formula, ...) { standardGeneric("spark.gaussianMixture") })
 
+#' @rdname spark.gbt
+#' @export
+setGeneric("spark.gbt", function(data, formula, ...) { standardGeneric("spark.gbt") })
+
 #' @rdname spark.glm
 #' @export
 setGeneric("spark.glm", function(data, formula, ...) { standardGeneric("spark.glm") })
@@ -1369,7 +1384,7 @@ setGeneric("spark.logit", function(data, formula, ...) { standardGeneric("spark.
 
 #' @rdname spark.mlp
 #' @export
-setGeneric("spark.mlp", function(data, ...) { standardGeneric("spark.mlp") })
+setGeneric("spark.mlp", function(data, formula, ...) { standardGeneric("spark.mlp") })
{ standardGeneric("spark.mlp") }) #' @rdname spark.naiveBayes #' @export diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index 69b0a523b84e..4ca7aa664e02 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -21,9 +21,9 @@ #' Download and Install Apache Spark to a Local Directory #' #' \code{install.spark} downloads and installs Spark to a local directory if -#' it is not found. The Spark version we use is the same as the SparkR version. -#' Users can specify a desired Hadoop version, the remote mirror site, and -#' the directory where the package is installed locally. +#' it is not found. If SPARK_HOME is set in the environment, and that directory is found, that is +#' returned. The Spark version we use is the same as the SparkR version. Users can specify a desired +#' Hadoop version, the remote mirror site, and the directory where the package is installed locally. #' #' The full url of remote file is inferred from \code{mirrorUrl} and \code{hadoopVersion}. #' \code{mirrorUrl} specifies the remote path to a Spark folder. It is followed by a subfolder @@ -50,11 +50,11 @@ #' \itemize{ #' \item Mac OS X: \file{~/Library/Caches/spark} #' \item Unix: \env{$XDG_CACHE_HOME} if defined, otherwise \file{~/.cache/spark} -#' \item Windows: \file{\%LOCALAPPDATA\%\\spark\\spark\\Cache}. +#' \item Windows: \file{\%LOCALAPPDATA\%\\Apache\\Spark\\Cache}. #' } #' @param overwrite If \code{TRUE}, download and overwrite the existing tar file in localDir #' and force re-install Spark (in case the local directory or file is corrupted) -#' @return \code{install.spark} returns the local directory where Spark is found or installed +#' @return the (invisible) local directory where Spark is found or installed #' @rdname install.spark #' @name install.spark #' @aliases install.spark @@ -68,6 +68,16 @@ #' \href{http://spark.apache.org/downloads.html}{Apache Spark} install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, localDir = NULL, overwrite = FALSE) { + sparkHome <- Sys.getenv("SPARK_HOME") + if (isSparkRShell()) { + stopifnot(nchar(sparkHome) > 0) + message("Spark is already running in sparkR shell.") + return(invisible(sparkHome)) + } else if (!is.na(file.info(sparkHome)$isdir)) { + message("Spark package found in SPARK_HOME: ", sparkHome) + return(invisible(sparkHome)) + } + version <- paste0("spark-", packageVersion("SparkR")) hadoopVersion <- tolower(hadoopVersion) hadoopVersionName <- hadoopVersionName(hadoopVersion) @@ -79,19 +89,28 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, dir.create(localDir, recursive = TRUE) } - packageLocalDir <- file.path(localDir, packageName) - if (overwrite) { message(paste0("Overwrite = TRUE: download and overwrite the tar file", "and Spark package directory if they exist.")) } + releaseUrl <- Sys.getenv("SPARKR_RELEASE_DOWNLOAD_URL") + if (releaseUrl != "") { + packageName <- basenameSansExtFromUrl(releaseUrl) + } + + packageLocalDir <- file.path(localDir, packageName) + # can use dir.exists(packageLocalDir) under R 3.2.0 or later if (!is.na(file.info(packageLocalDir)$isdir) && !overwrite) { - fmt <- "%s for Hadoop %s found, with SPARK_HOME set to %s" - msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion), - packageLocalDir) - message(msg) + if (releaseUrl != "") { + message(paste(packageName, "found, setting SPARK_HOME to", packageLocalDir)) + } else { + fmt <- "%s for Hadoop %s found, setting SPARK_HOME to %s" + msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", "Free build", 
hadoopVersion), + packageLocalDir) + message(msg) + } Sys.setenv(SPARK_HOME = packageLocalDir) return(invisible(packageLocalDir)) } else { @@ -104,14 +123,37 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, if (tarExists && !overwrite) { message("tar file found.") } else { - robustDownloadTar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) + if (releaseUrl != "") { + message("Downloading from alternate URL:\n- ", releaseUrl) + success <- downloadUrl(releaseUrl, packageLocalPath) + if (!success) { + unlink(packageLocalPath) + stop(paste0("Fetch failed from ", releaseUrl)) + } + } else { + robustDownloadTar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) + } } message(sprintf("Installing to %s", localDir)) - untar(tarfile = packageLocalPath, exdir = localDir) - if (!tarExists || overwrite) { + # There are two ways untar can fail - untar could stop() on errors like incomplete block on file + # or, tar command can return failure code + success <- tryCatch(untar(tarfile = packageLocalPath, exdir = localDir) == 0, + error = function(e) { + message(e) + message() + FALSE + }, + warning = function(w) { + # Treat warning as error, add an empty line with message() + message(w) + message() + FALSE + }) + if (!tarExists || overwrite || !success) { unlink(packageLocalPath) } + if (!success) stop("Extract archive failed.") message("DONE.") Sys.setenv(SPARK_HOME = packageLocalDir) message(paste("SPARK_HOME set to", packageLocalDir)) @@ -121,8 +163,7 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, robustDownloadTar <- function(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) { # step 1: use user-provided url if (!is.null(mirrorUrl)) { - msg <- sprintf("Use user-provided mirror site: %s.", mirrorUrl) - message(msg) + message("Use user-provided mirror site: ", mirrorUrl) success <- directDownloadTar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) if (success) { @@ -142,7 +183,7 @@ robustDownloadTar <- function(mirrorUrl, version, hadoopVersion, packageName, pa packageName, packageLocalPath) if (success) return() } else { - message("Unable to find preferred mirror site.") + message("Unable to download from preferred mirror site: ", mirrorUrl) } # step 3: use backup option @@ -151,8 +192,11 @@ robustDownloadTar <- function(mirrorUrl, version, hadoopVersion, packageName, pa success <- directDownloadTar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) if (success) { - return(packageLocalPath) + return() } else { + # remove any partially downloaded file + unlink(packageLocalPath) + message("Unable to download from default mirror site: ", mirrorUrl) msg <- sprintf(paste("Unable to download Spark %s for Hadoop %s.", "Please check network connection, Hadoop version,", "or provide other mirror sites."), @@ -182,17 +226,25 @@ getPreferredMirror <- function(version, packageName) { } directDownloadTar <- function(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) { - packageRemotePath <- paste0( - file.path(mirrorUrl, version, packageName), ".tgz") + packageRemotePath <- paste0(file.path(mirrorUrl, version, packageName), ".tgz") fmt <- "Downloading %s for Hadoop %s from:\n- %s" msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion), packageRemotePath) message(msg) + downloadUrl(packageRemotePath, packageLocalPath) +} - isFail <- tryCatch(download.file(packageRemotePath, packageLocalPath), +downloadUrl <- function(remotePath, localPath) { 
+ isFail <- tryCatch(download.file(remotePath, localPath), error = function(e) { - message(sprintf("Fetch failed from %s", mirrorUrl)) - print(e) + message(e) + message() + TRUE + }, + warning = function(w) { + # Treat warning as error, add an empty line with message() + message(w) + message() TRUE }) !isFail @@ -218,12 +270,11 @@ sparkCachePath <- function() { if (.Platform$OS.type == "windows") { winAppPath <- Sys.getenv("LOCALAPPDATA", unset = NA) if (is.na(winAppPath)) { - msg <- paste("%LOCALAPPDATA% not found.", + stop(paste("%LOCALAPPDATA% not found.", "Please define the environment variable", - "or restart and enter an installation path in localDir.") - stop(msg) + "or restart and enter an installation path in localDir.")) } else { - path <- file.path(winAppPath, "spark", "spark", "Cache") + path <- file.path(winAppPath, "Apache", "Spark", "Cache") } } else if (.Platform$OS.type == "unix") { if (Sys.info()["sysname"] == "Darwin") { diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 7a220b8d53a2..1ddfa30d7fa7 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -116,6 +116,20 @@ setClass("RandomForestRegressionModel", representation(jobj = "jobj")) #' @note RandomForestClassificationModel since 2.1.0 setClass("RandomForestClassificationModel", representation(jobj = "jobj")) +#' S4 class that represents a GBTRegressionModel +#' +#' @param jobj a Java object reference to the backing Scala GBTRegressionModel +#' @export +#' @note GBTRegressionModel since 2.1.0 +setClass("GBTRegressionModel", representation(jobj = "jobj")) + +#' S4 class that represents a GBTClassificationModel +#' +#' @param jobj a Java object reference to the backing Scala GBTClassificationModel +#' @export +#' @note GBTClassificationModel since 2.1.0 +setClass("GBTClassificationModel", representation(jobj = "jobj")) + #' Saves the MLlib model to the input path #' #' Saves the MLlib model to the input path. For more information, see the specific @@ -124,7 +138,8 @@ setClass("RandomForestClassificationModel", representation(jobj = "jobj")) #' @name write.ml #' @export #' @seealso \link{spark.glm}, \link{glm}, -#' @seealso \link{spark.als}, \link{spark.gaussianMixture}, \link{spark.isoreg}, \link{spark.kmeans}, +#' @seealso \link{spark.als}, \link{spark.gaussianMixture}, \link{spark.gbt}, \link{spark.isoreg}, +#' @seealso \link{spark.kmeans}, #' @seealso \link{spark.lda}, \link{spark.logit}, \link{spark.mlp}, \link{spark.naiveBayes}, #' @seealso \link{spark.randomForest}, \link{spark.survreg}, #' @seealso \link{read.ml} @@ -138,7 +153,8 @@ NULL #' @name predict #' @export #' @seealso \link{spark.glm}, \link{glm}, -#' @seealso \link{spark.als}, \link{spark.gaussianMixture}, \link{spark.isoreg}, \link{spark.kmeans}, +#' @seealso \link{spark.als}, \link{spark.gaussianMixture}, \link{spark.gbt}, \link{spark.isoreg}, +#' @seealso \link{spark.kmeans}, #' @seealso \link{spark.logit}, \link{spark.mlp}, \link{spark.naiveBayes}, #' @seealso \link{spark.randomForest}, \link{spark.survreg} NULL @@ -157,7 +173,7 @@ predict_internal <- function(object, newData) { #' Generalized Linear Models #' -#' Fits generalized linear model against a Spark DataFrame. +#' Fits generalized linear model against a SparkDataFrame. #' Users can call \code{summary} to print a summary of the fitted model, \code{predict} to make #' predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models. 
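As an aside, the fit/summarize/predict cycle described above can be sketched as follows; this is illustrative only, assuming an active sparkR.session() and the iris data:

# Hedged sketch of the spark.glm workflow (not part of this patch).
df <- createDataFrame(iris)    # '.' in column names becomes '_' (e.g. Sepal_Length)
model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = "gaussian")
summary(model)                 # coefficients, deviance, AIC, IRLS iterations
head(predict(model, df))       # predictions land in a "prediction" column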
#' @@ -168,6 +184,8 @@ predict_internal <- function(object, newData) { #' This can be a character string naming a family function, a family function or #' the result of a call to a family function. Refer to the R family documentation at #' \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}. +#' Currently these families are supported: \code{binomial}, \code{gaussian}, +#' \code{Gamma}, and \code{poisson}. #' @param tol positive convergence tolerance of iterations. #' @param maxIter integer giving the maximal number of IRLS iterations. #' @param weightCol the weight column name. If this is not set or \code{NULL}, we treat all instance @@ -175,7 +193,7 @@ predict_internal <- function(object, newData) { #' @param regParam regularization parameter for L2 regularization. #' @param ... additional arguments passed to the method. #' @aliases spark.glm,SparkDataFrame,formula-method -#' @return \code{spark.glm} returns a fitted generalized linear model +#' @return \code{spark.glm} returns a fitted generalized linear model. #' @rdname spark.glm #' @name spark.glm #' @export @@ -220,8 +238,9 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), weightCol <- "" } + # For known families, Gamma is upper-cased jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper", - "fit", formula, data@sdf, family$family, family$link, + "fit", formula, data@sdf, tolower(family$family), family$link, tol, as.integer(maxIter), as.character(weightCol), regParam) new("GeneralizedLinearRegressionModel", jobj = jobj) }) @@ -236,6 +255,8 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), #' This can be a character string naming a family function, a family function or #' the result of a call to a family function. Refer to the R family documentation at #' \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}. +#' Currently these families are supported: \code{binomial}, \code{gaussian}, +#' \code{Gamma}, and \code{poisson}. #' @param weightCol the weight column name. If this is not set or \code{NULL}, we treat all instance #' weights as 1.0. #' @param epsilon positive convergence tolerance of iterations. @@ -261,10 +282,12 @@ setMethod("glm", signature(formula = "formula", family = "ANY", data = "SparkDat # Returns the summary of a model produced by glm() or spark.glm(), similarly to R's summary(). #' @param object a fitted generalized linear model. -#' @return \code{summary} returns a summary object of the fitted model, a list of components -#' including at least the coefficients, null/residual deviance, null/residual degrees -#' of freedom, AIC and number of iterations IRLS takes. -#' +#' @return \code{summary} returns summary information of the fitted model, which is a list. +#' The list of components includes at least the \code{coefficients} (coefficients matrix, which includes +#' coefficients, standard error of coefficients, t value and p value), +#' \code{null.deviance}/\code{deviance} (null/residual deviance), +#' \code{df.null}/\code{df.residual} (null/residual degrees of freedom), \code{aic} (AIC) +#' and \code{iter} (number of iterations IRLS takes). If there are collinear columns in the data, +#' the coefficients matrix only provides coefficients. #' @rdname spark.glm #' @export #' @note summary(GeneralizedLinearRegressionModel) since 2.0.0 @@ -287,9 +310,18 @@ setMethod("summary", signature(object = "GeneralizedLinearRegressionModel"), } else { dataFrame(callJMethod(jobj, "rDevianceResiduals")) } - coefficients <- matrix(coefficients, ncol = 4) - colnames(coefficients) <- c("Estimate", "Std.
Error", "t value", "Pr(>|t|)") - rownames(coefficients) <- unlist(features) + # If the underlying WeightedLeastSquares using "normal" solver, we can provide + # coefficients, standard error of coefficients, t value and p value. Otherwise, + # it will be fitted by local "l-bfgs", we can only provide coefficients. + if (length(features) == length(coefficients)) { + coefficients <- matrix(coefficients, ncol = 1) + colnames(coefficients) <- c("Estimate") + rownames(coefficients) <- unlist(features) + } else { + coefficients <- matrix(coefficients, ncol = 4) + colnames(coefficients) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)") + rownames(coefficients) <- unlist(features) + } ans <- list(deviance.resid = deviance.resid, coefficients = coefficients, dispersion = dispersion, null.deviance = null.deviance, deviance = deviance, df.null = df.null, df.residual = df.residual, @@ -301,7 +333,7 @@ setMethod("summary", signature(object = "GeneralizedLinearRegressionModel"), # Prints the summary of GeneralizedLinearRegressionModel #' @rdname spark.glm -#' @param x summary object of fitted generalized linear model returned by \code{summary} function +#' @param x summary object of fitted generalized linear model returned by \code{summary} function. #' @export #' @note print.summary.GeneralizedLinearRegressionModel since 2.0.0 print.summary.GeneralizedLinearRegressionModel <- function(x, ...) { @@ -334,7 +366,7 @@ print.summary.GeneralizedLinearRegressionModel <- function(x, ...) { #' @param newData a SparkDataFrame for testing. #' @return \code{predict} returns a SparkDataFrame containing predicted labels in a column named -#' "prediction" +#' "prediction". #' @rdname spark.glm #' @export #' @note predict(GeneralizedLinearRegressionModel) since 1.5.0 @@ -348,7 +380,7 @@ setMethod("predict", signature(object = "GeneralizedLinearRegressionModel"), #' @param newData a SparkDataFrame for testing. #' @return \code{predict} returns a SparkDataFrame containing predicted labeled in a column named -#' "prediction" +#' "prediction". #' @rdname spark.naiveBayes #' @export #' @note predict(NaiveBayesModel) since 2.0.0 @@ -360,8 +392,9 @@ setMethod("predict", signature(object = "NaiveBayesModel"), # Returns the summary of a naive Bayes model produced by \code{spark.naiveBayes} #' @param object a naive Bayes model fitted by \code{spark.naiveBayes}. -#' @return \code{summary} returns a list containing \code{apriori}, the label distribution, and -#' \code{tables}, conditional probabilities given the target label. +#' @return \code{summary} returns summary information of the fitted model, which is a list. +#' The list includes \code{apriori} (the label distribution) and +#' \code{tables} (conditional probabilities given the target label). #' @rdname spark.naiveBayes #' @export #' @note summary(NaiveBayesModel) since 2.0.0 @@ -382,9 +415,9 @@ setMethod("summary", signature(object = "NaiveBayesModel"), # Returns posterior probabilities from a Latent Dirichlet Allocation model produced by spark.lda() -#' @param newData A SparkDataFrame for testing +#' @param newData A SparkDataFrame for testing. #' @return \code{spark.posterior} returns a SparkDataFrame containing posterior probabilities -#' vectors named "topicDistribution" +#' vectors named "topicDistribution". 
#' @rdname spark.lda #' @aliases spark.posterior,LDAModel,SparkDataFrame-method #' @export @@ -398,7 +431,8 @@ setMethod("spark.posterior", signature(object = "LDAModel", newData = "SparkData #' @param object A Latent Dirichlet Allocation model fitted by \code{spark.lda}. #' @param maxTermsPerTopic Maximum number of terms to collect for each topic. Default value of 10. -#' @return \code{summary} returns a list containing +#' @return \code{summary} returns summary information of the fitted model, which is a list. +#' The list includes #' \item{\code{docConcentration}}{concentration parameter commonly named \code{alpha} for #' the prior placed on documents distributions over topics \code{theta}} #' \item{\code{topicConcentration}}{concentration parameter commonly named \code{beta} or @@ -449,7 +483,7 @@ setMethod("spark.perplexity", signature(object = "LDAModel", data = "SparkDataFr # Saves the Latent Dirichlet Allocation model to the input path. -#' @param path The directory where the model is saved +#' @param path The directory where the model is saved. #' @param overwrite Overwrites or not if the output path already exists. Default is FALSE #' which means throw exception if the output path exists. #' @@ -465,19 +499,19 @@ setMethod("write.ml", signature(object = "LDAModel", path = "character"), #' Isotonic Regression Model #' -#' Fits an Isotonic Regression model against a Spark DataFrame, similarly to R's isoreg(). +#' Fits an Isotonic Regression model against a SparkDataFrame, similarly to R's isoreg(). #' Users can print, make predictions on the produced model and save the model to the input path. #' -#' @param data SparkDataFrame for training +#' @param data SparkDataFrame for training. #' @param formula A symbolic description of the model to be fitted. Currently only a few formula #' operators are supported, including '~', '.', ':', '+', and '-'. #' @param isotonic Whether the output sequence should be isotonic/increasing (TRUE) or -#' antitonic/decreasing (FALSE) +#' antitonic/decreasing (FALSE). #' @param featureIndex The index of the feature if \code{featuresCol} is a vector column -#' (default: 0), no effect otherwise +#' (default: 0), no effect otherwise. #' @param weightCol The weight column name. #' @param ... additional arguments passed to the method. -#' @return \code{spark.isoreg} returns a fitted Isotonic Regression model +#' @return \code{spark.isoreg} returns a fitted Isotonic Regression model. #' @rdname spark.isoreg #' @aliases spark.isoreg,SparkDataFrame,formula-method #' @name spark.isoreg @@ -509,7 +543,7 @@ setMethod("write.ml", signature(object = "LDAModel", path = "character"), #' @note spark.isoreg since 2.1.0 setMethod("spark.isoreg", signature(data = "SparkDataFrame", formula = "formula"), function(data, formula, isotonic = TRUE, featureIndex = 0, weightCol = NULL) { - formula <- paste0(deparse(formula), collapse = "") + formula <- paste(deparse(formula), collapse = "") if (is.null(weightCol)) { weightCol <- "" @@ -523,9 +557,9 @@ setMethod("spark.isoreg", signature(data = "SparkDataFrame", formula = "formula" # Predicted values based on an isotonicRegression model -#' @param object a fitted IsotonicRegressionModel -#' @param newData SparkDataFrame for testing -#' @return \code{predict} returns a SparkDataFrame containing predicted values +#' @param object a fitted IsotonicRegressionModel. +#' @param newData SparkDataFrame for testing. +#' @return \code{predict} returns a SparkDataFrame containing predicted values. 
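For reference, a minimal spark.isoreg round trip consistent with the parameters documented above, on illustrative toy data:

# Hedged sketch: isotonic fit on a tiny SparkDataFrame.
df <- createDataFrame(data.frame(feature = 1:5, label = c(1, 2, 2, 5, 4)))
model <- spark.isoreg(df, label ~ feature, isotonic = TRUE)
summary(model)            # list with $boundaries and $predictions
head(predict(model, df))  # monotone fitted values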
#' @rdname spark.isoreg #' @aliases predict,IsotonicRegressionModel,SparkDataFrame-method #' @export @@ -537,7 +571,9 @@ setMethod("predict", signature(object = "IsotonicRegressionModel"), # Get the summary of an IsotonicRegressionModel model -#' @return \code{summary} returns the model's boundaries and prediction as lists +#' @return \code{summary} returns summary information of the fitted model, which is a list. +#' The list includes the model's \code{boundaries} (boundaries in increasing order) +#' and \code{predictions} (predictions associated with the boundaries at the same index). #' @rdname spark.isoreg #' @aliases summary,IsotonicRegressionModel-method #' @export @@ -552,7 +588,7 @@ setMethod("summary", signature(object = "IsotonicRegressionModel"), #' K-Means Clustering Model #' -#' Fits a k-means clustering model against a Spark DataFrame, similarly to R's kmeans(). +#' Fits a k-means clustering model against a SparkDataFrame, similarly to R's kmeans(). #' Users can call \code{summary} to print a summary of the fitted model, \code{predict} to make #' predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models. #' @@ -563,6 +599,10 @@ setMethod("summary", signature(object = "IsotonicRegressionModel"), #' @param k number of centers. #' @param maxIter maximum iteration number. #' @param initMode the initialization algorithm chosen to fit the model. +#' @param seed the random seed for cluster initialization. +#' @param initSteps the number of steps for the k-means|| initialization mode. +#' This is an advanced setting; the default of 2 is almost always enough. Must be > 0. +#' @param tol convergence tolerance of iterations. #' @param ... additional argument(s) passed to the method. #' @return \code{spark.kmeans} returns a fitted k-means model. #' @rdname spark.kmeans @@ -592,11 +632,16 @@ setMethod("summary", signature(object = "IsotonicRegressionModel"), #' @note spark.kmeans since 2.0.0 #' @seealso \link{predict}, \link{read.ml}, \link{write.ml} setMethod("spark.kmeans", signature(data = "SparkDataFrame", formula = "formula"), - function(data, formula, k = 2, maxIter = 20, initMode = c("k-means||", "random")) { + function(data, formula, k = 2, maxIter = 20, initMode = c("k-means||", "random"), + seed = NULL, initSteps = 2, tol = 1E-4) { formula <- paste(deparse(formula), collapse = "") initMode <- match.arg(initMode) + if (!is.null(seed)) { + seed <- as.character(as.integer(seed)) + } jobj <- callJStatic("org.apache.spark.ml.r.KMeansWrapper", "fit", data@sdf, formula, - as.integer(k), as.integer(maxIter), initMode) + as.integer(k), as.integer(maxIter), initMode, seed, + as.integer(initSteps), as.numeric(tol)) new("KMeansModel", jobj = jobj) }) @@ -634,7 +679,14 @@ setMethod("fitted", signature(object = "KMeansModel"), # Get the summary of a k-means model #' @param object a fitted k-means model. -#' @return \code{summary} returns the model's coefficients, size and cluster. +#' @return \code{summary} returns summary information of the fitted model, which is a list. +#' The list includes the model's \code{k} (the configured number of cluster centers), +#' \code{coefficients} (model cluster centers), +#' \code{size} (number of data points in each cluster), \code{cluster} +#' (cluster centers of the transformed data), \code{is.loaded} (whether the model is loaded +#' from a saved file), and \code{clusterSize} +#' (the actual number of cluster centers. When using initMode = "random", +#' \code{clusterSize} may not equal \code{k}).
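A short sketch exercising the new seed, initSteps and tol arguments introduced above (values are illustrative):

# Hedged sketch of the extended spark.kmeans signature.
df <- createDataFrame(iris)
model <- spark.kmeans(df, Sepal_Length ~ Sepal_Width, k = 3,
                      initMode = "k-means||", seed = 42, initSteps = 2, tol = 1e-4)
s <- summary(model)
s$clusterSize   # actual number of centers; may differ from s$k under initMode = "random"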
#' @rdname spark.kmeans #' @export #' @note summary(KMeansModel) since 2.0.0 @@ -646,16 +698,17 @@ setMethod("summary", signature(object = "KMeansModel"), coefficients <- callJMethod(jobj, "coefficients") k <- callJMethod(jobj, "k") size <- callJMethod(jobj, "size") - coefficients <- t(matrix(coefficients, ncol = k)) + clusterSize <- callJMethod(jobj, "clusterSize") + coefficients <- t(matrix(coefficients, ncol = clusterSize)) colnames(coefficients) <- unlist(features) - rownames(coefficients) <- 1:k + rownames(coefficients) <- 1:clusterSize cluster <- if (is.loaded) { NULL } else { dataFrame(callJMethod(jobj, "cluster")) } - list(coefficients = coefficients, size = size, - cluster = cluster, is.loaded = is.loaded) + list(k = k, coefficients = coefficients, size = size, + cluster = cluster, is.loaded = is.loaded, clusterSize = clusterSize) }) # Predicted values based on a k-means model @@ -672,22 +725,21 @@ setMethod("predict", signature(object = "KMeansModel"), #' Logistic Regression Model #' -#' Fits an logistic regression model against a Spark DataFrame. It supports "binomial": Binary logistic regression +#' Fits a logistic regression model against a SparkDataFrame. It supports "binomial": Binary logistic regression #' with pivoting; "multinomial": Multinomial logistic (softmax) regression without pivoting, similar to glmnet. #' Users can print, make predictions on the produced model and save the model to the input path. #' -#' @param data SparkDataFrame for training +#' @param data SparkDataFrame for training. #' @param formula A symbolic description of the model to be fitted. Currently only a few formula #' operators are supported, including '~', '.', ':', '+', and '-'. -#' @param regParam the regularization parameter. Default is 0.0. +#' @param regParam the regularization parameter. #' @param elasticNetParam the ElasticNet mixing parameter. For alpha = 0.0, the penalty is an L2 penalty. #' For alpha = 1.0, it is an L1 penalty. For 0.0 < alpha < 1.0, the penalty is a combination #' of L1 and L2. Default is 0.0 which is an L2 penalty. #' @param maxIter maximum iteration number. #' @param tol convergence tolerance of iterations. -#' @param fitIntercept whether to fit an intercept term. Default is TRUE. #' @param family the name of family which is a description of the label distribution to be used in the model. -#' Supported options: Default is "auto". +#' Supported options: #' \itemize{ #' \item{"auto": Automatically select the family based on the number of classes: #' If number of classes == 1 || number of classes == 2, set to "binomial". @@ -705,13 +757,10 @@ setMethod("predict", signature(object = "KMeansModel"), #' threshold p is equivalent to setting thresholds c(1-p, p). In multiclass (or binary) classification to adjust the probability of #' predicting each class. Array must have length equal to the number of classes, with values > 0, #' excepting that at most one value may be 0. The class with largest value p/t is predicted, where p -#' is the original probability of that class and t is the class's threshold. Default is 0.5. +#' is the original probability of that class and t is the class's threshold. #' @param weightCol The weight column name. -#' @param aggregationDepth depth for treeAggregate (>= 2). If the dimensions of features or the number of partitions -#' are large, this param could be adjusted to a larger size. Default is 2. -#' @param probabilityCol column name for predicted class conditional probabilities. Default is "probability". #' @param ...
additional arguments passed to the method. -#' @return \code{spark.logit} returns a fitted logistic regression model +#' @return \code{spark.logit} returns a fitted logistic regression model. #' @rdname spark.logit #' @aliases spark.logit,SparkDataFrame,formula-method #' @name spark.logit @@ -720,46 +769,36 @@ setMethod("predict", signature(object = "KMeansModel"), #' \dontrun{ #' sparkR.session() #' # binary logistic regression -#' label <- c(1.0, 1.0, 1.0, 0.0, 0.0) -#' feature <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776) -#' binary_data <- as.data.frame(cbind(label, feature)) -#' binary_df <- createDataFrame(binary_data) -#' blr_model <- spark.logit(binary_df, label ~ feature, thresholds = 1.0) -#' blr_predict <- collect(select(predict(blr_model, binary_df), "prediction")) -#' -#' # summary of binary logistic regression -#' blr_summary <- summary(blr_model) -#' blr_fmeasure <- collect(select(blr_summary$fMeasureByThreshold, "threshold", "F-Measure")) +#' df <- createDataFrame(iris) +#' training <- df[df$Species %in% c("versicolor", "virginica"), ] +#' model <- spark.logit(training, Species ~ ., regParam = 0.5) +#' summary <- summary(model) +#' +#' # fitted values on training data +#' fitted <- predict(model, training) +#' #' # save fitted model to input path #' path <- "path/to/model" -#' write.ml(blr_model, path) +#' write.ml(model, path) #' #' # can also read back the saved model and predict #' # Note that summary does not work on loaded model #' savedModel <- read.ml(path) -#' blr_predict2 <- collect(select(predict(savedModel, binary_df), "prediction")) +#' summary(savedModel) #' #' # multinomial logistic regression #' -#' label <- c(0.0, 1.0, 2.0, 0.0, 0.0) -#' feature1 <- c(4.845940, 5.64480, 7.430381, 6.464263, 5.555667) -#' feature2 <- c(2.941319, 2.614812, 2.162451, 3.339474, 2.970987) -#' feature3 <- c(1.322733, 1.348044, 3.861237, 9.686976, 3.447130) -#' feature4 <- c(1.3246388, 0.5510444, 0.9225810, 1.2147881, 1.6020842) -#' data <- as.data.frame(cbind(label, feature1, feature2, feature3, feature4)) -#' df <- createDataFrame(data) +#' df <- createDataFrame(iris) +#' model <- spark.logit(df, Species ~ ., regParam = 0.5) +#' summary <- summary(model) #' -#' # Note that summary of multinomial logistic regression is not implemented yet -#' model <- spark.logit(df, label ~ ., family = "multinomial", thresholds = c(0, 1, 1)) -#' predict1 <- collect(select(predict(model, df), "prediction")) #' } #' @note spark.logit since 2.1.0 setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula"), function(data, formula, regParam = 0.0, elasticNetParam = 0.0, maxIter = 100, - tol = 1E-6, fitIntercept = TRUE, family = "auto", standardization = TRUE, - thresholds = 0.5, weightCol = NULL, aggregationDepth = 2, - probabilityCol = "probability") { - formula <- paste0(deparse(formula), collapse = "") + tol = 1E-6, family = "auto", standardization = TRUE, + thresholds = 0.5, weightCol = NULL) { + formula <- paste(deparse(formula), collapse = "") if (is.null(weightCol)) { weightCol <- "" @@ -768,10 +807,9 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula") jobj <- callJStatic("org.apache.spark.ml.r.LogisticRegressionWrapper", "fit", data@sdf, formula, as.numeric(regParam), as.numeric(elasticNetParam), as.integer(maxIter), - as.numeric(tol), as.logical(fitIntercept), - as.character(family), as.logical(standardization), - as.array(thresholds), as.character(weightCol), - as.integer(aggregationDepth), as.character(probabilityCol)) +
as.numeric(tol), as.character(family), + as.logical(standardization), as.array(thresholds), + as.character(weightCol)) new("LogisticRegressionModel", jobj = jobj) }) @@ -790,9 +828,9 @@ setMethod("predict", signature(object = "LogisticRegressionModel"), # Get the summary of a LogisticRegressionModel -#' @param object an LogisticRegressionModel fitted by \code{spark.logit} -#' @return \code{summary} returns the Binary Logistic regression results of a given model as lists. Note that -#' Multinomial logistic regression summary is not available now. +#' @param object a LogisticRegressionModel fitted by \code{spark.logit}. +#' @return \code{summary} returns summary information of the fitted model, which is a list. +#' The list includes \code{coefficients} (coefficients matrix of the fitted model). #' @rdname spark.logit #' @aliases summary,LogisticRegressionModel-method #' @export @@ -800,33 +838,21 @@ setMethod("predict", signature(object = "LogisticRegressionModel"), setMethod("summary", signature(object = "LogisticRegressionModel"), function(object) { jobj <- object@jobj - is.loaded <- callJMethod(jobj, "isLoaded") - - if (is.loaded) { - stop("Loaded model doesn't have training summary.") + features <- callJMethod(jobj, "rFeatures") + labels <- callJMethod(jobj, "labels") + coefficients <- callJMethod(jobj, "rCoefficients") + nCol <- length(coefficients) / length(features) + coefficients <- matrix(coefficients, ncol = nCol) + # If nCol == 1, this is a binomial logistic regression model with pivoting. + # Otherwise, it's a multinomial logistic regression model without pivoting. + if (nCol == 1) { + colnames(coefficients) <- c("Estimate") + } else { + colnames(coefficients) <- unlist(labels) } + rownames(coefficients) <- unlist(features) - roc <- dataFrame(callJMethod(jobj, "roc")) - - areaUnderROC <- callJMethod(jobj, "areaUnderROC") - - pr <- dataFrame(callJMethod(jobj, "pr")) - - fMeasureByThreshold <- dataFrame(callJMethod(jobj, "fMeasureByThreshold")) - - precisionByThreshold <- dataFrame(callJMethod(jobj, "precisionByThreshold")) - - recallByThreshold <- dataFrame(callJMethod(jobj, "recallByThreshold")) - - totalIterations <- callJMethod(jobj, "totalIterations") - - objectiveHistory <- callJMethod(jobj, "objectiveHistory") - - list(roc = roc, areaUnderROC = areaUnderROC, pr = pr, - fMeasureByThreshold = fMeasureByThreshold, - precisionByThreshold = precisionByThreshold, - recallByThreshold = recallByThreshold, - totalIterations = totalIterations, objectiveHistory = objectiveHistory) + list(coefficients = coefficients) }) #' Multilayer Perceptron Classification Model @@ -840,8 +866,10 @@ setMethod("summary", signature(object = "LogisticRegressionModel"), #' Multilayer Perceptron} #' #' @param data a \code{SparkDataFrame} of observations and labels for model fitting. +#' @param formula a symbolic description of the model to be fitted. Currently only a few formula +#' operators are supported, including '~', '.', ':', '+', and '-'. #' @param blockSize blockSize parameter. -#' @param layers integer vector containing the number of nodes for each layer +#' @param layers integer vector containing the number of nodes for each layer. #' @param solver solver parameter, supported options: "gd" (minibatch gradient descent) or "l-bfgs". #' @param maxIter maximum iteration number. #' @param tol convergence tolerance of iterations. @@ -852,7 +880,7 @@ setMethod("summary", signature(object = "LogisticRegressionModel"), #' @param ... additional arguments passed to the method.
#' @return \code{spark.mlp} returns a fitted Multilayer Perceptron Classification Model. #' @rdname spark.mlp -#' @aliases spark.mlp,SparkDataFrame-method +#' @aliases spark.mlp,SparkDataFrame,formula-method #' @name spark.mlp #' @seealso \link{read.ml} #' @export @@ -861,7 +889,7 @@ setMethod("summary", signature(object = "LogisticRegressionModel"), #' df <- read.df("data/mllib/sample_multiclass_classification_data.txt", source = "libsvm") #' #' # fit a Multilayer Perceptron Classification Model -#' model <- spark.mlp(df, blockSize = 128, layers = c(4, 3), solver = "l-bfgs", +#' model <- spark.mlp(df, label ~ features, blockSize = 128, layers = c(4, 3), solver = "l-bfgs", #' maxIter = 100, tol = 0.5, stepSize = 1, seed = 1, #' initialWeights = c(0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 9, 9, 9, 9, 9)) #' @@ -878,9 +906,10 @@ setMethod("summary", signature(object = "LogisticRegressionModel"), #' summary(savedModel) #' } #' @note spark.mlp since 2.1.0 -setMethod("spark.mlp", signature(data = "SparkDataFrame"), - function(data, layers, blockSize = 128, solver = "l-bfgs", maxIter = 100, +setMethod("spark.mlp", signature(data = "SparkDataFrame", formula = "formula"), + function(data, formula, layers, blockSize = 128, solver = "l-bfgs", maxIter = 100, tol = 1E-6, stepSize = 0.03, seed = NULL, initialWeights = NULL) { + formula <- paste(deparse(formula), collapse = "") if (is.null(layers)) { stop("layers must be an integer vector with length > 1.") } @@ -895,7 +924,7 @@ setMethod("spark.mlp", signature(data = "SparkDataFrame"), initialWeights <- as.array(as.numeric(na.omit(initialWeights))) } jobj <- callJStatic("org.apache.spark.ml.r.MultilayerPerceptronClassifierWrapper", - "fit", data@sdf, as.integer(blockSize), as.array(layers), + "fit", data@sdf, formula, as.integer(blockSize), as.array(layers), as.character(solver), as.integer(maxIter), as.numeric(tol), as.numeric(stepSize), seed, initialWeights) new("MultilayerPerceptronClassificationModel", jobj = jobj) @@ -918,9 +947,12 @@ setMethod("predict", signature(object = "MultilayerPerceptronClassificationModel # Returns the summary of a Multilayer Perceptron Classification Model produced by \code{spark.mlp} #' @param object a Multilayer Perceptron Classification Model fitted by \code{spark.mlp} -#' @return \code{summary} returns a list containing \code{labelCount}, \code{layers}, and -#' \code{weights}. For \code{weights}, it is a numeric vector with length equal to -#' the expected given the architecture (i.e., for 8-10-2 network, 100 connection weights). +#' @return \code{summary} returns summary information of the fitted model, which is a list. +#' The list includes \code{numOfInputs} (number of inputs), \code{numOfOutputs} +#' (number of outputs), \code{layers} (array of layer sizes including input +#' and output layers), and \code{weights} (the weights of layers). +#' For \code{weights}, it is a numeric vector with length equal to that expected +#' given the architecture (i.e., for an 8-10-2 network, 112 connection weights).
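A hedged sketch of reading the reworked summary fields, assuming the sample libsvm file shipped with Spark (4 features, 3 classes):

# Sketch: inspect the new spark.mlp summary fields.
df <- read.df("data/mllib/sample_multiclass_classification_data.txt", source = "libsvm")
model <- spark.mlp(df, label ~ features, layers = c(4, 5, 3), blockSize = 128)
s <- summary(model)
s$numOfInputs      # 4, the input layer size
s$numOfOutputs     # 3, the output layer size
length(s$weights)  # total connection weights implied by layers c(4, 5, 3)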
#' @rdname spark.mlp #' @export #' @aliases summary,MultilayerPerceptronClassificationModel-method @@ -928,10 +960,12 @@ setMethod("predict", signature(object = "MultilayerPerceptronClassificationModel setMethod("summary", signature(object = "MultilayerPerceptronClassificationModel"), function(object) { jobj <- object@jobj - labelCount <- callJMethod(jobj, "labelCount") layers <- unlist(callJMethod(jobj, "layers")) + numOfInputs <- head(layers, n = 1) + numOfOutputs <- tail(layers, n = 1) weights <- callJMethod(jobj, "weights") - list(labelCount = labelCount, layers = layers, weights = weights) + list(numOfInputs = numOfInputs, numOfOutputs = numOfOutputs, + layers = layers, weights = weights) }) #' Naive Bayes Models @@ -983,7 +1017,7 @@ setMethod("spark.naiveBayes", signature(data = "SparkDataFrame", formula = "form # Saves the Bernoulli naive Bayes model to the input path. -#' @param path the directory where the model is saved +#' @param path the directory where the model is saved. #' @param overwrite overwrites or not if the output path already exists. Default is FALSE #' which means throw exception if the output path exists. #' @@ -1057,7 +1091,7 @@ setMethod("write.ml", signature(object = "MultilayerPerceptronClassificationMode # Save fitted IsotonicRegressionModel to the input path -#' @param path The directory where the model is saved +#' @param path The directory where the model is saved. #' @param overwrite Overwrites or not if the output path already exists. Default is FALSE #' which means throw exception if the output path exists. #' @@ -1072,7 +1106,7 @@ setMethod("write.ml", signature(object = "IsotonicRegressionModel", path = "char # Save fitted LogisticRegressionModel to the input path -#' @param path The directory where the model is saved +#' @param path The directory where the model is saved. #' @param overwrite Overwrites or not if the output path already exists. Default is FALSE #' which means throw exception if the output path exists. #' @@ -1141,6 +1175,10 @@ read.ml <- function(path) { new("RandomForestRegressionModel", jobj = jobj) } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.RandomForestClassifierWrapper")) { new("RandomForestClassificationModel", jobj = jobj) + } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.GBTRegressorWrapper")) { + new("GBTRegressionModel", jobj = jobj) + } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.GBTClassifierWrapper")) { + new("GBTClassificationModel", jobj = jobj) } else { stop("Unsupported model: ", jobj) } @@ -1195,14 +1233,14 @@ setMethod("spark.survreg", signature(data = "SparkDataFrame", formula = "formula #' posterior probabilities on new data, \code{spark.perplexity} to compute log perplexity on new #' data and \code{write.ml}/\code{read.ml} to save/load fitted models. #' -#' @param data A SparkDataFrame for training -#' @param features Features column name, default "features". Either libSVM-format column or -#' character-format column is valid. -#' @param k Number of topics, default 10 -#' @param maxIter Maximum iterations, default 20 -#' @param optimizer Optimizer to train an LDA model, "online" or "em", default "online" +#' @param data A SparkDataFrame for training. +#' @param features Features column name. Either libSVM-format column or character-format column is +#' valid. +#' @param k Number of topics. +#' @param maxIter Maximum iterations. +#' @param optimizer Optimizer to train an LDA model, "online" or "em", default is "online". 
#' @param subsamplingRate (For online optimizer) Fraction of the corpus to be sampled and used in -#' each iteration of mini-batch gradient descent, in range (0, 1], default 0.05 +#' each iteration of mini-batch gradient descent, in range (0, 1]. #' @param topicConcentration concentration parameter (commonly named \code{beta} or \code{eta}) for #' the prior placed on topic distributions over terms, default -1 to set automatically on the #' Spark side. Use \code{summary} to retrieve the effective topicConcentration. Only 1-size @@ -1215,7 +1253,7 @@ setMethod("spark.survreg", signature(data = "SparkDataFrame", formula = "formula #' parameter if libSVM-format column is used as the features column. #' @param maxVocabSize maximum vocabulary size, default 1 << 18 #' @param ... additional argument(s) passed to the method. -#' @return \code{spark.lda} returns a fitted Latent Dirichlet Allocation model +#' @return \code{spark.lda} returns a fitted Latent Dirichlet Allocation model. #' @rdname spark.lda #' @aliases spark.lda,SparkDataFrame-method #' @seealso topicmodels: \url{https://cran.r-project.org/package=topicmodels} @@ -1263,8 +1301,9 @@ setMethod("spark.lda", signature(data = "SparkDataFrame"), # similarly to R's summary(). #' @param object a fitted AFT survival regression model. -#' @return \code{summary} returns a list containing the model's coefficients, -#' intercept and log(scale) +#' @return \code{summary} returns summary information of the fitted model, which is a list. +#' The list includes the model's \code{coefficients} (features, coefficients, +#' intercept and log(scale)). #' @rdname spark.survreg #' @export #' @note summary(AFTSurvivalRegressionModel) since 2.0.0 @@ -1284,7 +1323,7 @@ setMethod("summary", signature(object = "AFTSurvivalRegressionModel"), #' @param newData a SparkDataFrame for testing. #' @return \code{predict} returns a SparkDataFrame containing predicted values -#' on the original scale of the data (mean predicted value at scale = 1.0). +#' on the original scale of the data (mean predicted value at scale = 1.0). #' @rdname spark.survreg #' @export #' @note predict(AFTSurvivalRegressionModel) since 2.0.0 @@ -1295,7 +1334,7 @@ setMethod("predict", signature(object = "AFTSurvivalRegressionModel"), #' Multivariate Gaussian Mixture Model (GMM) #' -#' Fits multivariate gaussian mixture model against a Spark DataFrame, similarly to R's +#' Fits multivariate gaussian mixture model against a SparkDataFrame, similarly to R's #' mvnormalmixEM(). Users can call \code{summary} to print a summary of the fitted model, #' \code{predict} to make predictions on new data, and \code{write.ml}/\code{read.ml} #' to save/load fitted models. @@ -1351,7 +1390,9 @@ setMethod("spark.gaussianMixture", signature(data = "SparkDataFrame", formula = # Get the summary of a multivariate gaussian mixture model #' @param object a fitted gaussian mixture model. -#' @return \code{summary} returns the model's lambda, mu, sigma and posterior. +#' @return \code{summary} returns summary of the fitted model, which is a list. +#' The list includes the model's \code{lambda} (lambda), \code{mu} (mu), +#' \code{sigma} (sigma), and \code{posterior} (posterior). #' @aliases spark.gaussianMixture,SparkDataFrame,formula-method #' @rdname spark.gaussianMixture #' @export @@ -1415,7 +1456,7 @@ setMethod("predict", signature(object = "GaussianMixtureModel"), #' @param userCol column name for user ids. Ids must be (or can be coerced into) integers. #' @param itemCol column name for item ids. 
Ids must be (or can be coerced into) integers. #' @param rank rank of the matrix factorization (> 0). -#' @param reg regularization parameter (>= 0). +#' @param regParam regularization parameter (>= 0). #' @param maxIter maximum number of iterations (>= 0). #' @param nonnegative logical value indicating whether to apply nonnegativity constraints. #' @param implicitPrefs logical value indicating whether to use implicit preference. @@ -1425,7 +1466,7 @@ setMethod("predict", signature(object = "GaussianMixtureModel"), #' @param numItemBlocks number of item blocks used to parallelize computation (> 0). #' @param checkpointInterval number of checkpoint intervals (>= 1) or disable checkpoint (-1). #' @param ... additional argument(s) passed to the method. -#' @return \code{spark.als} returns a fitted ALS model +#' @return \code{spark.als} returns a fitted ALS model. #' @rdname spark.als #' @aliases spark.als,SparkDataFrame-method #' @name spark.als @@ -1454,21 +1495,21 @@ setMethod("predict", signature(object = "GaussianMixtureModel"), #' #' # set other arguments #' modelS <- spark.als(df, "rating", "user", "item", rank = 20, -#' reg = 0.1, nonnegative = TRUE) +#' regParam = 0.1, nonnegative = TRUE) #' statsS <- summary(modelS) #' } #' @note spark.als since 2.1.0 setMethod("spark.als", signature(data = "SparkDataFrame"), function(data, ratingCol = "rating", userCol = "user", itemCol = "item", - rank = 10, reg = 0.1, maxIter = 10, nonnegative = FALSE, + rank = 10, regParam = 0.1, maxIter = 10, nonnegative = FALSE, implicitPrefs = FALSE, alpha = 1.0, numUserBlocks = 10, numItemBlocks = 10, checkpointInterval = 10, seed = 0) { if (!is.numeric(rank) || rank <= 0) { stop("rank should be a positive number.") } - if (!is.numeric(reg) || reg < 0) { - stop("reg should be a nonnegative number.") + if (!is.numeric(regParam) || regParam < 0) { + stop("regParam should be a nonnegative number.") } if (!is.numeric(maxIter) || maxIter <= 0) { stop("maxIter should be a positive number.") @@ -1476,7 +1517,7 @@ setMethod("spark.als", signature(data = "SparkDataFrame"), jobj <- callJStatic("org.apache.spark.ml.r.ALSWrapper", "fit", data@sdf, ratingCol, userCol, itemCol, as.integer(rank), - reg, as.integer(maxIter), implicitPrefs, alpha, nonnegative, + regParam, as.integer(maxIter), implicitPrefs, alpha, nonnegative, as.integer(numUserBlocks), as.integer(numItemBlocks), as.integer(checkpointInterval), as.integer(seed)) new("ALSModel", jobj = jobj) @@ -1485,9 +1526,11 @@ setMethod("spark.als", signature(data = "SparkDataFrame"), # Returns a summary of the ALS model produced by spark.als. #' @param object a fitted ALS model. -#' @return \code{summary} returns a list containing the names of the user column, -#' the item column and the rating column, the estimated user and item factors, -#' rank, regularization parameter and maximum number of iterations used in training. +#' @return \code{summary} returns summary information of the fitted model, which is a list. +#' The list includes \code{user} (the names of the user column), +#' \code{item} (the item column), \code{rating} (the rating column), \code{userFactors} +#' (the estimated user factors), \code{itemFactors} (the estimated item factors), +#' and \code{rank} (rank of the matrix factorization model). 
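A toy sketch using the renamed regParam argument (data is illustrative, not from the patch):

# Hedged sketch: spark.als now takes regParam instead of reg.
ratings <- list(list(0, 0, 4.0), list(0, 1, 2.0), list(1, 1, 3.0), list(2, 2, 5.0))
df <- createDataFrame(ratings, c("user", "item", "rating"))
model <- spark.als(df, "rating", "user", "item", rank = 10, regParam = 0.1)
summary(model)$rank   # rank of the factorization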
#' @rdname spark.als #' @aliases summary,ALSModel-method #' @export @@ -1570,14 +1613,14 @@ setMethod("write.ml", signature(object = "ALSModel", path = "character"), #' \dontrun{ #' data <- data.frame(test = c(0.1, 0.15, 0.2, 0.3, 0.25)) #' df <- createDataFrame(data) -#' test <- spark.ktest(df, "test", "norm", c(0, 1)) +#' test <- spark.kstest(df, "test", "norm", c(0, 1)) #' #' # get a summary of the test result #' testSummary <- summary(test) #' testSummary #' #' # print out the summary in an organized way -#' print.summary.KSTest(test) +#' print.summary.KSTest(testSummary) #' } #' @note spark.kstest since 2.1.0 setMethod("spark.kstest", signature(data = "SparkDataFrame"), @@ -1600,9 +1643,10 @@ setMethod("spark.kstest", signature(data = "SparkDataFrame"), # Get the summary of Kolmogorov-Smirnov (KS) Test. #' @param object test result object of KSTest by \code{spark.kstest}. -#' @return \code{summary} returns a list containing the p-value, test statistic computed for the -#' test, the null hypothesis with its parameters tested against -#' and degrees of freedom of the test. +#' @return \code{summary} returns summary information of KSTest object, which is a list. +#' The list includes the \code{p.value} (p-value), \code{statistic} (test statistic +#' computed for the test), \code{nullHypothesis} (the null hypothesis with its +#' parameters tested against) and \code{degreesOfFreedom} (degrees of freedom of the test). #' @rdname spark.kstest #' @aliases summary,KSTest-method #' @export @@ -1644,33 +1688,36 @@ print.summary.KSTest <- function(x, ...) { #' model, \code{predict} to make predictions on new data, and \code{write.ml}/\code{read.ml} to #' save/load fitted models. #' For more details, see -#' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html}{Random Forest} +#' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html#random-forest-regression}{ +#' Random Forest Regression} and +#' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html#random-forest-classifier}{ +#' Random Forest Classification} #' #' @param data a SparkDataFrame for training. #' @param formula a symbolic description of the model to be fitted. Currently only a few formula #' operators are supported, including '~', ':', '+', and '-'. #' @param type type of model, one of "regression" or "classification", to fit -#' @param maxDepth Maximum depth of the tree (>= 0). (default = 5) +#' @param maxDepth Maximum depth of the tree (>= 0). #' @param maxBins Maximum number of bins used for discretizing continuous features and for choosing #' how to split on features at each node. More bins give higher granularity. Must be -#' >= 2 and >= number of categories in any categorical feature. (default = 32) +#' >= 2 and >= number of categories in any categorical feature. #' @param numTrees Number of trees to train (>= 1). #' @param impurity Criterion used for information gain calculation. #' For regression, must be "variance". For classification, must be one of -#' "entropy" and "gini". (default = gini) -#' @param minInstancesPerNode Minimum number of instances each child must have after split. -#' @param minInfoGain Minimum information gain for a split to be considered at a tree node. -#' @param checkpointInterval Param for set checkpoint interval (>= 1) or disable checkpoint (-1). +#' "entropy" and "gini", default is "gini". #' @param featureSubsetStrategy The number of features to consider for splits at each tree node. 
#' Supported options: "auto", "all", "onethird", "sqrt", "log2", (0.0-1.0], [1-n]. #' @param seed integer seed for random number generation. #' @param subsamplingRate Fraction of the training data used for learning each decision tree, in -#' range (0, 1]. (default = 1.0) -#' @param probabilityCol column name for predicted class conditional probabilities, only for -#' classification. (default = "probability") +#' range (0, 1]. +#' @param minInstancesPerNode Minimum number of instances each child must have after split. +#' @param minInfoGain Minimum information gain for a split to be considered at a tree node. +#' @param checkpointInterval Param for setting the checkpoint interval (>= 1) or disabling checkpoint (-1). #' @param maxMemoryInMB Maximum memory in MB allocated to histogram aggregation. #' @param cacheNodeIds If FALSE, the algorithm will pass trees to executors to match instances with -#' nodes. +#' nodes. If TRUE, the algorithm will cache node IDs for each instance. Caching +#' can speed up training of deeper trees. Users can set how often the +#' cache should be checkpointed, or disable it, by setting checkpointInterval. #' @param ... additional arguments passed to the method. #' @aliases spark.randomForest,SparkDataFrame,formula-method #' @return \code{spark.randomForest} returns a fitted Random Forest model. @@ -1703,9 +1750,9 @@ print.summary.KSTest <- function(x, ...) { setMethod("spark.randomForest", signature(data = "SparkDataFrame", formula = "formula"), function(data, formula, type = c("regression", "classification"), maxDepth = 5, maxBins = 32, numTrees = 20, impurity = NULL, - minInstancesPerNode = 1, minInfoGain = 0.0, checkpointInterval = 10, featureSubsetStrategy = "auto", seed = NULL, subsamplingRate = 1.0, - probabilityCol = "probability", maxMemoryInMB = 256, cacheNodeIds = FALSE) { + minInstancesPerNode = 1, minInfoGain = 0.0, checkpointInterval = 10, + maxMemoryInMB = 256, cacheNodeIds = FALSE) { type <- match.arg(type) formula <- paste(deparse(formula), collapse = "") if (!is.null(seed)) { @@ -1734,7 +1781,7 @@ setMethod("spark.randomForest", signature(data = "SparkDataFrame", formula = "fo impurity, as.integer(minInstancesPerNode), as.numeric(minInfoGain), as.integer(checkpointInterval), as.character(featureSubsetStrategy), seed, - as.numeric(subsamplingRate), as.character(probabilityCol), + as.numeric(subsamplingRate), as.integer(maxMemoryInMB), as.logical(cacheNodeIds)) new("RandomForestClassificationModel", jobj = jobj) } @@ -1745,11 +1792,11 @@ setMethod("spark.randomForest", signature(data = "SparkDataFrame", formula = "fo #' @param newData a SparkDataFrame for testing. #' @return \code{predict} returns a SparkDataFrame containing predicted labels in a column named -#' "prediction" +#' "prediction".
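A minimal sketch matching the trimmed signature above (probabilityCol is gone; maxMemoryInMB and cacheNodeIds moved to the end):

# Hedged sketch: regression forest on the longley data.
df <- createDataFrame(longley)
model <- spark.randomForest(df, Employed ~ ., type = "regression",
                            maxDepth = 5, numTrees = 20, seed = 1)
head(predict(model, df))   # predictions land in a "prediction" column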
#' @rdname spark.randomForest #' @aliases predict,RandomForestRegressionModel-method #' @export -#' @note predict(randomForestRegressionModel) since 2.1.0 +#' @note predict(RandomForestRegressionModel) since 2.1.0 setMethod("predict", signature(object = "RandomForestRegressionModel"), function(object, newData) { predict_internal(object, newData) @@ -1758,7 +1805,7 @@ setMethod("predict", signature(object = "RandomForestRegressionModel"), #' @rdname spark.randomForest #' @aliases predict,RandomForestClassificationModel-method #' @export -#' @note predict(randomForestClassificationModel) since 2.1.0 +#' @note predict(RandomForestClassificationModel) since 2.1.0 setMethod("predict", signature(object = "RandomForestClassificationModel"), function(object, newData) { predict_internal(object, newData) @@ -1766,8 +1813,8 @@ setMethod("predict", signature(object = "RandomForestClassificationModel"), # Save the Random Forest Regression or Classification model to the input path. -#' @param object A fitted Random Forest regression model or classification model -#' @param path The directory where the model is saved +#' @param object A fitted Random Forest regression model or classification model. +#' @param path The directory where the model is saved. #' @param overwrite Overwrites or not if the output path already exists. Default is FALSE #' which means throw exception if the output path exists. #' @@ -1789,8 +1836,8 @@ setMethod("write.ml", signature(object = "RandomForestClassificationModel", path write_internal(object, path, overwrite) }) -# Get the summary of an RandomForestRegressionModel model -summary.randomForest <- function(model) { +# Create the summary of a tree ensemble model (eg. Random Forest, GBT) +summary.treeEnsemble <- function(model) { jobj <- model@jobj formula <- callJMethod(jobj, "formula") numFeatures <- callJMethod(jobj, "numFeatures") @@ -1807,20 +1854,25 @@ summary.randomForest <- function(model) { jobj = jobj) } -#' @return \code{summary} returns the model's features as lists, depth and number of nodes -#' or number of classes. +# Get the summary of a Random Forest Regression Model + +#' @return \code{summary} returns summary information of the fitted model, which is a list. +#' The list of components includes \code{formula} (formula), +#' \code{numFeatures} (number of features), \code{features} (list of features), +#' \code{featureImportances} (feature importances), \code{numTrees} (number of trees), +#' and \code{treeWeights} (tree weights). 
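Because the helper is now shared, the same fields come back for both model families; a sketch where model stands for any spark.randomForest or spark.gbt fit:

# Sketch: fields common to the shared tree-ensemble summary.
s <- summary(model)
s$formula        # formula used for the fit
s$numFeatures    # number of features
s$treeWeights    # one weight per tree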
#' @rdname spark.randomForest #' @aliases summary,RandomForestRegressionModel-method #' @export #' @note summary(RandomForestRegressionModel) since 2.1.0 setMethod("summary", signature(object = "RandomForestRegressionModel"), function(object) { - ans <- summary.randomForest(object) + ans <- summary.treeEnsemble(object) class(ans) <- "summary.RandomForestRegressionModel" ans }) -# Get the summary of an RandomForestClassificationModel model +# Get the summary of a Random Forest Classification Model #' @rdname spark.randomForest #' @aliases summary,RandomForestClassificationModel-method @@ -1828,13 +1880,13 @@ setMethod("summary", signature(object = "RandomForestRegressionModel"), #' @note summary(RandomForestClassificationModel) since 2.1.0 setMethod("summary", signature(object = "RandomForestClassificationModel"), function(object) { - ans <- summary.randomForest(object) + ans <- summary.treeEnsemble(object) class(ans) <- "summary.RandomForestClassificationModel" ans }) -# Prints the summary of Random Forest Regression Model -print.summary.randomForest <- function(x) { +# Prints the summary of tree ensemble models (eg. Random Forest, GBT) +print.summary.treeEnsemble <- function(x) { jobj <- x$jobj cat("Formula: ", x$formula) cat("\nNumber of features: ", x$numFeatures) @@ -1848,13 +1900,15 @@ print.summary.randomForest <- function(x) { invisible(x) } +# Prints the summary of Random Forest Regression Model + #' @param x summary object of Random Forest regression model or classification model #' returned by \code{summary}. #' @rdname spark.randomForest #' @export #' @note print.summary.RandomForestRegressionModel since 2.1.0 print.summary.RandomForestRegressionModel <- function(x, ...) { - print.summary.randomForest(x) + print.summary.treeEnsemble(x) } # Prints the summary of Random Forest Classification Model @@ -1863,5 +1917,216 @@ print.summary.RandomForestRegressionModel <- function(x, ...) { #' @export #' @note print.summary.RandomForestClassificationModel since 2.1.0 print.summary.RandomForestClassificationModel <- function(x, ...) { - print.summary.randomForest(x) + print.summary.treeEnsemble(x) +} + +#' Gradient Boosted Tree Model for Regression and Classification +#' +#' \code{spark.gbt} fits a Gradient Boosted Tree Regression model or Classification model on a +#' SparkDataFrame. Users can call \code{summary} to get a summary of the fitted +#' Gradient Boosted Tree model, \code{predict} to make predictions on new data, and +#' \code{write.ml}/\code{read.ml} to save/load fitted models. +#' For more details, see +#' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-tree-regression}{ +#' GBT Regression} and +#' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-tree-classifier}{ +#' GBT Classification} +#' +#' @param data a SparkDataFrame for training. +#' @param formula a symbolic description of the model to be fitted. Currently only a few formula +#' operators are supported, including '~', ':', '+', and '-'. +#' @param type type of model, one of "regression" or "classification", to fit +#' @param maxDepth Maximum depth of the tree (>= 0). +#' @param maxBins Maximum number of bins used for discretizing continuous features and for choosing +#' how to split on features at each node. More bins give higher granularity. Must be +#' >= 2 and >= number of categories in any categorical feature. +#' @param maxIter Param for maximum number of iterations (>= 0). 
+#' @param stepSize Step size to be used for each iteration of optimization. +#' @param lossType Loss function which GBT tries to minimize. +#' For classification, must be "logistic". For regression, must be one of +#' "squared" (L2) and "absolute" (L1), default is "squared". +#' @param seed integer seed for random number generation. +#' @param subsamplingRate Fraction of the training data used for learning each decision tree, in +#' range (0, 1]. +#' @param minInstancesPerNode Minimum number of instances each child must have after split. If a +#' split causes the left or right child to have fewer than +#' minInstancesPerNode, the split will be discarded as invalid. Should be +#' >= 1. +#' @param minInfoGain Minimum information gain for a split to be considered at a tree node. +#' @param checkpointInterval Checkpoint interval (>= 1), or -1 to disable checkpointing. +#' @param maxMemoryInMB Maximum memory in MB allocated to histogram aggregation. +#' @param cacheNodeIds If FALSE, the algorithm will pass trees to executors to match instances with +#' nodes. If TRUE, the algorithm will cache node IDs for each instance. Caching +#' can speed up training of deeper trees. Users can set how often the +#' cache should be checkpointed, or disable it, by setting checkpointInterval. +#' @param ... additional arguments passed to the method. +#' @aliases spark.gbt,SparkDataFrame,formula-method +#' @return \code{spark.gbt} returns a fitted Gradient Boosted Tree model. +#' @rdname spark.gbt +#' @name spark.gbt +#' @export +#' @examples +#' \dontrun{ +#' # fit a Gradient Boosted Tree Regression Model +#' df <- createDataFrame(longley) +#' model <- spark.gbt(df, Employed ~ ., type = "regression", maxDepth = 5, maxBins = 16) +#' +#' # get the summary of the model +#' summary(model) +#' +#' # make predictions +#' predictions <- predict(model, df) +#' +#' # save and load the model +#' path <- "path/to/model" +#' write.ml(model, path) +#' savedModel <- read.ml(path) +#' summary(savedModel) +#' +#' # fit a Gradient Boosted Tree Classification Model +#' # label must be binary - Only binary classification is supported for GBT.
+#' df <- createDataFrame(iris[iris$Species != "virginica", ]) +#' model <- spark.gbt(df, Species ~ Petal_Length + Petal_Width, "classification") +#' +#' # numeric label is also supported +#' iris2 <- iris[iris$Species != "virginica", ] +#' iris2$NumericSpecies <- ifelse(iris2$Species == "setosa", 0, 1) +#' df <- createDataFrame(iris2) +#' model <- spark.gbt(df, NumericSpecies ~ ., type = "classification") +#' } +#' @note spark.gbt since 2.1.0 +setMethod("spark.gbt", signature(data = "SparkDataFrame", formula = "formula"), + function(data, formula, type = c("regression", "classification"), + maxDepth = 5, maxBins = 32, maxIter = 20, stepSize = 0.1, lossType = NULL, + seed = NULL, subsamplingRate = 1.0, minInstancesPerNode = 1, minInfoGain = 0.0, + checkpointInterval = 10, maxMemoryInMB = 256, cacheNodeIds = FALSE) { + type <- match.arg(type) + formula <- paste(deparse(formula), collapse = "") + if (!is.null(seed)) { + seed <- as.character(as.integer(seed)) + } + switch(type, + regression = { + if (is.null(lossType)) lossType <- "squared" + lossType <- match.arg(lossType, c("squared", "absolute")) + jobj <- callJStatic("org.apache.spark.ml.r.GBTRegressorWrapper", + "fit", data@sdf, formula, as.integer(maxDepth), + as.integer(maxBins), as.integer(maxIter), + as.numeric(stepSize), as.integer(minInstancesPerNode), + as.numeric(minInfoGain), as.integer(checkpointInterval), + lossType, seed, as.numeric(subsamplingRate), + as.integer(maxMemoryInMB), as.logical(cacheNodeIds)) + new("GBTRegressionModel", jobj = jobj) + }, + classification = { + if (is.null(lossType)) lossType <- "logistic" + lossType <- match.arg(lossType, "logistic") + jobj <- callJStatic("org.apache.spark.ml.r.GBTClassifierWrapper", + "fit", data@sdf, formula, as.integer(maxDepth), + as.integer(maxBins), as.integer(maxIter), + as.numeric(stepSize), as.integer(minInstancesPerNode), + as.numeric(minInfoGain), as.integer(checkpointInterval), + lossType, seed, as.numeric(subsamplingRate), + as.integer(maxMemoryInMB), as.logical(cacheNodeIds)) + new("GBTClassificationModel", jobj = jobj) + } + ) + }) + +# Makes predictions from a Gradient Boosted Tree Regression model or Classification model + +#' @param newData a SparkDataFrame for testing. +#' @return \code{predict} returns a SparkDataFrame containing predicted labels in a column named +#' "prediction". +#' @rdname spark.gbt +#' @aliases predict,GBTRegressionModel-method +#' @export +#' @note predict(GBTRegressionModel) since 2.1.0 +setMethod("predict", signature(object = "GBTRegressionModel"), + function(object, newData) { + predict_internal(object, newData) + }) + +#' @rdname spark.gbt +#' @aliases predict,GBTClassificationModel-method +#' @export +#' @note predict(GBTClassificationModel) since 2.1.0 +setMethod("predict", signature(object = "GBTClassificationModel"), + function(object, newData) { + predict_internal(object, newData) + }) + +# Save the Gradient Boosted Tree Regression or Classification model to the input path. + +#' @param object A fitted Gradient Boosted Tree regression model or classification model. +#' @param path The directory where the model is saved. +#' @param overwrite Whether to overwrite if the output path already exists. Default is FALSE, +#' which means an exception is thrown if the output path exists.
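A minimal sketch of the save/overwrite/load contract described above, assuming a fitted GBT model in a variable gbtModel (a hypothetical name):

# Illustrative sketch of the write.ml/read.ml round trip; gbtModel is assumed.
path <- tempfile(pattern = "spark-gbt", fileext = ".tmp")
write.ml(gbtModel, path)                    # succeeds while the path is unused
write.ml(gbtModel, path, overwrite = TRUE)  # required once the path already exists
restored <- read.ml(path)
summary(restored)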
+#' @aliases write.ml,GBTRegressionModel,character-method +#' @rdname spark.gbt +#' @export +#' @note write.ml(GBTRegressionModel, character) since 2.1.0 +setMethod("write.ml", signature(object = "GBTRegressionModel", path = "character"), + function(object, path, overwrite = FALSE) { + write_internal(object, path, overwrite) + }) + +#' @aliases write.ml,GBTClassificationModel,character-method +#' @rdname spark.gbt +#' @export +#' @note write.ml(GBTClassificationModel, character) since 2.1.0 +setMethod("write.ml", signature(object = "GBTClassificationModel", path = "character"), + function(object, path, overwrite = FALSE) { + write_internal(object, path, overwrite) + }) + +# Get the summary of a Gradient Boosted Tree Regression Model + +#' @return \code{summary} returns summary information of the fitted model, which is a list. +#' The list of components includes \code{formula} (formula), +#' \code{numFeatures} (number of features), \code{features} (list of features), +#' \code{featureImportances} (feature importances), \code{numTrees} (number of trees), +#' and \code{treeWeights} (tree weights). +#' @rdname spark.gbt +#' @aliases summary,GBTRegressionModel-method +#' @export +#' @note summary(GBTRegressionModel) since 2.1.0 +setMethod("summary", signature(object = "GBTRegressionModel"), + function(object) { + ans <- summary.treeEnsemble(object) + class(ans) <- "summary.GBTRegressionModel" + ans + }) + +# Get the summary of a Gradient Boosted Tree Classification Model + +#' @rdname spark.gbt +#' @aliases summary,GBTClassificationModel-method +#' @export +#' @note summary(GBTClassificationModel) since 2.1.0 +setMethod("summary", signature(object = "GBTClassificationModel"), + function(object) { + ans <- summary.treeEnsemble(object) + class(ans) <- "summary.GBTClassificationModel" + ans + }) + +# Prints the summary of Gradient Boosted Tree Regression Model + +#' @param x summary object of Gradient Boosted Tree regression model or classification model +#' returned by \code{summary}. +#' @rdname spark.gbt +#' @export +#' @note print.summary.GBTRegressionModel since 2.1.0 +print.summary.GBTRegressionModel <- function(x, ...) { + print.summary.treeEnsemble(x) +} + +# Prints the summary of Gradient Boosted Tree Classification Model + +#' @rdname spark.gbt +#' @export +#' @note print.summary.GBTClassificationModel since 2.1.0 +print.summary.GBTClassificationModel <- function(x, ...) 
{ + print.summary.treeEnsemble(x) } diff --git a/R/pkg/R/pairRDD.R b/R/pkg/R/pairRDD.R index 4dee3245f9b7..8fa21be3076b 100644 --- a/R/pkg/R/pairRDD.R +++ b/R/pkg/R/pairRDD.R @@ -780,7 +780,7 @@ setMethod("cogroup", #' @noRd setMethod("sortByKey", signature(x = "RDD"), - function(x, ascending = TRUE, numPartitions = SparkR:::getNumPartitions(x)) { + function(x, ascending = TRUE, numPartitions = SparkR:::getNumPartitionsRDD(x)) { rangeBounds <- list() if (numPartitions > 1) { @@ -850,7 +850,7 @@ setMethod("sortByKey", #' @noRd setMethod("subtractByKey", signature(x = "RDD", other = "RDD"), - function(x, other, numPartitions = SparkR:::getNumPartitions(x)) { + function(x, other, numPartitions = SparkR:::getNumPartitionsRDD(x)) { filterFunction <- function(elem) { iters <- elem[[2]] (length(iters[[1]]) > 0) && (length(iters[[2]]) == 0) diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index 6b4a2f2fdc85..d0a12b7ecec6 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -323,6 +323,18 @@ sparkRHive.init <- function(jsc = NULL) { #' Additional Spark properties can be set in \code{...}, and these named parameters take priority #' over values in \code{master}, \code{appName}, named lists of \code{sparkConfig}. #' +#' When called in an interactive session, this method checks for the Spark installation and, if not +#' found, downloads and caches it automatically. Alternatively, \code{install.spark} can +#' be called manually. +#' +#' A default warehouse is created automatically in the current directory when a managed table is +#' created via \code{sql} statement \code{CREATE TABLE}, for example. To change the location of the +#' warehouse, pass the named parameter \code{spark.sql.warehouse.dir} to the SparkSession. Along with +#' the warehouse, an accompanying metastore may also be automatically created in the current +#' directory when a new SparkSession is initialized with \code{enableHiveSupport} set to +#' \code{TRUE}, which is the default. For more details, refer to Hive configuration at +#' \url{http://spark.apache.org/docs/latest/sql-programming-guide.html#hive-tables}. +#' #' For details on how to initialize and use SparkR, refer to SparkR programming guide at #' \url{http://spark.apache.org/docs/latest/sparkr.html#starting-up-sparksession}. #' @@ -373,8 +385,17 @@ sparkR.session <- function( overrideEnvs(sparkConfigMap, paramMap) } + deployMode <- "" + if (exists("spark.submit.deployMode", envir = sparkConfigMap)) { + deployMode <- sparkConfigMap[["spark.submit.deployMode"]] + } + + if (!exists("spark.r.sql.derby.temp.dir", envir = sparkConfigMap)) { + sparkConfigMap[["spark.r.sql.derby.temp.dir"]] <- tempdir() + } + if (!exists(".sparkRjsc", envir = .sparkREnv)) { - retHome <- sparkCheckInstall(sparkHome, master) + retHome <- sparkCheckInstall(sparkHome, master, deployMode) if (!is.null(retHome)) sparkHome <- retHome sparkExecutorEnvMap <- new.env() sparkR.sparkContext(master, appName, sparkHome, sparkConfigMap, sparkExecutorEnvMap, @@ -402,6 +423,30 @@ sparkR.session <- function( sparkSession } +#' Get the URL of the SparkUI instance for the current active SparkSession +#' +#' Get the URL of the SparkUI instance for the current active SparkSession. +#' +#' @return the SparkUI URL, or NA if it is disabled or not started.
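As a minimal sketch of the warehouse configuration described for sparkR.session above (the directory is illustrative; extra named arguments become Spark properties, as documented):

# Illustrative sketch: relocate the SQL warehouse and skip Hive metastore creation.
sparkR.session(master = "local[*]",
               spark.sql.warehouse.dir = "/tmp/sparkr-warehouse",  # assumed path
               enableHiveSupport = FALSE)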
+#' @rdname sparkR.uiWebUrl +#' @name sparkR.uiWebUrl +#' @export +#' @examples +#'\dontrun{ +#' sparkR.session() +#' url <- sparkR.uiWebUrl() +#' } +#' @note sparkR.uiWebUrl since 2.1.1 +sparkR.uiWebUrl <- function() { + sc <- sparkR.callJMethod(getSparkContext(), "sc") + u <- callJMethod(sc, "uiWebUrl") + if (callJMethod(u, "isDefined")) { + callJMethod(u, "get") + } else { + NA + } +} + #' Assigns a group ID to all the jobs started by this thread until the group ID is set to a #' different value or cleared. #' @@ -419,7 +464,7 @@ sparkR.session <- function( #' @method setJobGroup default setJobGroup.default <- function(groupId, description, interruptOnCancel) { sc <- getSparkContext() - callJMethod(sc, "setJobGroup", groupId, description, interruptOnCancel) + invisible(callJMethod(sc, "setJobGroup", groupId, description, interruptOnCancel)) } setJobGroup <- function(sc, groupId, description, interruptOnCancel) { @@ -449,7 +494,7 @@ setJobGroup <- function(sc, groupId, description, interruptOnCancel) { #' @method clearJobGroup default clearJobGroup.default <- function() { sc <- getSparkContext() - callJMethod(sc, "clearJobGroup") + invisible(callJMethod(sc, "clearJobGroup")) } clearJobGroup <- function(sc) { @@ -476,7 +521,7 @@ clearJobGroup <- function(sc) { #' @method cancelJobGroup default cancelJobGroup.default <- function(groupId) { sc <- getSparkContext() - callJMethod(sc, "cancelJobGroup", groupId) + invisible(callJMethod(sc, "cancelJobGroup", groupId)) } cancelJobGroup <- function(sc, groupId) { @@ -550,24 +595,25 @@ processSparkPackages <- function(packages) { # # @param sparkHome directory to find Spark package. # @param master the Spark master URL, used to check local or remote mode. +# @param deployMode whether to deploy your driver on the worker nodes (cluster) +# or locally as an external client (client). # @return NULL if no need to update sparkHome, and new sparkHome otherwise. -sparkCheckInstall <- function(sparkHome, master) { +sparkCheckInstall <- function(sparkHome, master, deployMode) { if (!isSparkRShell()) { if (!is.na(file.info(sparkHome)$isdir)) { - msg <- paste0("Spark package found in SPARK_HOME: ", sparkHome) - message(msg) + message("Spark package found in SPARK_HOME: ", sparkHome) NULL } else { - if (!nzchar(master) || isMasterLocal(master)) { - msg <- paste0("Spark not found in SPARK_HOME: ", - sparkHome) - message(msg) + if (interactive() || isMasterLocal(master)) { + message("Spark not found in SPARK_HOME: ", sparkHome) packageLocalDir <- install.spark() packageLocalDir - } else { + } else if (isClientMode(master) || deployMode == "client") { msg <- paste0("Spark not found in SPARK_HOME: ", sparkHome, "\n", installInstruction("remote")) stop(msg) + } else { + NULL } } } else { diff --git a/R/pkg/R/types.R b/R/pkg/R/types.R index abca703617c7..ade0f05c0254 100644 --- a/R/pkg/R/types.R +++ b/R/pkg/R/types.R @@ -29,7 +29,7 @@ PRIMITIVE_TYPES <- as.environment(list( "string" = "character", "binary" = "raw", "boolean" = "logical", - "timestamp" = "POSIXct", + "timestamp" = c("POSIXct", "POSIXt"), "date" = "Date", # following types are not SQL types returned by dtypes(). They are listed here for usage # by checkType() in schema.R. diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index 20004549cc03..1f7848f2b413 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -756,12 +756,17 @@ varargsToJProperties <- function(...) 
{ props } -launchScript <- function(script, combinedArgs, capture = FALSE) { +launchScript <- function(script, combinedArgs, wait = FALSE) { if (.Platform$OS.type == "windows") { scriptWithArgs <- paste(script, combinedArgs, sep = " ") - shell(scriptWithArgs, translate = TRUE, wait = capture, intern = capture) # nolint + # on Windows, intern = FALSE seems to mean the output goes to the console (documentation on this is missing) + shell(scriptWithArgs, translate = TRUE, wait = wait, intern = wait) # nolint } else { - system2(script, combinedArgs, wait = capture, stdout = capture) + # http://stat.ethz.ch/R-manual/R-devel/library/base/html/system2.html + # stdout = FALSE means discard output + # stdout = "" means output goes to its console (default) + # Note that the console of this child process might not be the same as the running R process. + system2(script, combinedArgs, stdout = "", wait = wait) } } @@ -777,6 +782,10 @@ isMasterLocal <- function(master) { grepl("^local(\\[([0-9]+|\\*)\\])?$", master, perl = TRUE) } +isClientMode <- function(master) { + grepl("([a-z]+)-client$", master, perl = TRUE) +} + isSparkRShell <- function() { grepl(".*shell\\.R$", Sys.getenv("R_PROFILE_USER"), perl = TRUE) } @@ -837,7 +846,7 @@ captureJVMException <- function(e, method) { # # @param inputData a list of rows, with each row a list # @return data.frame with raw columns as lists -rbindRaws <- function(inputData){ +rbindRaws <- function(inputData) { row1 <- inputData[[1]] rawcolumns <- ("raw" == sapply(row1, class)) @@ -847,3 +856,19 @@ rbindRaws <- function(inputData){ out[!rawcolumns] <- lapply(out[!rawcolumns], unlist) out } + +# Get basename without extension from URL +basenameSansExtFromUrl <- function(url) { + # strip everything up to and including the last '/' + splits <- unlist(strsplit(url, "^.+/")) + last <- tail(splits, 1) + # this is from file_path_sans_ext + # first, remove any compression extension + filename <- sub("[.](gz|bz2|xz)$", "", last) + # then, strip extension by the last '.' + sub("([^.]+)\\.[[:alnum:]]+$", "\\1", filename) +} + +isAtomicLengthOne <- function(x) { + is.atomic(x) && length(x) == 1 +} diff --git a/R/pkg/inst/tests/testthat/test_Windows.R b/R/pkg/inst/tests/testthat/test_Windows.R index 8813e18a1fa4..1d777ddb286d 100644 --- a/R/pkg/inst/tests/testthat/test_Windows.R +++ b/R/pkg/inst/tests/testthat/test_Windows.R @@ -20,7 +20,8 @@ test_that("sparkJars tag in SparkContext", { if (.Platform$OS.type != "windows") { skip("This test is only for Windows, skipped") } - testOutput <- launchScript("ECHO", "a/b/c", capture = TRUE) + + testOutput <- launchScript("ECHO", "a/b/c", wait = TRUE) abcPath <- testOutput[1] expect_equal(abcPath, "a\\b\\c") }) diff --git a/R/pkg/inst/tests/testthat/test_context.R b/R/pkg/inst/tests/testthat/test_context.R index caca06933952..c84711349111 100644 --- a/R/pkg/inst/tests/testthat/test_context.R +++ b/R/pkg/inst/tests/testthat/test_context.R @@ -177,6 +177,13 @@ test_that("add and get file to be downloaded with Spark job on every node", { spark.addFile(path) download_path <- spark.getSparkFiles(filename) expect_equal(readLines(download_path), words) + + # Test spark.getSparkFiles works well on executors. + seq <- seq(from = 1, to = 10, length.out = 5) + f <- function(seq) { spark.getSparkFiles(filename) } + results <- spark.lapply(seq, f) + for (i in 1:5) { expect_equal(basename(results[[i]]), filename) } + unlink(path) # Test add directory recursively.
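Because the launchScript comments above note that this corner of system2 is thinly documented, here is a small runnable sketch of the semantics being relied on (Unix-alikes; on Windows, echo is a shell builtin, hence the shell() branch):

# stdout = "" streams the child's output to a console; stdout = TRUE captures it;
# stdout = FALSE discards it. wait controls whether R blocks on the child process.
system2("echo", "a/b/c", stdout = "", wait = TRUE)   # prints a/b/c to the console
captured <- system2("echo", "a/b/c", stdout = TRUE)  # wait is implied when capturing
identical(captured, "a/b/c")                         # TRUE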
diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R index db98d0e45547..8fe3a87f6faa 100644 --- a/R/pkg/inst/tests/testthat/test_mllib.R +++ b/R/pkg/inst/tests/testthat/test_mllib.R @@ -74,6 +74,14 @@ test_that("spark.glm and predict", { data = iris, family = poisson(link = identity)), iris)) expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) + # Gamma family + x <- runif(100, -1, 1) + y <- rgamma(100, rate = 10 / exp(0.5 + 1.2 * x), shape = 10) + df <- as.DataFrame(as.data.frame(list(x = x, y = y))) + model <- glm(y ~ x, family = Gamma, df) + out <- capture.output(print(summary(model))) + expect_true(any(grepl("Dispersion parameter for gamma family", out))) + # Test stats::predict is working x <- rnorm(15) y <- x + rnorm(15) @@ -159,6 +167,15 @@ test_that("spark.glm summary", { df <- suppressWarnings(createDataFrame(data)) regStats <- summary(spark.glm(df, b ~ a1 + a2, regParam = 1.0)) expect_equal(regStats$aic, 13.32836, tolerance = 1e-4) # 13.32836 is from summary() result + + # Test spark.glm works on collinear data + A <- matrix(c(1, 2, 3, 4, 2, 4, 6, 8), 4, 2) + b <- c(1, 2, 3, 4) + data <- as.data.frame(cbind(A, b)) + df <- createDataFrame(data) + stats <- summary(spark.glm(df, b ~ . - 1)) + coefs <- unlist(stats$coefficients) + expect_true(all(abs(c(0.5, 0.25) - coefs) < 1e-4)) }) test_that("spark.glm save/load", { @@ -341,6 +358,8 @@ test_that("spark.kmeans", { # Test summary works on KMeans summary.model <- summary(model) cluster <- summary.model$cluster + k <- summary.model$k + expect_equal(k, 2) expect_equal(sort(collect(distinct(select(cluster, "prediction")))$prediction), c(0, 1)) # Test model save/load @@ -356,17 +375,45 @@ test_that("spark.kmeans", { expect_true(summary2$is.loaded) unlink(modelPath) + + # Test Kmeans on dataset that is sensitive to seed value + col1 <- c(1, 2, 3, 4, 0, 1, 2, 3, 4, 0) + col2 <- c(1, 2, 3, 4, 0, 1, 2, 3, 4, 0) + col3 <- c(1, 2, 3, 4, 0, 1, 2, 3, 4, 0) + cols <- as.data.frame(cbind(col1, col2, col3)) + df <- createDataFrame(cols) + + model1 <- spark.kmeans(data = df, ~ ., k = 5, maxIter = 10, + initMode = "random", seed = 1, tol = 1E-5) + model2 <- spark.kmeans(data = df, ~ ., k = 5, maxIter = 10, + initMode = "random", seed = 22222, tol = 1E-5) + + summary.model1 <- summary(model1) + summary.model2 <- summary(model2) + cluster1 <- summary.model1$cluster + cluster2 <- summary.model2$cluster + clusterSize1 <- summary.model1$clusterSize + clusterSize2 <- summary.model2$clusterSize + + # The predicted clusters are different + expect_equal(sort(collect(distinct(select(cluster1, "prediction")))$prediction), + c(0, 1, 2, 3)) + expect_equal(sort(collect(distinct(select(cluster2, "prediction")))$prediction), + c(0, 1, 2)) + expect_equal(clusterSize1, 4) + expect_equal(clusterSize2, 3) }) test_that("spark.mlp", { df <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"), source = "libsvm") - model <- spark.mlp(df, blockSize = 128, layers = c(4, 5, 4, 3), solver = "l-bfgs", maxIter = 100, - tol = 0.5, stepSize = 1, seed = 1) + model <- spark.mlp(df, label ~ features, blockSize = 128, layers = c(4, 5, 4, 3), + solver = "l-bfgs", maxIter = 100, tol = 0.5, stepSize = 1, seed = 1) # Test summary method summary <- summary(model) - expect_equal(summary$labelCount, 3) + expect_equal(summary$numOfInputs, 4) + expect_equal(summary$numOfOutputs, 3) expect_equal(summary$layers, c(4, 5, 4, 3)) expect_equal(length(summary$weights), 64) expect_equal(head(summary$weights, 5), list(-0.878743, 
0.2154151, -1.16304, -0.6583214, 1.009825), @@ -375,7 +422,7 @@ test_that("spark.mlp", { # Test predict method mlpTestDF <- df mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction")) - expect_equal(head(mlpPredictions$prediction, 6), c(0, 1, 1, 1, 1, 1)) + expect_equal(head(mlpPredictions$prediction, 6), c("1.0", "0.0", "0.0", "0.0", "0.0", "0.0")) # Test model save/load modelPath <- tempfile(pattern = "spark-mlp", fileext = ".tmp") @@ -385,46 +432,68 @@ test_that("spark.mlp", { model2 <- read.ml(modelPath) summary2 <- summary(model2) - expect_equal(summary2$labelCount, 3) + expect_equal(summary2$numOfInputs, 4) + expect_equal(summary2$numOfOutputs, 3) expect_equal(summary2$layers, c(4, 5, 4, 3)) expect_equal(length(summary2$weights), 64) unlink(modelPath) # Test default parameter - model <- spark.mlp(df, layers = c(4, 5, 4, 3)) + model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3)) mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction")) - expect_equal(head(mlpPredictions$prediction, 10), c(1, 1, 1, 1, 0, 1, 2, 2, 1, 0)) + expect_equal(head(mlpPredictions$prediction, 10), + c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0")) # Test illegal parameter - expect_error(spark.mlp(df, layers = NULL), "layers must be a integer vector with length > 1.") - expect_error(spark.mlp(df, layers = c()), "layers must be a integer vector with length > 1.") - expect_error(spark.mlp(df, layers = c(3)), "layers must be a integer vector with length > 1.") + expect_error(spark.mlp(df, label ~ features, layers = NULL), + "layers must be a integer vector with length > 1.") + expect_error(spark.mlp(df, label ~ features, layers = c()), + "layers must be a integer vector with length > 1.") + expect_error(spark.mlp(df, label ~ features, layers = c(3)), + "layers must be a integer vector with length > 1.") # Test random seed # default seed - model <- spark.mlp(df, layers = c(4, 5, 4, 3), maxIter = 10) + model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3), maxIter = 10) mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction")) - expect_equal(head(mlpPredictions$prediction, 12), c(1, 1, 1, 1, 0, 1, 2, 2, 1, 2, 0, 1)) + expect_equal(head(mlpPredictions$prediction, 10), + c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0")) # seed equals 10 - model <- spark.mlp(df, layers = c(4, 5, 4, 3), maxIter = 10, seed = 10) + model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3), maxIter = 10, seed = 10) mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction")) - expect_equal(head(mlpPredictions$prediction, 12), c(1, 1, 1, 1, 2, 1, 2, 2, 1, 0, 0, 1)) + expect_equal(head(mlpPredictions$prediction, 10), + c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0")) # test initialWeights - model <- spark.mlp(df, layers = c(4, 3), maxIter = 2, initialWeights = + model <- spark.mlp(df, label ~ features, layers = c(4, 3), maxIter = 2, initialWeights = c(0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 9, 9, 9, 9, 9)) mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction")) - expect_equal(head(mlpPredictions$prediction, 12), c(1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1)) + expect_equal(head(mlpPredictions$prediction, 10), + c("1.0", "1.0", "1.0", "1.0", "2.0", "1.0", "2.0", "2.0", "1.0", "0.0")) - model <- spark.mlp(df, layers = c(4, 3), maxIter = 2, initialWeights = + model <- spark.mlp(df, label ~ features, layers = c(4, 3), maxIter = 2, initialWeights = c(0.0, 0.0, 0.0, 0.0, 0.0, 5.0, 5.0, 
5.0, 5.0, 5.0, 9.0, 9.0, 9.0, 9.0, 9.0)) mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction")) - expect_equal(head(mlpPredictions$prediction, 12), c(1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1)) + expect_equal(head(mlpPredictions$prediction, 10), + c("1.0", "1.0", "1.0", "1.0", "2.0", "1.0", "2.0", "2.0", "1.0", "0.0")) - model <- spark.mlp(df, layers = c(4, 3), maxIter = 2) + model <- spark.mlp(df, label ~ features, layers = c(4, 3), maxIter = 2) mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction")) - expect_equal(head(mlpPredictions$prediction, 12), c(1, 1, 1, 1, 0, 1, 0, 2, 1, 0, 0, 1)) + expect_equal(head(mlpPredictions$prediction, 10), + c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "0.0", "2.0", "1.0", "0.0")) + + # Test formula works well + df <- suppressWarnings(createDataFrame(iris)) + model <- spark.mlp(df, Species ~ Sepal_Length + Sepal_Width + Petal_Length + Petal_Width, + layers = c(4, 3)) + summary <- summary(model) + expect_equal(summary$numOfInputs, 4) + expect_equal(summary$numOfOutputs, 3) + expect_equal(summary$layers, c(4, 3)) + expect_equal(length(summary$weights), 15) + expect_equal(head(summary$weights, 5), list(-1.1957257, -5.2693685, 7.4489734, -6.3751413, + -10.2376130), tolerance = 1e-6) }) test_that("spark.naiveBayes", { @@ -603,58 +672,141 @@ test_that("spark.isotonicRegression", { }) test_that("spark.logit", { - # test binary logistic regression - label <- c(1.0, 1.0, 1.0, 0.0, 0.0) - feature <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776) - binary_data <- as.data.frame(cbind(label, feature)) - binary_df <- createDataFrame(binary_data) - - blr_model <- spark.logit(binary_df, label ~ feature, thresholds = 1.0) - blr_predict <- collect(select(predict(blr_model, binary_df), "prediction")) - expect_equal(blr_predict$prediction, c(0, 0, 0, 0, 0)) - blr_model1 <- spark.logit(binary_df, label ~ feature, thresholds = 0.0) - blr_predict1 <- collect(select(predict(blr_model1, binary_df), "prediction")) - expect_equal(blr_predict1$prediction, c(1, 1, 1, 1, 1)) - - # test summary of binary logistic regression - blr_summary <- summary(blr_model) - blr_fmeasure <- collect(select(blr_summary$fMeasureByThreshold, "threshold", "F-Measure")) - expect_equal(blr_fmeasure$threshold, c(0.8221347, 0.7884005, 0.6674709, 0.3785437, 0.3434487), - tolerance = 1e-4) - expect_equal(blr_fmeasure$"F-Measure", c(0.5000000, 0.8000000, 0.6666667, 0.8571429, 0.7500000), - tolerance = 1e-4) - blr_precision <- collect(select(blr_summary$precisionByThreshold, "threshold", "precision")) - expect_equal(blr_precision$precision, c(1.0000000, 1.0000000, 0.6666667, 0.7500000, 0.6000000), - tolerance = 1e-4) - blr_recall <- collect(select(blr_summary$recallByThreshold, "threshold", "recall")) - expect_equal(blr_recall$recall, c(0.3333333, 0.6666667, 0.6666667, 1.0000000, 1.0000000), - tolerance = 1e-4) + # R code to reproduce the result. 
+ # nolint start + #' library(glmnet) + #' iris.x = as.matrix(iris[, 1:4]) + #' iris.y = as.factor(as.character(iris[, 5])) + #' logit = glmnet(iris.x, iris.y, family="multinomial", alpha=0, lambda=0.5) + #' coef(logit) + # + # $setosa + # 5 x 1 sparse Matrix of class "dgCMatrix" + # s0 + # 1.0981324 + # Sepal.Length -0.2909860 + # Sepal.Width 0.5510907 + # Petal.Length -0.1915217 + # Petal.Width -0.4211946 + # + # $versicolor + # 5 x 1 sparse Matrix of class "dgCMatrix" + # s0 + # 1.520061e+00 + # Sepal.Length 2.524501e-02 + # Sepal.Width -5.310313e-01 + # Petal.Length 3.656543e-02 + # Petal.Width -3.144464e-05 + # + # $virginica + # 5 x 1 sparse Matrix of class "dgCMatrix" + # s0 + # -2.61819385 + # Sepal.Length 0.26574097 + # Sepal.Width -0.02005932 + # Petal.Length 0.15495629 + # Petal.Width 0.42122607 + # nolint end - # test model save and read - modelPath <- tempfile(pattern = "spark-logisticRegression", fileext = ".tmp") - write.ml(blr_model, modelPath) - expect_error(write.ml(blr_model, modelPath)) - write.ml(blr_model, modelPath, overwrite = TRUE) - blr_model2 <- read.ml(modelPath) - blr_predict2 <- collect(select(predict(blr_model2, binary_df), "prediction")) - expect_equal(blr_predict$prediction, blr_predict2$prediction) - expect_error(summary(blr_model2)) + # Test multinomial logistic regression against three classes + df <- suppressWarnings(createDataFrame(iris)) + model <- spark.logit(df, Species ~ ., regParam = 0.5) + summary <- summary(model) + versicolorCoefsR <- c(1.52, 0.03, -0.53, 0.04, 0.00) + virginicaCoefsR <- c(-2.62, 0.27, -0.02, 0.16, 0.42) + setosaCoefsR <- c(1.10, -0.29, 0.55, -0.19, -0.42) + versicolorCoefs <- unlist(summary$coefficients[, "versicolor"]) + virginicaCoefs <- unlist(summary$coefficients[, "virginica"]) + setosaCoefs <- unlist(summary$coefficients[, "setosa"]) + expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1)) + expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1)) + expect_true(all(abs(setosaCoefsR - setosaCoefs) < 0.1)) + + # Test model save and load + modelPath <- tempfile(pattern = "spark-logit", fileext = ".tmp") + write.ml(model, modelPath) + expect_error(write.ml(model, modelPath)) + write.ml(model, modelPath, overwrite = TRUE) + model2 <- read.ml(modelPath) + coefs <- summary(model)$coefficients + coefs2 <- summary(model2)$coefficients + expect_equal(coefs, coefs2) unlink(modelPath) - # test multinomial logistic regression - label <- c(0.0, 1.0, 2.0, 0.0, 0.0) - feature1 <- c(4.845940, 5.64480, 7.430381, 6.464263, 5.555667) - feature2 <- c(2.941319, 2.614812, 2.162451, 3.339474, 2.970987) - feature3 <- c(1.322733, 1.348044, 3.861237, 9.686976, 3.447130) - feature4 <- c(1.3246388, 0.5510444, 0.9225810, 1.2147881, 1.6020842) - data <- as.data.frame(cbind(label, feature1, feature2, feature3, feature4)) - df <- createDataFrame(data) + # R code to reproduce the result.
+ # nolint start + #' library(glmnet) + #' iris2 <- iris[iris$Species %in% c("versicolor", "virginica"), ] + #' iris.x = as.matrix(iris2[, 1:4]) + #' iris.y = as.factor(as.character(iris2[, 5])) + #' logit = glmnet(iris.x, iris.y, family="multinomial", alpha=0, lambda=0.5) + #' coef(logit) + # + # $versicolor + # 5 x 1 sparse Matrix of class "dgCMatrix" + # s0 + # 3.93844796 + # Sepal.Length -0.13538675 + # Sepal.Width -0.02386443 + # Petal.Length -0.35076451 + # Petal.Width -0.77971954 + # + # $virginica + # 5 x 1 sparse Matrix of class "dgCMatrix" + # s0 + # -3.93844796 + # Sepal.Length 0.13538675 + # Sepal.Width 0.02386443 + # Petal.Length 0.35076451 + # Petal.Width 0.77971954 + # + #' logit = glmnet(iris.x, iris.y, family="binomial", alpha=0, lambda=0.5) + #' coef(logit) + # + # 5 x 1 sparse Matrix of class "dgCMatrix" + # s0 + # (Intercept) -6.0824412 + # Sepal.Length 0.2458260 + # Sepal.Width 0.1642093 + # Petal.Length 0.4759487 + # Petal.Width 1.0383948 + # + # nolint end + + # Test multinomial logistic regression against two classes + df <- suppressWarnings(createDataFrame(iris)) + training <- df[df$Species %in% c("versicolor", "virginica"), ] + model <- spark.logit(training, Species ~ ., regParam = 0.5, family = "multinomial") + summary <- summary(model) + versicolorCoefsR <- c(3.94, -0.16, -0.02, -0.35, -0.78) + virginicaCoefsR <- c(-3.94, 0.16, -0.02, 0.35, 0.78) + versicolorCoefs <- unlist(summary$coefficients[, "versicolor"]) + virginicaCoefs <- unlist(summary$coefficients[, "virginica"]) + expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1)) + expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1)) + + # Test binomial logistic regression against two classes + model <- spark.logit(training, Species ~ ., regParam = 0.5) + summary <- summary(model) + coefsR <- c(-6.08, 0.25, 0.16, 0.48, 1.04) + coefs <- unlist(summary$coefficients[, "Estimate"]) + expect_true(all(abs(coefsR - coefs) < 0.1)) + + # Test prediction with string label + prediction <- predict(model, training) + expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "character") + expected <- c("versicolor", "versicolor", "virginica", "versicolor", "versicolor", + "versicolor", "versicolor", "versicolor", "versicolor", "versicolor") + expect_equal(as.list(take(select(prediction, "prediction"), 10))[[1]], expected) - model <- spark.logit(df, label ~., family = "multinomial", thresholds = c(0, 1, 1)) - predict1 <- collect(select(predict(model, df), "prediction")) - expect_equal(predict1$prediction, c(0, 0, 0, 0, 0)) - # Summary of multinomial logistic regression is not implemented yet - expect_error(summary(model)) + # Test prediction with numeric label + label <- c(0.0, 0.0, 0.0, 1.0, 1.0) + feature <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776) + data <- as.data.frame(cbind(label, feature)) + df <- createDataFrame(data) + model <- spark.logit(df, label ~ feature) + prediction <- collect(select(predict(model, df), "prediction")) + expect_equal(prediction$prediction, c("0.0", "0.0", "1.0", "1.0", "0.0")) }) test_that("spark.gaussianMixture", { @@ -735,7 +887,7 @@ test_that("spark.lda with libsvm", { weights <- stats$topicTopTermsWeights vocabulary <- stats$vocabulary - expect_false(isDistributed) + expect_true(isDistributed) expect_true(logLikelihood <= 0 & is.finite(logLikelihood)) expect_true(logPerplexity >= 0 & is.finite(logPerplexity)) expect_equal(vocabSize, 11) @@ -749,7 +901,7 @@ test_that("spark.lda with libsvm", { model2 <- read.ml(modelPath) stats2 <-
summary(model2) - expect_false(stats2$isDistributed) + expect_true(stats2$isDistributed) expect_equal(logLikelihood, stats2$logLikelihood) expect_equal(logPerplexity, stats2$logPerplexity) expect_equal(vocabSize, stats2$vocabSize) @@ -811,10 +963,10 @@ test_that("spark.posterior and spark.perplexity", { test_that("spark.als", { data <- list(list(0, 0, 4.0), list(0, 1, 2.0), list(1, 1, 3.0), list(1, 2, 4.0), - list(2, 1, 1.0), list(2, 2, 5.0)) + list(2, 1, 1.0), list(2, 2, 5.0)) df <- createDataFrame(data, c("user", "item", "score")) model <- spark.als(df, ratingCol = "score", userCol = "user", itemCol = "item", - rank = 10, maxIter = 5, seed = 0, reg = 0.1) + rank = 10, maxIter = 5, seed = 0, regParam = 0.1) stats <- summary(model) expect_equal(stats$rank, 10) test <- createDataFrame(list(list(0, 2), list(1, 0), list(2, 0)), c("user", "item")) @@ -869,9 +1021,16 @@ test_that("spark.kstest", { expect_equal(stats$p.value, rStats$p.value, tolerance = 1e-4) expect_equal(stats$statistic, unname(rStats$statistic), tolerance = 1e-4) expect_match(capture.output(stats)[1], "Kolmogorov-Smirnov test summary:") + + # Test print.summary.KSTest + printStats <- capture.output(print.summary.KSTest(stats)) + expect_match(printStats[1], "Kolmogorov-Smirnov test summary:") + expect_match(printStats[5], + "Low presumption against null hypothesis: Sample follows theoretical distribution. ") }) -test_that("spark.randomForest Regression", { +test_that("spark.randomForest", { + # regression data <- suppressWarnings(createDataFrame(longley)) model <- spark.randomForest(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16, numTrees = 1) @@ -891,10 +1050,11 @@ test_that("spark.randomForest Regression", { model <- spark.randomForest(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16, numTrees = 20, seed = 123) predictions <- collect(predict(model, data)) - expect_equal(predictions$prediction, c(60.379, 61.096, 60.636, 62.258, - 63.736, 64.296, 64.868, 64.300, - 66.709, 67.697, 67.966, 67.252, - 68.866, 69.593, 69.195, 69.658), + expect_equal(predictions$prediction, c(60.32820, 61.22315, 60.69025, 62.11070, + 63.53160, 64.05470, 65.12710, 64.30450, + 66.70910, 67.86125, 68.08700, 67.21865, + 68.89275, 69.53180, 69.39640, 69.68250), + tolerance = 1e-4) stats <- summary(model) expect_equal(stats$numTrees, 20) @@ -913,9 +1073,8 @@ test_that("spark.randomForest Regression", { expect_equal(stats$treeWeights, stats2$treeWeights) unlink(modelPath) -}) -test_that("spark.randomForest Classification", { + # classification data <- suppressWarnings(createDataFrame(iris)) model <- spark.randomForest(data, Species ~ Petal_Length + Petal_Width, "classification", maxDepth = 5, maxBins = 16) @@ -925,6 +1084,10 @@ test_that("spark.randomForest Classification", { expect_equal(stats$numTrees, 20) expect_error(capture.output(stats), NA) expect_true(length(capture.output(stats)) > 6) + # Test string prediction values + predictions <- collect(predict(model, data))$prediction + expect_equal(length(grep("setosa", predictions)), 50) + expect_equal(length(grep("versicolor", predictions)), 50) modelPath <- tempfile(pattern = "spark-randomForestClassification", fileext = ".tmp") write.ml(model, modelPath) @@ -937,6 +1100,106 @@ test_that("spark.randomForest Classification", { expect_equal(stats$numClasses, stats2$numClasses) unlink(modelPath) + + # Test numeric response variable + labelToIndex <- function(species) { + switch(as.character(species), + setosa = 0.0, + versicolor = 1.0, + virginica = 2.0 + ) + } + 
iris$NumericSpecies <- lapply(iris$Species, labelToIndex) + data <- suppressWarnings(createDataFrame(iris[-5])) + model <- spark.randomForest(data, NumericSpecies ~ Petal_Length + Petal_Width, "classification", + maxDepth = 5, maxBins = 16) + stats <- summary(model) + expect_equal(stats$numFeatures, 2) + expect_equal(stats$numTrees, 20) + # Test numeric prediction values + predictions <- collect(predict(model, data))$prediction + expect_equal(length(grep("1.0", predictions)), 50) + expect_equal(length(grep("2.0", predictions)), 50) + + # spark.randomForest classification can work on libsvm data + data <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"), + source = "libsvm") + model <- spark.randomForest(data, label ~ features, "classification") + expect_equal(summary(model)$numFeatures, 4) +}) + +test_that("spark.gbt", { + # regression + data <- suppressWarnings(createDataFrame(longley)) + model <- spark.gbt(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16, seed = 123) + predictions <- collect(predict(model, data)) + expect_equal(predictions$prediction, c(60.323, 61.122, 60.171, 61.187, + 63.221, 63.639, 64.989, 63.761, + 66.019, 67.857, 68.169, 66.513, + 68.655, 69.564, 69.331, 70.551), + tolerance = 1e-4) + stats <- summary(model) + expect_equal(stats$numTrees, 20) + expect_equal(stats$formula, "Employed ~ .") + expect_equal(stats$numFeatures, 6) + expect_equal(length(stats$treeWeights), 20) + + modelPath <- tempfile(pattern = "spark-gbtRegression", fileext = ".tmp") + write.ml(model, modelPath) + expect_error(write.ml(model, modelPath)) + write.ml(model, modelPath, overwrite = TRUE) + model2 <- read.ml(modelPath) + stats2 <- summary(model2) + expect_equal(stats$formula, stats2$formula) + expect_equal(stats$numFeatures, stats2$numFeatures) + expect_equal(stats$features, stats2$features) + expect_equal(stats$featureImportances, stats2$featureImportances) + expect_equal(stats$numTrees, stats2$numTrees) + expect_equal(stats$treeWeights, stats2$treeWeights) + + unlink(modelPath) + + # classification + # label must be binary - GBTClassifier currently only supports binary classification. 
+ iris2 <- iris[iris$Species != "virginica", ] + data <- suppressWarnings(createDataFrame(iris2)) + model <- spark.gbt(data, Species ~ Petal_Length + Petal_Width, "classification") + stats <- summary(model) + expect_equal(stats$numFeatures, 2) + expect_equal(stats$numTrees, 20) + expect_error(capture.output(stats), NA) + expect_true(length(capture.output(stats)) > 6) + predictions <- collect(predict(model, data))$prediction + # test string prediction values + expect_equal(length(grep("setosa", predictions)), 50) + expect_equal(length(grep("versicolor", predictions)), 50) + + modelPath <- tempfile(pattern = "spark-gbtClassification", fileext = ".tmp") + write.ml(model, modelPath) + expect_error(write.ml(model, modelPath)) + write.ml(model, modelPath, overwrite = TRUE) + model2 <- read.ml(modelPath) + stats2 <- summary(model2) + expect_equal(stats$depth, stats2$depth) + expect_equal(stats$numNodes, stats2$numNodes) + expect_equal(stats$numClasses, stats2$numClasses) + + unlink(modelPath) + + iris2$NumericSpecies <- ifelse(iris2$Species == "setosa", 0, 1) + df <- suppressWarnings(createDataFrame(iris2)) + m <- spark.gbt(df, NumericSpecies ~ ., type = "classification") + s <- summary(m) + # test numeric prediction values + expect_equal(iris2$NumericSpecies, as.double(collect(predict(m, df))$prediction)) + expect_equal(s$numFeatures, 5) + expect_equal(s$numTrees, 20) + + # spark.gbt classification can work on libsvm data + data <- read.df(absoluteSparkPath("data/mllib/sample_binary_classification_data.txt"), + source = "libsvm") + model <- spark.gbt(data, label ~ features, "classification") + expect_equal(summary(model)$numFeatures, 692) }) sparkR.session.stop() diff --git a/R/pkg/inst/tests/testthat/test_rdd.R b/R/pkg/inst/tests/testthat/test_rdd.R index a3d66c245a7d..787ef51c501c 100644 --- a/R/pkg/inst/tests/testthat/test_rdd.R +++ b/R/pkg/inst/tests/testthat/test_rdd.R @@ -29,8 +29,8 @@ intPairs <- list(list(1L, -1), list(2L, 100), list(2L, 1), list(1L, 200)) intRdd <- parallelize(sc, intPairs, 2L) test_that("get number of partitions in RDD", { - expect_equal(getNumPartitions(rdd), 2) - expect_equal(getNumPartitions(intRdd), 2) + expect_equal(getNumPartitionsRDD(rdd), 2) + expect_equal(getNumPartitionsRDD(intRdd), 2) }) test_that("first on RDD", { @@ -305,18 +305,18 @@ test_that("repartition/coalesce on RDDs", { # repartition r1 <- repartitionRDD(rdd, 2) - expect_equal(getNumPartitions(r1), 2L) + expect_equal(getNumPartitionsRDD(r1), 2L) count <- length(collectPartition(r1, 0L)) expect_true(count >= 8 && count <= 12) r2 <- repartitionRDD(rdd, 6) - expect_equal(getNumPartitions(r2), 6L) + expect_equal(getNumPartitionsRDD(r2), 6L) count <- length(collectPartition(r2, 0L)) expect_true(count >= 0 && count <= 4) # coalesce - r3 <- coalesce(rdd, 1) - expect_equal(getNumPartitions(r3), 1L) + r3 <- coalesceRDD(rdd, 1) + expect_equal(getNumPartitionsRDD(r3), 1L) count <- length(collectPartition(r3, 0L)) expect_equal(count, 20) }) @@ -381,8 +381,8 @@ test_that("aggregateRDD() on RDDs", { test_that("zipWithUniqueId() on RDDs", { rdd <- parallelize(sc, list("a", "b", "c", "d", "e"), 3L) actual <- collectRDD(zipWithUniqueId(rdd)) - expected <- list(list("a", 0), list("b", 3), list("c", 1), - list("d", 4), list("e", 2)) + expected <- list(list("a", 0), list("b", 1), list("c", 4), + list("d", 2), list("e", 5)) expect_equal(actual, expected) rdd <- parallelize(sc, list("a", "b", "c", "d", "e"), 1L) diff --git a/R/pkg/inst/tests/testthat/test_sparkR.R b/R/pkg/inst/tests/testthat/test_sparkR.R new file mode 
100644 index 000000000000..f73fc6baecce --- /dev/null +++ b/R/pkg/inst/tests/testthat/test_sparkR.R @@ -0,0 +1,46 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +context("functions in sparkR.R") + +test_that("sparkCheckInstall", { + # "local, yarn-client, mesos-client" mode, SPARK_HOME was set correctly, + # and the SparkR job was submitted by "spark-submit" + sparkHome <- paste0(tempdir(), "/", "sparkHome") + dir.create(sparkHome) + master <- "" + deployMode <- "" + expect_true(is.null(sparkCheckInstall(sparkHome, master, deployMode))) + unlink(sparkHome, recursive = TRUE) + + # "yarn-cluster, mesos-cluster" mode, SPARK_HOME was not set, + # and the SparkR job was submitted by "spark-submit" + sparkHome <- "" + master <- "" + deployMode <- "" + expect_true(is.null(sparkCheckInstall(sparkHome, master, deployMode))) + + # "yarn-client, mesos-client" mode, SPARK_HOME was not set + sparkHome <- "" + master <- "yarn-client" + deployMode <- "" + expect_error(sparkCheckInstall(sparkHome, master, deployMode)) + sparkHome <- "" + master <- "" + deployMode <- "client" + expect_error(sparkCheckInstall(sparkHome, master, deployMode)) +}) diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 806019d7524f..628512440d6e 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -60,6 +60,7 @@ unsetHiveContext <- function() { # Tests for SparkSQL functions in SparkR +filesBefore <- list.files(path = sparkRDir, all.files = TRUE) sparkSession <- sparkR.session() sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession) @@ -196,6 +197,26 @@ test_that("create DataFrame from RDD", { expect_equal(dtypes(df), list(c("name", "string"), c("age", "int"), c("height", "float"))) expect_equal(as.list(collect(where(df, df$name == "John"))), list(name = "John", age = 19L, height = 176.5)) + expect_equal(getNumPartitions(df), 1) + + df <- as.DataFrame(cars, numPartitions = 2) + expect_equal(getNumPartitions(df), 2) + df <- createDataFrame(cars, numPartitions = 3) + expect_equal(getNumPartitions(df), 3) + # validate limit by num of rows + df <- createDataFrame(cars, numPartitions = 60) + expect_equal(getNumPartitions(df), 50) + # validate when 1 < (length(coll) / numSlices) << length(coll) + df <- createDataFrame(cars, numPartitions = 20) + expect_equal(getNumPartitions(df), 20) + + df <- as.DataFrame(data.frame(0)) + expect_is(df, "SparkDataFrame") + df <- createDataFrame(list(list(1))) + expect_is(df, "SparkDataFrame") + df <- as.DataFrame(data.frame(0), numPartitions = 2) + # no data to partition, goes to 1 + expect_equal(getNumPartitions(df), 1) setHiveContext(sc) sql("CREATE TABLE people (name string, age double, height float)") @@ -205,6 +226,7 
@@ test_that("create DataFrame from RDD", { c(16)) expect_equal(collect(sql("SELECT height from people WHERE name ='Bob'"))$height, c(176.5)) + sql("DROP TABLE people") unsetHiveContext() }) @@ -212,7 +234,8 @@ test_that("createDataFrame uses files for large objects", { # To simulate a large file scenario, we set spark.r.maxAllocationLimit to a smaller value conf <- callJMethod(sparkSession, "conf") callJMethod(conf, "set", "spark.r.maxAllocationLimit", "100") - df <- suppressWarnings(createDataFrame(iris)) + df <- suppressWarnings(createDataFrame(iris, numPartitions = 3)) + expect_equal(getNumPartitions(df), 3) # Resetting the conf back to default value callJMethod(conf, "set", "spark.r.maxAllocationLimit", toString(.Machine$integer.max / 10)) @@ -576,7 +599,7 @@ test_that("test tableNames and tables", { tables <- tables() expect_equal(count(tables), 2) suppressWarnings(dropTempTable("table1")) - dropTempView("table2") + expect_true(dropTempView("table2")) tables <- tables() expect_equal(count(tables), 0) @@ -589,7 +612,7 @@ test_that( newdf <- sql("SELECT * FROM table1 where name = 'Michael'") expect_is(newdf, "SparkDataFrame") expect_equal(count(newdf), 1) - dropTempView("table1") + expect_true(dropTempView("table1")) createOrReplaceTempView(df, "dfView") sqlCast <- collect(sql("select cast('2' as decimal) as x from dfView limit 1")) @@ -600,7 +623,7 @@ test_that( expect_equal(ncol(sqlCast), 1) expect_equal(out[1], " x") expect_equal(out[2], "1 2") - dropTempView("dfView") + expect_true(dropTempView("dfView")) }) test_that("test cache, uncache and clearCache", { @@ -609,7 +632,7 @@ test_that("test cache, uncache and clearCache", { cacheTable("table1") uncacheTable("table1") clearCache() - dropTempView("table1") + expect_true(dropTempView("table1")) }) test_that("insertInto() on a registered table", { @@ -630,13 +653,13 @@ test_that("insertInto() on a registered table", { insertInto(dfParquet2, "table1") expect_equal(count(sql("select * from table1")), 5) expect_equal(first(sql("select * from table1 order by age"))$name, "Michael") - dropTempView("table1") + expect_true(dropTempView("table1")) createOrReplaceTempView(dfParquet, "table1") insertInto(dfParquet2, "table1", overwrite = TRUE) expect_equal(count(sql("select * from table1")), 2) expect_equal(first(sql("select * from table1 order by age"))$name, "Bob") - dropTempView("table1") + expect_true(dropTempView("table1")) unlink(jsonPath2) unlink(parquetPath2) @@ -650,7 +673,7 @@ test_that("tableToDF() returns a new DataFrame", { expect_equal(count(tabledf), 3) tabledf2 <- tableToDF("table1") expect_equal(count(tabledf2), 3) - dropTempView("table1") + expect_true(dropTempView("table1")) }) test_that("toRDD() returns an RRDD", { @@ -703,7 +726,7 @@ test_that("objectFile() works with row serialization", { objectPath <- tempfile(pattern = "spark-test", fileext = ".tmp") df <- read.json(jsonPath) dfRDD <- toRDD(df) - saveAsObjectFile(coalesce(dfRDD, 1L), objectPath) + saveAsObjectFile(coalesceRDD(dfRDD, 1L), objectPath) objectIn <- objectFile(sc, objectPath) expect_is(objectIn, "RDD") @@ -985,6 +1008,18 @@ test_that("select operators", { expect_is(df[[2]], "Column") expect_is(df[["age"]], "Column") + expect_warning(df[[1:2]], + "Subset index has length > 1. Only the first index is used.") + expect_is(suppressWarnings(df[[1:2]]), "Column") + expect_warning(df[[c("name", "age")]], + "Subset index has length > 1. 
Only the first index is used.") + expect_is(suppressWarnings(df[[c("name", "age")]]), "Column") + + expect_warning(df[[1:2]] <- df[[1]], + "Subset index has length > 1. Only the first index is used.") + expect_warning(df[[c("name", "age")]] <- df[[1]], + "Subset index has length > 1. Only the first index is used.") + expect_is(df[, 1, drop = F], "SparkDataFrame") expect_equal(columns(df[, 1, drop = F]), c("name")) expect_equal(columns(df[, "age", drop = F]), c("age")) @@ -999,6 +1034,37 @@ test_that("select operators", { df$age2 <- df$age * 2 expect_equal(columns(df), c("name", "age", "age2")) expect_equal(count(where(df, df$age2 == df$age * 2)), 2) + df$age2 <- df[["age"]] * 3 + expect_equal(columns(df), c("name", "age", "age2")) + expect_equal(count(where(df, df$age2 == df$age * 3)), 2) + + df$age2 <- 21 + expect_equal(columns(df), c("name", "age", "age2")) + expect_equal(count(where(df, df$age2 == 21)), 3) + + df$age2 <- c(22) + expect_equal(columns(df), c("name", "age", "age2")) + expect_equal(count(where(df, df$age2 == 22)), 3) + + expect_error(df$age3 <- c(22, NA), + "value must be a Column, literal value as atomic in length of 1, or NULL") + + df[["age2"]] <- 23 + expect_equal(columns(df), c("name", "age", "age2")) + expect_equal(count(where(df, df$age2 == 23)), 3) + + df[[3]] <- 24 + expect_equal(columns(df), c("name", "age", "age2")) + expect_equal(count(where(df, df$age2 == 24)), 3) + + df[[3]] <- df$age + expect_equal(count(where(df, df$age2 == df$age)), 2) + + df[["age2"]] <- df[["name"]] + expect_equal(count(where(df, df$age2 == df$name)), 3) + + expect_error(df[["age3"]] <- c(22, 23), + "value must be a Column, literal value as atomic in length of 1, or NULL") # Test parameter drop expect_equal(class(df[, 1]) == "SparkDataFrame", T) @@ -1175,7 +1241,7 @@ test_that("column functions", { c16 <- is.nan(c) + isnan(c) + isNaN(c) c17 <- cov(c, c1) + cov("c", "c1") + covar_samp(c, c1) + covar_samp("c", "c1") c18 <- covar_pop(c, c1) + covar_pop("c", "c1") - c19 <- spark_partition_id() + c19 <- spark_partition_id() + coalesce(c) + coalesce(c1, c2, c3) # Test if base::is.nan() is exposed expect_equal(is.nan(c("a", "b")), c(FALSE, FALSE)) @@ -1222,16 +1288,16 @@ test_that("column functions", { # Test struct() df <- createDataFrame(list(list(1L, 2L, 3L), list(4L, 5L, 6L)), schema = c("a", "b", "c")) - result <- collect(select(df, struct("a", "c"))) + result <- collect(select(df, alias(struct("a", "c"), "d"))) expected <- data.frame(row.names = 1:2) - expected$"struct(a, c)" <- list(listToStruct(list(a = 1L, c = 3L)), - listToStruct(list(a = 4L, c = 6L))) + expected$"d" <- list(listToStruct(list(a = 1L, c = 3L)), + listToStruct(list(a = 4L, c = 6L))) expect_equal(result, expected) - result <- collect(select(df, struct(df$a, df$b))) + result <- collect(select(df, alias(struct(df$a, df$b), "d"))) expected <- data.frame(row.names = 1:2) - expected$"struct(a, b)" <- list(listToStruct(list(a = 1L, b = 2L)), - listToStruct(list(a = 4L, b = 5L))) + expected$"d" <- list(listToStruct(list(a = 1L, b = 2L)), + listToStruct(list(a = 4L, b = 5L))) expect_equal(result, expected) # Test encode(), decode() @@ -1244,9 +1310,9 @@ test_that("column functions", { # Test first(), last() df <- read.json(jsonPath) - expect_equal(collect(select(df, first(df$age)))[[1]], NA) + expect_equal(collect(select(df, first(df$age)))[[1]], NA_real_) expect_equal(collect(select(df, first(df$age, TRUE)))[[1]], 30) - expect_equal(collect(select(df, first("age")))[[1]], NA) + expect_equal(collect(select(df, 
first("age")))[[1]], NA_real_) expect_equal(collect(select(df, first("age", TRUE)))[[1]], 30) expect_equal(collect(select(df, last(df$age)))[[1]], 19) expect_equal(collect(select(df, last(df$age, TRUE)))[[1]], 19) @@ -1776,6 +1842,13 @@ test_that("withColumn() and withColumnRenamed()", { expect_equal(length(columns(newDF)), 2) expect_equal(first(filter(newDF, df$name != "Michael"))$age, 32) + newDF <- withColumn(df, "age", 18) + expect_equal(length(columns(newDF)), 2) + expect_equal(first(newDF)$age, 18) + + expect_error(withColumn(df, "age", list("a")), + "Literal value must be atomic in length of 1") + newDF2 <- withColumnRenamed(df, "age", "newerAge") expect_equal(length(columns(newDF2)), 2) expect_equal(columns(newDF2)[1], "newerAge") @@ -2421,15 +2494,18 @@ test_that("repartition by columns on DataFrame", { ("Please, specify the number of partitions and/or a column\\(s\\)", retError), TRUE) # repartition by column and number of partitions - actual <- repartition(df, 3L, col = df$"a") + actual <- repartition(df, 3, col = df$"a") - # since we cannot access the number of partitions from dataframe, checking - # that at least the dimensions are identical + # Checking that at least the dimensions are identical expect_identical(dim(df), dim(actual)) + expect_equal(getNumPartitions(actual), 3L) # repartition by number of partitions actual <- repartition(df, 13L) expect_identical(dim(df), dim(actual)) + expect_equal(getNumPartitions(actual), 13L) + + expect_equal(getNumPartitions(coalesce(actual, 1L)), 1L) # a test case with a column and dapply schema <- structType(structField("a", "integer"), structField("avg", "double")) @@ -2445,6 +2521,25 @@ test_that("repartition by columns on DataFrame", { expect_equal(nrow(df1), 2) }) +test_that("coalesce, repartition, numPartitions", { + df <- as.DataFrame(cars, numPartitions = 5) + expect_equal(getNumPartitions(df), 5) + expect_equal(getNumPartitions(coalesce(df, 3)), 3) + expect_equal(getNumPartitions(coalesce(df, 6)), 5) + + df1 <- coalesce(df, 3) + expect_equal(getNumPartitions(df1), 3) + expect_equal(getNumPartitions(coalesce(df1, 6)), 5) + expect_equal(getNumPartitions(coalesce(df1, 4)), 4) + expect_equal(getNumPartitions(coalesce(df1, 2)), 2) + + df2 <- repartition(df1, 10) + expect_equal(getNumPartitions(df2), 10) + expect_equal(getNumPartitions(coalesce(df2, 13)), 5) + expect_equal(getNumPartitions(coalesce(df2, 7)), 5) + expect_equal(getNumPartitions(coalesce(df2, 3)), 3) +}) + test_that("gapply() and gapplyCollect() on a DataFrame", { df <- createDataFrame ( list(list(1L, 1, "1", 0.1), list(1L, 2, "1", 0.2), list(3L, 3, "3", 0.3)), @@ -2612,7 +2707,7 @@ test_that("randomSplit", { expect_true(all(sapply(abs(counts / num - weights / sum(weights)), function(e) { e < 0.05 }))) }) -test_that("Setting and getting config on SparkSession", { +test_that("Setting and getting config on SparkSession, sparkR.conf(), sparkR.uiWebUrl()", { # first, set it to a random but known value conf <- callJMethod(sparkSession, "conf") property <- paste0("spark.testing.", as.character(runif(1))) @@ -2636,6 +2731,9 @@ test_that("Setting and getting config on SparkSession", { expect_equal(appNameValue, "sparkSession test") expect_equal(testValue, value) expect_error(sparkR.conf("completely.dummy"), "Config 'completely.dummy' is not set") + + url <- sparkR.uiWebUrl() + expect_equal(substr(url, 1, 7), "http://") }) test_that("enableHiveSupport on SparkSession", { @@ -2659,7 +2757,7 @@ test_that("Call DataFrameWriter.save() API in Java without path and check argume # It 
makes sure that we can omit path argument in write.df API and then it calls # DataFrameWriter.save() without path. expect_error(write.df(df, source = "csv"), - "Error in save : illegal argument - 'path' is not specified") + "Error in save : illegal argument - Expected exactly one path to be specified") expect_error(write.json(df, jsonPath), "Error in json : analysis error - path file:.*already exists") expect_error(write.text(df, jsonPath), @@ -2667,7 +2765,7 @@ test_that("Call DataFrameWriter.save() API in Java without path and check argume expect_error(write.orc(df, jsonPath), "Error in orc : analysis error - path file:.*already exists") expect_error(write.parquet(df, jsonPath), - "Error in parquet : analysis error - path file:.*already exists") + "Error in parquet : analysis error - path file:.*already exists") # Arguments checking in R side. expect_error(write.df(df, "data.tmp", source = c(1, 2)), @@ -2684,7 +2782,7 @@ test_that("Call DataFrameWriter.load() API in Java without path and check argume # It makes sure that we can omit path argument in read.df API and then it calls # DataFrameWriter.load() without path. expect_error(read.df(source = "json"), - paste("Error in loadDF : analysis error - Unable to infer schema for JSON at .", + paste("Error in loadDF : analysis error - Unable to infer schema for JSON.", "It must be specified manually")) expect_error(read.df("arbitrary_path"), "Error in loadDF : analysis error - Path does not exist") expect_error(read.json("arbitrary_path"), "Error in json : analysis error - Path does not exist") @@ -2704,6 +2802,79 @@ test_that("Call DataFrameWriter.load() API in Java without path and check argume "Unnamed arguments ignored: 2, 3, a.") }) +test_that("Collect on DataFrame when NAs exists at the top of a timestamp column", { + ldf <- data.frame(col1 = c(0, 1, 2), + col2 = c(as.POSIXct("2017-01-01 00:00:01"), + NA, + as.POSIXct("2017-01-01 12:00:01")), + col3 = c(as.POSIXlt("2016-01-01 00:59:59"), + NA, + as.POSIXlt("2016-01-01 12:01:01"))) + sdf1 <- createDataFrame(ldf) + ldf1 <- collect(sdf1) + expect_equal(dtypes(sdf1), list(c("col1", "double"), + c("col2", "timestamp"), + c("col3", "timestamp"))) + expect_equal(class(ldf1$col1), "numeric") + expect_equal(class(ldf1$col2), c("POSIXct", "POSIXt")) + expect_equal(class(ldf1$col3), c("POSIXct", "POSIXt")) + + # Columns with NAs at the top + sdf2 <- filter(sdf1, "col1 > 1") + ldf2 <- collect(sdf2) + expect_equal(dtypes(sdf2), list(c("col1", "double"), + c("col2", "timestamp"), + c("col3", "timestamp"))) + expect_equal(class(ldf2$col1), "numeric") + expect_equal(class(ldf2$col2), c("POSIXct", "POSIXt")) + expect_equal(class(ldf2$col3), c("POSIXct", "POSIXt")) + + # Columns with only NAs, the type will also be cast to PRIMITIVE_TYPE + sdf3 <- filter(sdf1, "col1 == 0") + ldf3 <- collect(sdf3) + expect_equal(dtypes(sdf3), list(c("col1", "double"), + c("col2", "timestamp"), + c("col3", "timestamp"))) + expect_equal(class(ldf3$col1), "numeric") + expect_equal(class(ldf3$col2), c("POSIXct", "POSIXt")) + expect_equal(class(ldf3$col3), c("POSIXct", "POSIXt")) +}) + +compare_list <- function(list1, list2) { + # get testthat to show the diff by first making the 2 lists equal in length + expect_equal(length(list1), length(list2)) + l <- max(length(list1), length(list2)) + length(list1) <- l + length(list2) <- l + expect_equal(sort(list1, na.last = TRUE), sort(list2, na.last = TRUE)) +} + +# This should always be the **very last test** in this test file. 
+test_that("No extra files are created in SPARK_HOME by starting session and making calls", { + skip_on_cran() + + # Check that it is not creating any extra file. + # Does not check the tempdir which would be cleaned up after. + filesAfter <- list.files(path = sparkRDir, all.files = TRUE) + + expect_true(length(sparkRFilesBefore) > 0) + # first, ensure derby.log is not there + expect_false("derby.log" %in% filesAfter) + # second, ensure only spark-warehouse is created when calling SparkSession, enableHiveSupport = F + # note: currently all other test files have enableHiveSupport = F, so we capture the list of files + # before creating a SparkSession with enableHiveSupport = T at the top of this test file + # (filesBefore). The test here is to compare that (filesBefore) against the list of files before + # any test is run in run-all.R (sparkRFilesBefore). + # sparkRWhitelistSQLDirs is also defined in run-all.R, and should contain only 2 whitelisted dirs, + # here allow the first value, spark-warehouse, in the diff, everything else should be exactly the + # same as before any test is run. + compare_list(sparkRFilesBefore, setdiff(filesBefore, sparkRWhitelistSQLDirs[[1]])) + # third, ensure only spark-warehouse and metastore_db are created when enableHiveSupport = T + # note: as the note above, after running all tests in this file while enableHiveSupport = T, we + # check the list of files again. This time we allow both whitelisted dirs to be in the diff. + compare_list(sparkRFilesBefore, setdiff(filesAfter, sparkRWhitelistSQLDirs)) +}) + unlink(parquetPath) unlink(orcPath) unlink(jsonPath) diff --git a/R/pkg/inst/tests/testthat/test_utils.R b/R/pkg/inst/tests/testthat/test_utils.R index 607c407f04f9..c87524842876 100644 --- a/R/pkg/inst/tests/testthat/test_utils.R +++ b/R/pkg/inst/tests/testthat/test_utils.R @@ -228,4 +228,15 @@ test_that("varargsToStrEnv", { expect_warning(varargsToStrEnv(1, 2, 3, 4), "Unnamed arguments ignored: 1, 2, 3, 4.") }) +test_that("basenameSansExtFromUrl", { + x <- paste0("http://people.apache.org/~pwendell/spark-nightly/spark-branch-2.1-bin/spark-2.1.1-", + "SNAPSHOT-2016_12_09_11_08-eb2d9bf-bin/spark-2.1.1-SNAPSHOT-bin-hadoop2.7.tgz") + y <- paste0("http://people.apache.org/~pwendell/spark-releases/spark-2.1.0-rc2-bin/spark-2.1.0-", + "bin-hadoop2.4-without-hive.tgz") + expect_equal(basenameSansExtFromUrl(x), "spark-2.1.1-SNAPSHOT-bin-hadoop2.7") + expect_equal(basenameSansExtFromUrl(y), "spark-2.1.0-bin-hadoop2.4-without-hive") + z <- "http://people.apache.org/~pwendell/spark-releases/spark-2.1.0--hive.tar.gz" + expect_equal(basenameSansExtFromUrl(z), "spark-2.1.0--hive") +}) + sparkR.session.stop() diff --git a/R/pkg/tests/run-all.R b/R/pkg/tests/run-all.R index 1d04656ac259..9b75d9556692 100644 --- a/R/pkg/tests/run-all.R +++ b/R/pkg/tests/run-all.R @@ -21,4 +21,13 @@ library(SparkR) # Turn all warnings into errors options("warn" = 2) +install.spark() + +# Setup global test environment +sparkRDir <- file.path(Sys.getenv("SPARK_HOME"), "R") +sparkRFilesBefore <- list.files(path = sparkRDir, all.files = TRUE) +sparkRWhitelistSQLDirs <- c("spark-warehouse", "metastore_db") +invisible(lapply(sparkRWhitelistSQLDirs, + function(x) { unlink(file.path(sparkRDir, x), recursive = TRUE, force = TRUE)})) + test_package("SparkR") diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index 80e876027bdd..d16526d306d6 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -1,14 +1,32 @@ --- title: 
"SparkR - Practical Guide" output: - html_document: - theme: united + rmarkdown::html_vignette: toc: true toc_depth: 4 - toc_float: true - highlight: textmate +vignette: > + %\VignetteIndexEntry{SparkR - Practical Guide} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} --- + + ## Overview SparkR is an R package that provides a light-weight frontend to use Apache Spark from R. With Spark `r packageVersion("SparkR")`, SparkR provides a distributed data frame implementation that supports data processing operations like selection, filtering, aggregation etc. and distributed machine learning using [MLlib](http://spark.apache.org/mllib/). @@ -26,6 +44,9 @@ library(SparkR) We use default settings in which it runs in local mode. It auto downloads Spark package in the background if no previous installation is found. For more details about setup, see [Spark Session](#SetupSparkSession). +```{r, include=FALSE} +install.spark() +``` ```{r, message=FALSE, results="hide"} sparkR.session() ``` @@ -93,13 +114,13 @@ sparkR.session.stop() Different from many other R packages, to use SparkR, you need an additional installation of Apache Spark. The Spark installation will be used to run a backend process that will compile and execute SparkR programs. -If you don't have Spark installed on the computer, you may download it from [Apache Spark Website](http://spark.apache.org/downloads.html). Alternatively, we provide an easy-to-use function `install.spark` to complete this process. You don't have to call it explicitly. We will check the installation when `sparkR.session` is called and `install.spark` function will be triggered automatically if no installation is found. +After installing the SparkR package, you can call `sparkR.session` as explained in the previous section to start and it will check for the Spark installation. If you are working with SparkR from an interactive shell (eg. R, RStudio) then Spark is downloaded and cached automatically if it is not found. Alternatively, we provide an easy-to-use function `install.spark` for running this manually. If you don't have Spark installed on the computer, you may download it from [Apache Spark Website](http://spark.apache.org/downloads.html). ```{r, eval=FALSE} install.spark() ``` -If you already have Spark installed, you don't have to install again and can pass the `sparkHome` argument to `sparkR.session` to let SparkR know where the Spark installation is. +If you already have Spark installed, you don't have to install again and can pass the `sparkHome` argument to `sparkR.session` to let SparkR know where the existing Spark installation is. ```{r, eval=FALSE} sparkR.session(sparkHome = "/HOME/spark") @@ -446,25 +467,43 @@ head(teenagers) SparkR supports the following machine learning models and algorithms. 
-* Generalized Linear Model (GLM) +#### Classification -* Naive Bayes Model +* Logistic Regression -* $k$-means Clustering +* Multilayer Perceptron (MLP) + +* Naive Bayes + +#### Regression * Accelerated Failure Time (AFT) Survival Model +* Generalized Linear Model (GLM) + +* Isotonic Regression + +#### Tree - Classification and Regression + +* Gradient-Boosted Trees (GBT) + +* Random Forest + +#### Clustering + * Gaussian Mixture Model (GMM) +* $k$-means Clustering + * Latent Dirichlet Allocation (LDA) -* Multilayer Perceptron Model +#### Collaborative Filtering -* Collaborative Filtering with Alternating Least Squares (ALS) +* Alternating Least Squares (ALS) -* Isotonic Regression Model +#### Statistics -More will be added in the future. +* Kolmogorov-Smirnov Test ### R Formula @@ -489,9 +528,115 @@ count(carsDF_test) head(carsDF_test) ``` - ### Models and Algorithms +#### Logistic Regression + +[Logistic regression](https://en.wikipedia.org/wiki/Logistic_regression) is a widely used model when the response is categorical. It can be seen as a special case of the [Generalized Linear Model](https://en.wikipedia.org/wiki/Generalized_linear_model). +We provide `spark.logit` on top of `spark.glm` to support logistic regression with advanced hyper-parameters. +It supports both binary and multiclass classification with elastic-net regularization and feature standardization, similar to `glmnet`. + +We use a simple example to demonstrate `spark.logit` usage. In general, there are three steps to using `spark.logit`: +1) create a DataFrame from a proper data source; 2) fit a logistic regression model using `spark.logit` with a proper parameter setting; +and 3) obtain the coefficient matrix of the fitted model using `summary`, and use the model for prediction with `predict`. + +Binomial logistic regression +```{r, warning=FALSE} +df <- createDataFrame(iris) +# Create a DataFrame containing two classes +training <- df[df$Species %in% c("versicolor", "virginica"), ] +model <- spark.logit(training, Species ~ ., regParam = 0.00042) +summary(model) +``` + +Predict values on training data +```{r} +fitted <- predict(model, training) +``` + +Multinomial logistic regression against three classes +```{r, warning=FALSE} +df <- createDataFrame(iris) +# Note that in this case, Spark infers multinomial logistic regression, so family = "multinomial" is optional. +model <- spark.logit(df, Species ~ ., regParam = 0.056) +summary(model) +``` + +#### Multilayer Perceptron + +The multilayer perceptron classifier (MLPC) is a classifier based on a [feedforward artificial neural network](https://en.wikipedia.org/wiki/Feedforward_neural_network). MLPC consists of multiple layers of nodes. Each layer is fully connected to the next layer in the network. Nodes in the input layer represent the input data. All other nodes map inputs to outputs by taking a linear combination of the inputs with the node’s weights $w$ and bias $b$ and applying an activation function. This can be written in matrix form for MLPC with $K+1$ layers as follows: +$$ +y(x)=f_K(\ldots f_2(w_2^T f_1(w_1^T x + b_1) + b_2) \ldots + b_K). +$$ + +Nodes in intermediate layers use the sigmoid (logistic) function: +$$ +f(z_i) = \frac{1}{1+e^{-z_i}}. +$$ + +Nodes in the output layer use the softmax function: +$$ +f(z_i) = \frac{e^{z_i}}{\sum_{k=1}^N e^{z_k}}. +$$ + +The number of nodes $N$ in the output layer corresponds to the number of classes. + +MLPC employs backpropagation for learning the model.
We use the logistic loss function for optimization and L-BFGS as the optimization routine. + +`spark.mlp` requires at least two columns in `data`: one named `"label"` and the other named `"features"`. The `"features"` column should be in libSVM format. + +We use the iris data set to show how to use `spark.mlp` in classification. +```{r, warning=FALSE} +df <- createDataFrame(iris) +# fit a Multilayer Perceptron Classification Model +model <- spark.mlp(df, Species ~ ., blockSize = 128, layers = c(4, 3), solver = "l-bfgs", maxIter = 100, tol = 0.5, stepSize = 1, seed = 1, initialWeights = c(0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 9, 9, 9, 9, 9)) +``` + +To avoid lengthy display, we only present partial results of the model summary. You can check the full result in your SparkR shell. +```{r, include=FALSE} +ops <- options() +options(max.print=5) +``` +```{r} +# check the summary of the fitted model +summary(model) +``` +```{r, include=FALSE} +options(ops) +``` +```{r} +# make predictions using the fitted model +predictions <- predict(model, df) +head(select(predictions, predictions$prediction)) +``` + +#### Naive Bayes + +The naive Bayes model assumes independence among the features. `spark.naiveBayes` fits a [Bernoulli naive Bayes model](https://en.wikipedia.org/wiki/Naive_Bayes_classifier#Bernoulli_naive_Bayes) against a SparkDataFrame. The data should be all categorical. These models are often used for document classification. + +```{r} +titanic <- as.data.frame(Titanic) +titanicDF <- createDataFrame(titanic[titanic$Freq > 0, -5]) +naiveBayesModel <- spark.naiveBayes(titanicDF, Survived ~ Class + Sex + Age) +summary(naiveBayesModel) +naiveBayesPrediction <- predict(naiveBayesModel, titanicDF) +head(select(naiveBayesPrediction, "Class", "Sex", "Age", "Survived", "prediction")) +``` + +#### Accelerated Failure Time Survival Model + +Survival analysis studies the expected duration of time until an event happens, and often its relationship with risk factors or treatment taken on the subject. In contrast to standard regression analysis, survival modeling has to deal with special characteristics in the data, including non-negative survival time and censoring. + +The Accelerated Failure Time (AFT) model is a parametric survival model for censored data that assumes the effect of a covariate is to accelerate or decelerate the life course of an event by some constant. For more information, refer to the Wikipedia page [AFT Model](https://en.wikipedia.org/wiki/Accelerated_failure_time_model) and the references there. Different from a [Proportional Hazards Model](https://en.wikipedia.org/wiki/Proportional_hazards_model) designed for the same purpose, the AFT model is easier to parallelize because each instance contributes to the objective function independently. +```{r, warning=FALSE} +library(survival) +ovarianDF <- createDataFrame(ovarian) +aftModel <- spark.survreg(ovarianDF, Surv(futime, fustat) ~ ecog_ps + rx) +summary(aftModel) +aftPredictions <- predict(aftModel, ovarianDF) +head(aftPredictions) +``` + #### Generalized Linear Model The main function is `spark.glm`. The following families and link functions are supported. The default is gaussian. @@ -525,46 +670,78 @@ gaussianFitted <- predict(gaussianGLM, carsDF) head(select(gaussianFitted, "model", "prediction", "mpg", "wt", "hp")) ``` -#### Naive Bayes Model -Naive Bayes model assumes independence among the features.
`spark.naiveBayes` fits a [Bernoulli naive Bayes model](https://en.wikipedia.org/wiki/Naive_Bayes_classifier#Bernoulli_naive_Bayes) against a SparkDataFrame. The data should be all categorical. These models are often used for document classification. +`spark.isoreg` fits an [Isotonic Regression](https://en.wikipedia.org/wiki/Isotonic_regression) model against a `SparkDataFrame`. It solves a weighted univariate regression problem under a complete order constraint. Specifically, given a set of real observed responses $y_1, \ldots, y_n$, corresponding real features $x_1, \ldots, x_n$, and optionally positive weights $w_1, \ldots, w_n$, we want to find a monotone (piecewise linear) function $f$ to minimize +$$ +\ell(f) = \sum_{i=1}^n w_i (y_i - f(x_i))^2. +$$ + +There are a few more arguments that may be useful. + +* `weightCol`: a character string specifying the weight column. + +* `isotonic`: logical value indicating whether the output sequence should be isotonic/increasing (`TRUE`) or antitonic/decreasing (`FALSE`). + +* `featureIndex`: the index of the feature on the right hand side of the formula if it is a vector column (default: 0); no effect otherwise. + +We use an artificial example to show its use. ```{r} -titanic <- as.data.frame(Titanic) -titanicDF <- createDataFrame(titanic[titanic$Freq > 0, -5]) -naiveBayesModel <- spark.naiveBayes(titanicDF, Survived ~ Class + Sex + Age) -summary(naiveBayesModel) -naiveBayesPrediction <- predict(naiveBayesModel, titanicDF) -head(select(naiveBayesPrediction, "Class", "Sex", "Age", "Survived", "prediction")) +y <- c(3.0, 6.0, 8.0, 5.0, 7.0) +x <- c(1.0, 2.0, 3.5, 3.0, 4.0) +w <- rep(1.0, 5) +data <- data.frame(y = y, x = x, w = w) +df <- createDataFrame(data) +isoregModel <- spark.isoreg(df, y ~ x, weightCol = "w") +isoregFitted <- predict(isoregModel, df) +head(select(isoregFitted, "x", "y", "prediction")) ``` -#### k-Means Clustering +In the prediction stage, based on the fitted monotone piecewise function, the rules are: -`spark.kmeans` fits a $k$-means clustering model against a `SparkDataFrame`. As an unsupervised learning method, we don't need a response variable. Hence, the left hand side of the R formula should be left blank. The clustering is based only on the variables on the right hand side. +* If the prediction input exactly matches a training feature, then the associated prediction is returned. If there are multiple predictions with the same feature, then one of them is returned; which one is undefined. + +* If the prediction input is lower or higher than all training features, then the prediction with the lowest or highest feature, respectively, is returned. If there are multiple predictions with the same feature, then the lowest or highest, respectively, is returned. + +* If the prediction input falls between two training features, then the prediction is treated as a piecewise linear function, and an interpolated value is calculated from the predictions of the two closest features. If there are multiple values with the same feature, then the same rules as in the previous point are used. + +For example, when the input is $3.2$, the two closest feature values are $3.0$ and $3.5$, so the predicted value is a linear interpolation between the predicted values at $3.0$ and $3.5$.
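As a quick cross-check of this rule (an editorial aside, not part of the patch): base R's `isoreg` implements the same pool-adjacent-violators algorithm, so for unit weights it should reproduce the fitted values that `spark.isoreg` computes on this artificial data, and `stats::approx` can then carry out the linear interpolation by hand. The chunk below is a minimal sketch under that assumption:

```{r, eval=FALSE}
# Local cross-check of the interpolation rule, assuming base::isoreg agrees
# with spark.isoreg on unit-weight data (both use pool-adjacent-violators).
fit <- isoreg(x, y)               # x and y as defined in the chunk above
xs <- sort(x)                     # fit$yf holds fitted values for ordered x
approx(xs, fit$yf, xout = 3.2)$y  # interpolates between the fits at 3.0 and 3.5
```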
```{r} -kmeansModel <- spark.kmeans(carsDF, ~ mpg + hp + wt, k = 3) -summary(kmeansModel) -kmeansPredictions <- predict(kmeansModel, carsDF) -head(select(kmeansPredictions, "model", "mpg", "hp", "wt", "prediction"), n = 20L) +newDF <- createDataFrame(data.frame(x = c(1.5, 3.2))) +head(predict(isoregModel, newDF)) ``` -#### AFT Survival Model -Survival analysis studies the expected duration of time until an event happens, and often the relationship with risk factors or treatment taken on the subject. In contrast to standard regression analysis, survival modeling has to deal with special characteristics in the data including non-negative survival time and censoring. +#### Gradient-Boosted Trees + +`spark.gbt` fits a [gradient-boosted tree](https://en.wikipedia.org/wiki/Gradient_boosting) classification or regression model on a `SparkDataFrame`. +Users can call `summary` to get a summary of the fitted model, `predict` to make predictions, and `write.ml`/`read.ml` to save/load fitted models. + +Similar to the random forest example below, we use the `longley` dataset to train a gradient-boosted tree and make predictions: -Accelerated Failure Time (AFT) model is a parametric survival model for censored data that assumes the effect of a covariate is to accelerate or decelerate the life course of an event by some constant. For more information, refer to the Wikipedia page [AFT Model](https://en.wikipedia.org/wiki/Accelerated_failure_time_model) and the references there. Different from a [Proportional Hazards Model](https://en.wikipedia.org/wiki/Proportional_hazards_model) designed for the same purpose, the AFT model is easier to parallelize because each instance contributes to the objective function independently. ```{r, warning=FALSE} -library(survival) -ovarianDF <- createDataFrame(ovarian) -aftModel <- spark.survreg(ovarianDF, Surv(futime, fustat) ~ ecog_ps + rx) -summary(aftModel) -aftPredictions <- predict(aftModel, ovarianDF) -head(aftPredictions) +df <- createDataFrame(longley) +gbtModel <- spark.gbt(df, Employed ~ ., type = "regression", maxDepth = 2, maxIter = 2) +summary(gbtModel) +predictions <- predict(gbtModel, df) ``` -#### Gaussian Mixture Model +#### Random Forest + +`spark.randomForest` fits a [random forest](https://en.wikipedia.org/wiki/Random_forest) classification or regression model on a `SparkDataFrame`. +Users can call `summary` to get a summary of the fitted model, `predict` to make predictions, and `write.ml`/`read.ml` to save/load fitted models. + +In the following example, we use the `longley` dataset to train a random forest and make predictions: -(Coming in 2.1.0) +```{r, warning=FALSE} +df <- createDataFrame(longley) +rfModel <- spark.randomForest(df, Employed ~ ., type = "regression", maxDepth = 2, numTrees = 2) +summary(rfModel) +predictions <- predict(rfModel, df) +``` + +#### Gaussian Mixture Model `spark.gaussianMixture` fits a multivariate [Gaussian Mixture Model](https://en.wikipedia.org/wiki/Mixture_model#Multivariate_Gaussian_mixture_model) (GMM) against a `SparkDataFrame`. [Expectation-Maximization](https://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm) (EM) is used to approximate the maximum likelihood estimator (MLE) of the model. @@ -580,10 +757,18 @@ gmmFitted <- predict(gmmModel, df) head(select(gmmFitted, "V1", "V2", "prediction")) ``` +#### k-Means Clustering -#### Latent Dirichlet Allocation +`spark.kmeans` fits a $k$-means clustering model against a `SparkDataFrame`.
As an unsupervised learning method, we don't need a response variable. Hence, the left hand side of the R formula should be left blank. The clustering is based only on the variables on the right hand side. + +```{r} +kmeansModel <- spark.kmeans(carsDF, ~ mpg + hp + wt, k = 3) +summary(kmeansModel) +kmeansPredictions <- predict(kmeansModel, carsDF) +head(select(kmeansPredictions, "model", "mpg", "hp", "wt", "prediction"), n = 20L) +``` -(Coming in 2.1.0) +#### Latent Dirichlet Allocation `spark.lda` fits a [Latent Dirichlet Allocation](https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation) model on a `SparkDataFrame`. It is often used in topic modeling in which topics are inferred from a collection of text documents. LDA can be thought of as a clustering algorithm as follows: @@ -599,22 +784,6 @@ To use LDA, we need to specify a `features` column in `data` where each entry re * libSVM: Each entry is a collection of words and will be processed directly. -There are several parameters LDA takes for fitting the model. - -* `k`: number of topics (default 10). - -* `maxIter`: maximum iterations (default 20). - -* `optimizer`: optimizer to train an LDA model, "online" (default) uses [online variational inference](https://www.cs.princeton.edu/~blei/papers/HoffmanBleiBach2010b.pdf). "em" uses [expectation-maximization](https://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm). - -* `subsamplingRate`: For `optimizer = "online"`. Fraction of the corpus to be sampled and used in each iteration of mini-batch gradient descent, in range (0, 1] (default 0.05). - -* `topicConcentration`: concentration parameter (commonly named beta or eta) for the prior placed on topic distributions over terms, default -1 to set automatically on the Spark side. Use `summary` to retrieve the effective topicConcentration. Only 1-size numeric is accepted. - -* `docConcentration`: concentration parameter (commonly named alpha) for the prior placed on documents distributions over topics (theta), default -1 to set automatically on the Spark side. Use `summary` to retrieve the effective docConcentration. Only 1-size or k-size numeric is accepted. - -* `maxVocabSize`: maximum vocabulary size, default 1 << 18. - Two more functions are provided for the fitted model. * `spark.posterior` returns a `SparkDataFrame` containing a column of posterior probabilities vectors named "topicDistribution". @@ -653,47 +822,7 @@ perplexity <- spark.perplexity(model, corpusDF) perplexity ``` - -#### Multilayer Perceptron - -(Coming in 2.1.0) - -Multilayer perceptron classifier (MLPC) is a classifier based on the [feedforward artificial neural network](https://en.wikipedia.org/wiki/Feedforward_neural_network). MLPC consists of multiple layers of nodes. Each layer is fully connected to the next layer in the network. Nodes in the input layer represent the input data. All other nodes map inputs to outputs by a linear combination of the inputs with the node’s weights $w$ and bias $b$ and applying an activation function. This can be written in matrix form for MLPC with $K+1$ layers as follows: -$$ -y(x)=f_K(\ldots f_2(w_2^T f_1(w_1^T x + b_1) + b_2) \ldots + b_K). -$$ - -Nodes in intermediate layers use sigmoid (logistic) function: -$$ -f(z_i) = \frac{1}{1+e^{-z_i}}. -$$ - -Nodes in the output layer use softmax function: -$$ -f(z_i) = \frac{e^{z_i}}{\sum_{k=1}^N e^{z_k}}. -$$ - -The number of nodes $N$ in the output layer corresponds to the number of classes. - -MLPC employs backpropagation for learning the model. 
We use the logistic loss function for optimization and L-BFGS as an optimization routine. - -`spark.mlp` requires at least two columns in `data`: one named `"label"` and the other one `"features"`. The `"features"` column should be in libSVM-format. According to the description above, there are several additional parameters that can be set: - -* `layers`: integer vector containing the number of nodes for each layer. - -* `solver`: solver parameter, supported options: `"gd"` (minibatch gradient descent) or `"l-bfgs"`. - -* `maxIter`: maximum iteration number. - -* `tol`: convergence tolerance of iterations. - -* `stepSize`: step size for `"gd"`. - -* `seed`: seed parameter for weights initialization. - -#### Collaborative Filtering - -(Coming in 2.1.0) +#### Alternating Least Squares `spark.als` learns latent factors in [collaborative filtering](https://en.wikipedia.org/wiki/Recommender_system#Collaborative_filtering) via [alternating least squares](http://dl.acm.org/citation.cfm?id=1608614). @@ -722,53 +851,28 @@ predicted <- predict(model, df) head(predicted) ``` -#### Isotonic Regression Model - -(Coming in 2.1.0) - -`spark.isoreg` fits an [Isotonic Regression](https://en.wikipedia.org/wiki/Isotonic_regression) model against a `SparkDataFrame`. It solves a weighted univariate a regression problem under a complete order constraint. Specifically, given a set of real observed responses $y_1, \ldots, y_n$, corresponding real features $x_1, \ldots, x_n$, and optionally positive weights $w_1, \ldots, w_n$, we want to find a monotone (piecewise linear) function $f$ to minimize -$$ -\ell(f) = \sum_{i=1}^n w_i (y_i - f(x_i))^2. -$$ - -There are a few more arguments that may be useful. - -* `weightCol`: a character string specifying the weight column. - -* `isotonic`: logical value indicating whether the output sequence should be isotonic/increasing (`TRUE`) or antitonic/decreasing (`FALSE`). - -* `featureIndex`: the index of the feature on the right hand side of the formula if it is a vector column (default: 0), no effect otherwise. - -We use an artificial example to show the use. - -```{r} -y <- c(3.0, 6.0, 8.0, 5.0, 7.0) -x <- c(1.0, 2.0, 3.5, 3.0, 4.0) -w <- rep(1.0, 5) -data <- data.frame(y = y, x = x, w = w) -df <- createDataFrame(data) -isoregModel <- spark.isoreg(df, y ~ x, weightCol = "w") -isoregFitted <- predict(isoregModel, df) -head(select(isoregFitted, "x", "y", "prediction")) -``` - -In the prediction stage, based on the fitted monotone piecewise function, the rules are: +#### Kolmogorov-Smirnov Test -* If the prediction input exactly matches a training feature then associated prediction is returned. In case there are multiple predictions with the same feature then one of them is returned. Which one is undefined. +`spark.kstest` runs a two-sided, one-sample [Kolmogorov-Smirnov (KS) test](https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test). +Given a `SparkDataFrame`, the test compares continuous data in a given column `testCol` with the theoretical distribution +specified by parameter `nullHypothesis`. +Users can call `summary` to get a summary of the test results. -* If the prediction input is lower or higher than all training features then prediction with lowest or highest feature is returned respectively. In case there are multiple predictions with the same feature then the lowest or highest is returned respectively. +In the following example, we test whether the `longley` dataset's `Armed_Forces` column +follows a normal distribution. 
We set the parameters of the normal distribution using +the mean and standard deviation of the sample. -* If the prediction input falls between two training features then prediction is treated as piecewise linear function and interpolated value is calculated from the predictions of the two closest features. In case there are multiple values with the same feature then the same rules as in previous point are used. - -For example, when the input is $3.2$, the two closest feature values are $3.0$ and $3.5$, then predicted value would be a linear interpolation between the predicted values at $3.0$ and $3.5$. +```{r, warning=FALSE} +df <- createDataFrame(longley) +afStats <- head(select(df, mean(df$Armed_Forces), sd(df$Armed_Forces))) +afMean <- afStats[1] +afStd <- afStats[2] -```{r} -newDF <- createDataFrame(data.frame(x = c(1.5, 3.2))) -head(predict(isoregModel, newDF)) +test <- spark.kstest(df, "Armed_Forces", "norm", c(afMean, afStd)) +testSummary <- summary(test) +testSummary ``` -#### What's More? -We also expect Decision Tree, Random Forest, Kolmogorov-Smirnov Test coming in the next version 2.1.0. ### Model Persistence The following example shows how to save/load an ML model in SparkR. @@ -822,9 +926,9 @@ The main method calls of actual computation happen in the Spark JVM of the driver. Two kinds of RPCs are supported in the SparkR JVM backend: method invocation and creating new objects. Method invocation can be done in two ways. -* `sparkR.invokeJMethod` takes a reference to an existing Java object and a list of arguments to be passed on to the method. +* `sparkR.callJMethod` takes a reference to an existing Java object and a list of arguments to be passed on to the method. -* `sparkR.invokeJStatic` takes a class name for static method and a list of arguments to be passed on to the method. +* `sparkR.callJStatic` takes a class name for a static method and a list of arguments to be passed on to the method. The arguments are serialized using our custom wire format, which is then deserialized on the JVM side. We then use Java reflection to invoke the appropriate method. A minimal sketch of these two calls appears below. diff --git a/README.md b/README.md index dd7d0e22495b..d861e9fee705 100644 --- a/README.md +++ b/README.md @@ -13,8 +13,7 @@ and Spark Streaming for stream processing. ## Online Documentation You can find the latest Spark documentation, including a programming -guide, on the [project web page](http://spark.apache.org/documentation.html) -and [project wiki](https://cwiki.apache.org/confluence/display/SPARK). +guide, on the [project web page](http://spark.apache.org/documentation.html). This README file only contains basic setup instructions. ## Building Spark @@ -29,8 +28,9 @@ To build Spark and its example programs, run: You can build Spark using more than one thread by using the -T option with Maven, see ["Parallel builds in Maven 3"](https://cwiki.apache.org/confluence/display/MAVEN/Parallel+builds+in+Maven+3). More detailed documentation is available from the project site, at ["Building Spark"](http://spark.apache.org/docs/latest/building-spark.html). -For developing Spark using an IDE, see [Eclipse](https://cwiki.apache.org/confluence/display/SPARK/Useful+Developer+Tools#UsefulDeveloperTools-Eclipse) -and [IntelliJ](https://cwiki.apache.org/confluence/display/SPARK/Useful+Developer+Tools#UsefulDeveloperTools-IntelliJ). + +For general development tips, including info on developing Spark using an IDE, see +[the Useful Developer Tools page](http://spark.apache.org/developer-tools.html).
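Here is that sketch of the two SparkR backend RPC kinds. `sparkR.callJMethod` and `sparkR.callJStatic` are the names from the patch; the object-creation helper `sparkR.newJObject` and the `java.lang` example targets are our assumptions for illustration, and everything requires an active `sparkR.session()`:

```{r, eval=FALSE}
# Minimal sketch of the two RPC kinds (assumes an active sparkR.session()).
# The java.lang targets are arbitrary illustrations, not SparkR internals.

# 1. Method invocation on a static method, addressed by class name:
sparkR.callJStatic("java.lang.Math", "min", 10L, 1L)  # should return 1

# 2. Creating a new object on the JVM, then invoking a method on it:
jobj <- sparkR.newJObject("java.lang.Integer", 1L)
sparkR.callJMethod(jobj, "toString")                  # should return "1"
```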
## Interactive Scala Shell @@ -80,7 +80,7 @@ can be run using: ./dev/run-tests Please see the guidance on how to -[run tests for a module, or individual tests](https://cwiki.apache.org/confluence/display/SPARK/Useful+Developer+Tools). +[run tests for a module, or individual tests](http://spark.apache.org/developer-tools.html#individual-tests). ## A Note About Hadoop Versions @@ -98,7 +98,7 @@ building for particular Hive and Hive Thriftserver distributions. Please refer to the [Configuration Guide](http://spark.apache.org/docs/latest/configuration.html) in the online documentation for an overview on how to configure Spark. -## Contributing +## Contributing -Please review the [Contribution to Spark](https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark) -wiki for information on how to get started contributing to the project. +Please review the [Contribution to Spark guide](http://spark.apache.org/contributing.html) +for information on how to get started contributing to the project. diff --git a/assembly/pom.xml b/assembly/pom.xml index ec243eaebaea..6e092ef8928b 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.1.0-SNAPSHOT + 2.1.2-SNAPSHOT ../pom.xml diff --git a/bin/beeline b/bin/beeline index 1627626941a7..058534699e44 100755 --- a/bin/beeline +++ b/bin/beeline @@ -25,7 +25,7 @@ set -o posix # Figure out if SPARK_HOME is set if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" + source "$(dirname "$0")"/find-spark-home fi CLASS="org.apache.hive.beeline.BeeLine" diff --git a/bin/find-spark-home b/bin/find-spark-home new file mode 100755 index 000000000000..fa78407d4175 --- /dev/null +++ b/bin/find-spark-home @@ -0,0 +1,41 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Attempts to find a proper value for SPARK_HOME. Should be included using the "source" directive. + +FIND_SPARK_HOME_PYTHON_SCRIPT="$(cd "$(dirname "$0")"; pwd)/find_spark_home.py" + +# Short circuit if the user already has this set. +if [ ! -z "${SPARK_HOME}" ]; then + exit 0 +elif [ ! -f "$FIND_SPARK_HOME_PYTHON_SCRIPT" ]; then + # If we are not in the same directory as find_spark_home.py, we are not pip installed, so we don't + # need to search the different Python directories for a Spark installation. + # Note, though, that if the user has pip-installed PySpark but is directly calling pyspark-shell or + # spark-submit in another directory, we want to use that version of PySpark rather than the + # pip-installed version of PySpark.
+ export SPARK_HOME="$(cd "$(dirname "$0")"/..; pwd)" +else + # We are pip installed, use the Python script to resolve a reasonable SPARK_HOME + # Default to standard python interpreter unless told otherwise + if [[ -z "$PYSPARK_DRIVER_PYTHON" ]]; then + PYSPARK_DRIVER_PYTHON="${PYSPARK_PYTHON:-"python"}" + fi + export SPARK_HOME=$($PYSPARK_DRIVER_PYTHON "$FIND_SPARK_HOME_PYTHON_SCRIPT") +fi diff --git a/bin/load-spark-env.sh b/bin/load-spark-env.sh index eaea964ed5b3..8a2f709960a2 100644 --- a/bin/load-spark-env.sh +++ b/bin/load-spark-env.sh @@ -23,7 +23,7 @@ # Figure out where Spark is installed if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" + source "$(dirname "$0")"/find-spark-home fi if [ -z "$SPARK_ENV_LOADED" ]; then diff --git a/bin/pyspark b/bin/pyspark index d6b3ab0a4432..98387c2ec5b8 100755 --- a/bin/pyspark +++ b/bin/pyspark @@ -18,7 +18,7 @@ # if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" + source "$(dirname "$0")"/find-spark-home fi source "${SPARK_HOME}"/bin/load-spark-env.sh @@ -46,7 +46,7 @@ WORKS_WITH_IPYTHON=$(python -c 'import sys; print(sys.version_info >= (2, 7, 0)) # Determine the Python executable to use for the executors: if [[ -z "$PYSPARK_PYTHON" ]]; then - if [[ $PYSPARK_DRIVER_PYTHON == *ipython* && ! WORKS_WITH_IPYTHON ]]; then + if [[ $PYSPARK_DRIVER_PYTHON == *ipython* && ! $WORKS_WITH_IPYTHON ]]; then echo "IPython requires Python 2.7+; please install python2.7 or set PYSPARK_PYTHON" 1>&2 exit 1 else @@ -68,7 +68,7 @@ if [[ -n "$SPARK_TESTING" ]]; then unset YARN_CONF_DIR unset HADOOP_CONF_DIR export PYTHONHASHSEED=0 - exec "$PYSPARK_DRIVER_PYTHON" -m $1 + exec "$PYSPARK_DRIVER_PYTHON" -m "$1" exit fi diff --git a/bin/run-example b/bin/run-example index dd0e3c412026..4ba5399311d3 100755 --- a/bin/run-example +++ b/bin/run-example @@ -18,7 +18,7 @@ # if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" + source "$(dirname "$0")"/find-spark-home fi export _SPARK_CMD_USAGE="Usage: ./bin/run-example [options] example-class [example args]" diff --git a/bin/spark-class b/bin/spark-class index 377c8d1add3f..65d3b9612909 100755 --- a/bin/spark-class +++ b/bin/spark-class @@ -18,7 +18,7 @@ # if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" + source "$(dirname "$0")"/find-spark-home fi . "${SPARK_HOME}"/bin/load-spark-env.sh @@ -27,7 +27,7 @@ fi if [ -n "${JAVA_HOME}" ]; then RUNNER="${JAVA_HOME}/bin/java" else - if [ `command -v java` ]; then + if [ "$(command -v java)" ]; then RUNNER="java" else echo "JAVA_HOME is not set" >&2 @@ -36,7 +36,7 @@ else fi # Find Spark jars. -if [ -f "${SPARK_HOME}/RELEASE" ]; then +if [ -d "${SPARK_HOME}/jars" ]; then SPARK_JARS_DIR="${SPARK_HOME}/jars" else SPARK_JARS_DIR="${SPARK_HOME}/assembly/target/scala-$SPARK_SCALA_VERSION/jars" @@ -72,6 +72,8 @@ build_command() { printf "%d\0" $? } +# Turn off posix mode since it does not allow process substitution +set +o posix CMD=() while IFS= read -d '' -r ARG; do CMD+=("$ARG") diff --git a/bin/spark-class2.cmd b/bin/spark-class2.cmd index 869c0b202f7f..f6157f42843e 100644 --- a/bin/spark-class2.cmd +++ b/bin/spark-class2.cmd @@ -50,7 +50,16 @@ if not "x%SPARK_PREPEND_CLASSES%"=="x" ( rem Figure out where java is. 
set RUNNER=java -if not "x%JAVA_HOME%"=="x" set RUNNER=%JAVA_HOME%\bin\java +if not "x%JAVA_HOME%"=="x" ( + set RUNNER=%JAVA_HOME%\bin\java +) else ( + where /q "%RUNNER%" + if ERRORLEVEL 1 ( + echo Java not found and JAVA_HOME environment variable is not set. + echo Install Java and set JAVA_HOME to point to the Java installation directory. + exit /b 1 + ) +) rem The launcher library prints the command to be executed in a single line suitable for being rem executed by the batch interpreter. So read all the output of the launcher into a variable. diff --git a/bin/spark-shell b/bin/spark-shell index 6583b5bd880e..421f36cac3d4 100755 --- a/bin/spark-shell +++ b/bin/spark-shell @@ -21,7 +21,7 @@ # Shell script for starting the Spark Shell REPL cygwin=false -case "`uname`" in +case "$(uname)" in CYGWIN*) cygwin=true;; esac @@ -29,7 +29,7 @@ esac set -o posix if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" + source "$(dirname "$0")"/find-spark-home fi export _SPARK_CMD_USAGE="Usage: ./bin/spark-shell [options]" diff --git a/bin/spark-sql b/bin/spark-sql index 970d12cbf51d..b08b944ebd31 100755 --- a/bin/spark-sql +++ b/bin/spark-sql @@ -18,7 +18,7 @@ # if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" + source "$(dirname "$0")"/find-spark-home fi export _SPARK_CMD_USAGE="Usage: ./bin/spark-sql [options] [cli option]" diff --git a/bin/spark-submit b/bin/spark-submit index 023f9c162f4b..4e9d3614e637 100755 --- a/bin/spark-submit +++ b/bin/spark-submit @@ -18,7 +18,7 @@ # if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" + source "$(dirname "$0")"/find-spark-home fi # disable randomized hash for string in Python 3.3+ diff --git a/bin/sparkR b/bin/sparkR index 2c07a82e2173..29ab10df8ab6 100755 --- a/bin/sparkR +++ b/bin/sparkR @@ -18,7 +18,7 @@ # if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" + source "$(dirname "$0")"/find-spark-home fi source "${SPARK_HOME}"/bin/load-spark-env.sh diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index fcefe64d59c9..77a4b64e8da9 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.1.0-SNAPSHOT + 2.1.2-SNAPSHOT ../../pom.xml @@ -87,6 +87,18 @@ org.apache.spark spark-tags_${scala.binary.version} + + + + org.apache.spark + spark-tags_${scala.binary.version} + test-jar + test + + org.mockito mockito-core diff --git a/common/network-common/src/main/java/org/apache/spark/network/TransportContext.java b/common/network-common/src/main/java/org/apache/spark/network/TransportContext.java index 5b69e2bb0354..37ba543380f0 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/TransportContext.java +++ b/common/network-common/src/main/java/org/apache/spark/network/TransportContext.java @@ -62,8 +62,20 @@ public class TransportContext { private final RpcHandler rpcHandler; private final boolean closeIdleConnections; - private final MessageEncoder encoder; - private final MessageDecoder decoder; + /** + * Force the creation of MessageEncoder and MessageDecoder so that we can make sure they are created + * before the current context class loader is switched to ExecutorClassLoader. + * + * Netty's MessageToMessageEncoder uses Javassist to generate a matcher class, and the + * implementation calls "Class.forName" to check if this class is already generated.
If the + * following two objects are created in "ExecutorClassLoader.findClass", it will cause + * "ClassCircularityError". This is because loading this Netty generated class will call + * "ExecutorClassLoader.findClass" to search this class, and "ExecutorClassLoader" will try to use + * RPC to load it and cause to load the non-exist matcher class again. JVM will report + * `ClassCircularityError` to prevent such infinite recursion. (See SPARK-17714) + */ + private static final MessageEncoder ENCODER = MessageEncoder.INSTANCE; + private static final MessageDecoder DECODER = MessageDecoder.INSTANCE; public TransportContext(TransportConf conf, RpcHandler rpcHandler) { this(conf, rpcHandler, false); @@ -75,8 +87,6 @@ public TransportContext( boolean closeIdleConnections) { this.conf = conf; this.rpcHandler = rpcHandler; - this.encoder = new MessageEncoder(); - this.decoder = new MessageDecoder(); this.closeIdleConnections = closeIdleConnections; } @@ -135,9 +145,9 @@ public TransportChannelHandler initializePipeline( try { TransportChannelHandler channelHandler = createChannelHandler(channel, channelRpcHandler); channel.pipeline() - .addLast("encoder", encoder) + .addLast("encoder", ENCODER) .addLast(TransportFrameDecoder.HANDLER_NAME, NettyUtils.createFrameDecoder()) - .addLast("decoder", decoder) + .addLast("decoder", DECODER) .addLast("idleStateHandler", new IdleStateHandler(0, 0, conf.connectionTimeoutMs() / 1000)) // NOTE: Chunks are currently guaranteed to be returned in the order of request, but this // would require more logic to guarantee if this were not part of the same event loop. diff --git a/common/network-common/src/main/java/org/apache/spark/network/client/TransportClientFactory.java b/common/network-common/src/main/java/org/apache/spark/network/client/TransportClientFactory.java index e895f13f4545..b50e043d5c9c 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/client/TransportClientFactory.java +++ b/common/network-common/src/main/java/org/apache/spark/network/client/TransportClientFactory.java @@ -100,8 +100,10 @@ public TransportClientFactory( IOMode ioMode = IOMode.valueOf(conf.ioMode()); this.socketChannelClass = NettyUtils.getClientChannelClass(ioMode); - // TODO: Make thread pool name configurable. - this.workerGroup = NettyUtils.createEventLoop(ioMode, conf.clientThreads(), "shuffle-client"); + this.workerGroup = NettyUtils.createEventLoop( + ioMode, + conf.clientThreads(), + conf.getModuleName() + "-client"); this.pooledAllocator = NettyUtils.createPooledByteBufAllocator( conf.preferDirectBufs(), false /* allowCache */, conf.clientThreads()); } @@ -120,7 +122,8 @@ public TransportClientFactory( * * Concurrency: This method is safe to call from multiple threads. */ - public TransportClient createClient(String remoteHost, int remotePort) throws IOException { + public TransportClient createClient(String remoteHost, int remotePort) + throws IOException, InterruptedException { // Get connection from the connection pool first. // If it is not found or not active, create a new one. // Use unresolved address here to avoid DNS resolution each time we creates a client. @@ -188,13 +191,14 @@ public TransportClient createClient(String remoteHost, int remotePort) throws IO * As with {@link #createClient(String, int)}, this method is blocking. 
*/ public TransportClient createUnmanagedClient(String remoteHost, int remotePort) - throws IOException { + throws IOException, InterruptedException { final InetSocketAddress address = new InetSocketAddress(remoteHost, remotePort); return createClient(address); } /** Create a completely new {@link TransportClient} to the remote address. */ - private TransportClient createClient(InetSocketAddress address) throws IOException { + private TransportClient createClient(InetSocketAddress address) + throws IOException, InterruptedException { logger.debug("Creating new connection to {}", address); Bootstrap bootstrap = new Bootstrap(); @@ -221,7 +225,7 @@ public void initChannel(SocketChannel ch) { // Connect to the remote server long preConnect = System.nanoTime(); ChannelFuture cf = bootstrap.connect(address); - if (!cf.awaitUninterruptibly(conf.connectionTimeoutMs())) { + if (!cf.await(conf.connectionTimeoutMs())) { throw new IOException( String.format("Connecting to %s timed out (%s ms)", address, conf.connectionTimeoutMs())); } else if (cf.cause() != null) { diff --git a/common/network-common/src/main/java/org/apache/spark/network/protocol/MessageDecoder.java b/common/network-common/src/main/java/org/apache/spark/network/protocol/MessageDecoder.java index f0956438ade2..39a7495828a8 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/protocol/MessageDecoder.java +++ b/common/network-common/src/main/java/org/apache/spark/network/protocol/MessageDecoder.java @@ -35,6 +35,10 @@ public final class MessageDecoder extends MessageToMessageDecoder { private static final Logger logger = LoggerFactory.getLogger(MessageDecoder.class); + public static final MessageDecoder INSTANCE = new MessageDecoder(); + + private MessageDecoder() {} + @Override public void decode(ChannelHandlerContext ctx, ByteBuf in, List out) { Message.Type msgType = Message.Type.decode(in); diff --git a/common/network-common/src/main/java/org/apache/spark/network/protocol/MessageEncoder.java b/common/network-common/src/main/java/org/apache/spark/network/protocol/MessageEncoder.java index 276f16637efc..997f74e1a21b 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/protocol/MessageEncoder.java +++ b/common/network-common/src/main/java/org/apache/spark/network/protocol/MessageEncoder.java @@ -35,6 +35,10 @@ public final class MessageEncoder extends MessageToMessageEncoder { private static final Logger logger = LoggerFactory.getLogger(MessageEncoder.class); + public static final MessageEncoder INSTANCE = new MessageEncoder(); + + private MessageEncoder() {} + /*** * Encodes a Message by invoking its encode() method. For non-data messages, we will add one * ByteBuf to 'out' containing the total frame length, the message type, and the message itself. 
diff --git a/common/network-common/src/main/java/org/apache/spark/network/server/TransportChannelHandler.java b/common/network-common/src/main/java/org/apache/spark/network/server/TransportChannelHandler.java index c33848c8406c..56782a832787 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/server/TransportChannelHandler.java +++ b/common/network-common/src/main/java/org/apache/spark/network/server/TransportChannelHandler.java @@ -18,7 +18,7 @@ package org.apache.spark.network.server; import io.netty.channel.ChannelHandlerContext; -import io.netty.channel.SimpleChannelInboundHandler; +import io.netty.channel.ChannelInboundHandlerAdapter; import io.netty.handler.timeout.IdleState; import io.netty.handler.timeout.IdleStateEvent; import org.slf4j.Logger; @@ -26,7 +26,6 @@ import org.apache.spark.network.client.TransportClient; import org.apache.spark.network.client.TransportResponseHandler; -import org.apache.spark.network.protocol.Message; import org.apache.spark.network.protocol.RequestMessage; import org.apache.spark.network.protocol.ResponseMessage; import static org.apache.spark.network.util.NettyUtils.getRemoteAddress; @@ -48,7 +47,7 @@ * on the channel for at least `requestTimeoutMs`. Note that this is duplex traffic; we will not * timeout if the client is continuously sending but getting no responses, for simplicity. */ -public class TransportChannelHandler extends SimpleChannelInboundHandler { +public class TransportChannelHandler extends ChannelInboundHandlerAdapter { private static final Logger logger = LoggerFactory.getLogger(TransportChannelHandler.class); private final TransportClient client; @@ -88,14 +87,14 @@ public void channelActive(ChannelHandlerContext ctx) throws Exception { try { requestHandler.channelActive(); } catch (RuntimeException e) { - logger.error("Exception from request handler while registering channel", e); + logger.error("Exception from request handler while channel is active", e); } try { responseHandler.channelActive(); } catch (RuntimeException e) { - logger.error("Exception from response handler while registering channel", e); + logger.error("Exception from response handler while channel is active", e); } - super.channelRegistered(ctx); + super.channelActive(ctx); } @Override @@ -103,22 +102,24 @@ public void channelInactive(ChannelHandlerContext ctx) throws Exception { try { requestHandler.channelInactive(); } catch (RuntimeException e) { - logger.error("Exception from request handler while unregistering channel", e); + logger.error("Exception from request handler while channel is inactive", e); } try { responseHandler.channelInactive(); } catch (RuntimeException e) { - logger.error("Exception from response handler while unregistering channel", e); + logger.error("Exception from response handler while channel is inactive", e); } - super.channelUnregistered(ctx); + super.channelInactive(ctx); } @Override - public void channelRead0(ChannelHandlerContext ctx, Message request) throws Exception { + public void channelRead(ChannelHandlerContext ctx, Object request) throws Exception { if (request instanceof RequestMessage) { requestHandler.handle((RequestMessage) request); - } else { + } else if (request instanceof ResponseMessage) { responseHandler.handle((ResponseMessage) request); + } else { + ctx.fireChannelRead(request); } } diff --git a/common/network-common/src/main/java/org/apache/spark/network/server/TransportServer.java b/common/network-common/src/main/java/org/apache/spark/network/server/TransportServer.java index 
0d7a677820d3..047c5f3f1f09 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/server/TransportServer.java +++ b/common/network-common/src/main/java/org/apache/spark/network/server/TransportServer.java @@ -89,7 +89,7 @@ private void init(String hostToBind, int portToBind) { IOMode ioMode = IOMode.valueOf(conf.ioMode()); EventLoopGroup bossGroup = - NettyUtils.createEventLoop(ioMode, conf.serverThreads(), "shuffle-server"); + NettyUtils.createEventLoop(ioMode, conf.serverThreads(), conf.getModuleName() + "-server"); EventLoopGroup workerGroup = bossGroup; PooledByteBufAllocator allocator = NettyUtils.createPooledByteBufAllocator( diff --git a/common/network-common/src/main/java/org/apache/spark/network/util/TransportConf.java b/common/network-common/src/main/java/org/apache/spark/network/util/TransportConf.java index 64eaba103ccc..fc5cc091f6e6 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/util/TransportConf.java +++ b/common/network-common/src/main/java/org/apache/spark/network/util/TransportConf.java @@ -73,6 +73,10 @@ private String getConfKey(String suffix) { return "spark." + module + "." + suffix; } + public String getModuleName() { + return module; + } + /** IO mode: nio or epoll */ public String ioMode() { return conf.get(SPARK_NETWORK_IO_MODE_KEY, "NIO").toUpperCase(); } diff --git a/common/network-common/src/test/java/org/apache/spark/network/ProtocolSuite.java b/common/network-common/src/test/java/org/apache/spark/network/ProtocolSuite.java index 6c8dd742f4b6..bb1c40c4b0e0 100644 --- a/common/network-common/src/test/java/org/apache/spark/network/ProtocolSuite.java +++ b/common/network-common/src/test/java/org/apache/spark/network/ProtocolSuite.java @@ -49,11 +49,11 @@ public class ProtocolSuite { private void testServerToClient(Message msg) { EmbeddedChannel serverChannel = new EmbeddedChannel(new FileRegionEncoder(), - new MessageEncoder()); + MessageEncoder.INSTANCE); serverChannel.writeOutbound(msg); EmbeddedChannel clientChannel = new EmbeddedChannel( - NettyUtils.createFrameDecoder(), new MessageDecoder()); + NettyUtils.createFrameDecoder(), MessageDecoder.INSTANCE); while (!serverChannel.outboundMessages().isEmpty()) { clientChannel.writeInbound(serverChannel.readOutbound()); @@ -65,11 +65,11 @@ private void testServerToClient(Message msg) { private void testClientToServer(Message msg) { EmbeddedChannel clientChannel = new EmbeddedChannel(new FileRegionEncoder(), - new MessageEncoder()); + MessageEncoder.INSTANCE); clientChannel.writeOutbound(msg); EmbeddedChannel serverChannel = new EmbeddedChannel( - NettyUtils.createFrameDecoder(), new MessageDecoder()); + NettyUtils.createFrameDecoder(), MessageDecoder.INSTANCE); while (!clientChannel.outboundMessages().isEmpty()) { serverChannel.writeInbound(clientChannel.readOutbound()); diff --git a/common/network-common/src/test/java/org/apache/spark/network/RequestTimeoutIntegrationSuite.java b/common/network-common/src/test/java/org/apache/spark/network/RequestTimeoutIntegrationSuite.java index 959396bb8c26..9e8057438db9 100644 --- a/common/network-common/src/test/java/org/apache/spark/network/RequestTimeoutIntegrationSuite.java +++ b/common/network-common/src/test/java/org/apache/spark/network/RequestTimeoutIntegrationSuite.java @@ -226,6 +226,8 @@ public StreamManager getStreamManager() { callback0.latch.await(60, TimeUnit.SECONDS); assertTrue(callback0.failure instanceof IOException); + // make sure callback1 is called. 
+ callback1.latch.await(60, TimeUnit.SECONDS); // failed at same time as previous assertTrue(callback1.failure instanceof IOException); } diff --git a/common/network-common/src/test/java/org/apache/spark/network/TransportClientFactorySuite.java b/common/network-common/src/test/java/org/apache/spark/network/TransportClientFactorySuite.java index 44d16d54225e..43e63efed4a5 100644 --- a/common/network-common/src/test/java/org/apache/spark/network/TransportClientFactorySuite.java +++ b/common/network-common/src/test/java/org/apache/spark/network/TransportClientFactorySuite.java @@ -100,6 +100,8 @@ public void run() { clients.add(client); } catch (IOException e) { failed.incrementAndGet(); + } catch (InterruptedException e) { + throw new RuntimeException(e); } } }; @@ -143,7 +145,7 @@ public void reuseClientsUpToConfigVariableConcurrent() throws Exception { } @Test - public void returnDifferentClientsForDifferentServers() throws IOException { + public void returnDifferentClientsForDifferentServers() throws IOException, InterruptedException { TransportClientFactory factory = context.createClientFactory(); TransportClient c1 = factory.createClient(TestUtils.getLocalHost(), server1.getPort()); TransportClient c2 = factory.createClient(TestUtils.getLocalHost(), server2.getPort()); @@ -172,7 +174,7 @@ public void neverReturnInactiveClients() throws IOException, InterruptedExceptio } @Test - public void closeBlockClientsWithFactory() throws IOException { + public void closeBlockClientsWithFactory() throws IOException, InterruptedException { TransportClientFactory factory = context.createClientFactory(); TransportClient c1 = factory.createClient(TestUtils.getLocalHost(), server1.getPort()); TransportClient c2 = factory.createClient(TestUtils.getLocalHost(), server2.getPort()); diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index 511e1f29de36..1a2d85a2ead6 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.1.0-SNAPSHOT + 2.1.2-SNAPSHOT ../../pom.xml @@ -70,6 +70,18 @@ org.apache.spark spark-tags_${scala.binary.version} + + + + org.apache.spark + spark-tags_${scala.binary.version} + test-jar + test + + log4j log4j diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleClient.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleClient.java index 772fb88325b3..eea5cf7bec0f 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleClient.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleClient.java @@ -101,7 +101,7 @@ public void fetchBlocks( new RetryingBlockFetcher.BlockFetchStarter() { @Override public void createAndStart(String[] blockIds, BlockFetchingListener listener) - throws IOException { + throws IOException, InterruptedException { TransportClient client = clientFactory.createClient(host, port); new OneForOneBlockFetcher(client, appId, execId, blockIds, listener).start(); } @@ -136,7 +136,7 @@ public void registerWithShuffleServer( String host, int port, String execId, - ExecutorShuffleInfo executorInfo) throws IOException { + ExecutorShuffleInfo executorInfo) throws IOException, InterruptedException { checkInit(); TransportClient client = clientFactory.createUnmanagedClient(host, port); try { diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/RetryingBlockFetcher.java 
b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/RetryingBlockFetcher.java index 72bd0f803da3..5be855048e4d 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/RetryingBlockFetcher.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/RetryingBlockFetcher.java @@ -57,7 +57,8 @@ public interface BlockFetchStarter { * {@link org.apache.spark.network.client.TransportClientFactory} in order to fix connection * issues. */ - void createAndStart(String[] blockIds, BlockFetchingListener listener) throws IOException; + void createAndStart(String[] blockIds, BlockFetchingListener listener) + throws IOException, InterruptedException; } /** Shared executor service used for waiting and retrying. */ diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/mesos/MesosExternalShuffleClient.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/mesos/MesosExternalShuffleClient.java index 42cedd994315..c6d6029d5a2e 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/mesos/MesosExternalShuffleClient.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/mesos/MesosExternalShuffleClient.java @@ -69,7 +69,7 @@ public void registerDriverWithShuffleService( String host, int port, long heartbeatTimeoutMs, - long heartbeatIntervalMs) throws IOException { + long heartbeatIntervalMs) throws IOException, InterruptedException { checkInit(); ByteBuffer registerDriver = new RegisterDriver(appId, heartbeatTimeoutMs).toByteBuffer(); diff --git a/common/network-shuffle/src/test/java/org/apache/spark/network/sasl/SaslIntegrationSuite.java b/common/network-shuffle/src/test/java/org/apache/spark/network/sasl/SaslIntegrationSuite.java index 6ba937dddb2a..81a3f065d6fc 100644 --- a/common/network-shuffle/src/test/java/org/apache/spark/network/sasl/SaslIntegrationSuite.java +++ b/common/network-shuffle/src/test/java/org/apache/spark/network/sasl/SaslIntegrationSuite.java @@ -103,7 +103,7 @@ public void afterEach() { } @Test - public void testGoodClient() throws IOException { + public void testGoodClient() throws IOException, InterruptedException { clientFactory = context.createClientFactory( Lists.newArrayList( new SaslClientBootstrap(conf, "app-1", secretKeyHolder))); @@ -133,7 +133,7 @@ public void testBadClient() { } @Test - public void testNoSaslClient() throws IOException { + public void testNoSaslClient() throws IOException, InterruptedException { clientFactory = context.createClientFactory( Lists.newArrayList()); diff --git a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleIntegrationSuite.java b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleIntegrationSuite.java index 552b5366c593..a04b6825d19d 100644 --- a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleIntegrationSuite.java +++ b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleIntegrationSuite.java @@ -240,7 +240,7 @@ public void testFetchNoServer() throws Exception { } private void registerExecutor(String executorId, ExecutorShuffleInfo executorInfo) - throws IOException { + throws IOException, InterruptedException { ExternalShuffleClient client = new ExternalShuffleClient(conf, null, false, false); client.init(APP_ID); client.registerWithShuffleServer(TestUtils.getLocalHost(), server.getPort(), diff --git 
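
The recurring signature change in these files threads the checked InterruptedException from TransportClientFactory.createClient out through BlockFetchStarter and the shuffle clients instead of burying it in an IOException. Where a caller sits behind Runnable.run and cannot rethrow it (as in the test change above), a common shape, sketched here with assumed names, is:

    import java.io.IOException;

    public class FetchTask implements Runnable {

      interface Starter {
        // mirrors BlockFetchStarter.createAndStart after the change above
        void createAndStart(String[] blockIds) throws IOException, InterruptedException;
      }

      private final Starter starter;
      private final String[] blockIds;

      FetchTask(Starter starter, String[] blockIds) {
        this.starter = starter;
        this.blockIds = blockIds;
      }

      @Override
      public void run() {
        try {
          starter.createAndStart(blockIds);
        } catch (IOException e) {
          // retryable I/O failure: count it and let the retry logic decide
        } catch (InterruptedException e) {
          // run() cannot rethrow a checked exception; restore the interrupt
          // flag and surface the failure unchecked. (The test above simply
          // rethrows; restoring the flag is the stricter variant.)
          Thread.currentThread().interrupt();
          throw new RuntimeException(e);
        }
      }
    }
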
a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleSecuritySuite.java b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleSecuritySuite.java index a0f69ca29a28..6ef1a2d2c89f 100644 --- a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleSecuritySuite.java +++ b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleSecuritySuite.java @@ -59,7 +59,7 @@ public void afterEach() { } @Test - public void testValid() throws IOException { + public void testValid() throws IOException, InterruptedException { validate("my-app-id", "secret", false); } @@ -82,12 +82,13 @@ public void testBadSecret() { } @Test - public void testEncryption() throws IOException { + public void testEncryption() throws IOException, InterruptedException { validate("my-app-id", "secret", true); } /** Creates an ExternalShuffleClient and attempts to register with the server. */ - private void validate(String appId, String secretKey, boolean encrypt) throws IOException { + private void validate(String appId, String secretKey, boolean encrypt) + throws IOException, InterruptedException { ExternalShuffleClient client = new ExternalShuffleClient(conf, new TestSecretKeyHolder(appId, secretKey), true, encrypt); client.init(appId); diff --git a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/RetryingBlockFetcherSuite.java b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/RetryingBlockFetcherSuite.java index 91882e3b3bcd..f221544e9e5e 100644 --- a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/RetryingBlockFetcherSuite.java +++ b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/RetryingBlockFetcherSuite.java @@ -66,7 +66,7 @@ public void afterEach() { } @Test - public void testNoFailures() throws IOException { + public void testNoFailures() throws IOException, InterruptedException { BlockFetchingListener listener = mock(BlockFetchingListener.class); List> interactions = Arrays.asList( @@ -85,7 +85,7 @@ public void testNoFailures() throws IOException { } @Test - public void testUnrecoverableFailure() throws IOException { + public void testUnrecoverableFailure() throws IOException, InterruptedException { BlockFetchingListener listener = mock(BlockFetchingListener.class); List> interactions = Arrays.asList( @@ -104,7 +104,7 @@ public void testUnrecoverableFailure() throws IOException { } @Test - public void testSingleIOExceptionOnFirst() throws IOException { + public void testSingleIOExceptionOnFirst() throws IOException, InterruptedException { BlockFetchingListener listener = mock(BlockFetchingListener.class); List> interactions = Arrays.asList( @@ -127,7 +127,7 @@ public void testSingleIOExceptionOnFirst() throws IOException { } @Test - public void testSingleIOExceptionOnSecond() throws IOException { + public void testSingleIOExceptionOnSecond() throws IOException, InterruptedException { BlockFetchingListener listener = mock(BlockFetchingListener.class); List> interactions = Arrays.asList( @@ -149,7 +149,7 @@ public void testSingleIOExceptionOnSecond() throws IOException { } @Test - public void testTwoIOExceptions() throws IOException { + public void testTwoIOExceptions() throws IOException, InterruptedException { BlockFetchingListener listener = mock(BlockFetchingListener.class); List> interactions = Arrays.asList( @@ -177,7 +177,7 @@ public void testTwoIOExceptions() throws IOException { } @Test - public void 
testThreeIOExceptions() throws IOException { + public void testThreeIOExceptions() throws IOException, InterruptedException { BlockFetchingListener listener = mock(BlockFetchingListener.class); List> interactions = Arrays.asList( @@ -209,7 +209,7 @@ public void testThreeIOExceptions() throws IOException { } @Test - public void testRetryAndUnrecoverable() throws IOException { + public void testRetryAndUnrecoverable() throws IOException, InterruptedException { BlockFetchingListener listener = mock(BlockFetchingListener.class); List> interactions = Arrays.asList( @@ -252,7 +252,7 @@ public void testRetryAndUnrecoverable() throws IOException { @SuppressWarnings("unchecked") private static void performInteractions(List> interactions, BlockFetchingListener listener) - throws IOException { + throws IOException, InterruptedException { TransportConf conf = new TransportConf("shuffle", new SystemPropertyConfigProvider()); BlockFetchStarter fetchStarter = mock(BlockFetchStarter.class); diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 606ad1573961..fb6c24118589 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.1.0-SNAPSHOT + 2.1.2-SNAPSHOT ../../pom.xml @@ -50,6 +50,17 @@ spark-tags_${scala.binary.version} + + + org.apache.spark + spark-tags_${scala.binary.version} + test-jar + test + + org.apache.hadoop @@ -76,6 +87,9 @@ *:* + + org.scala-lang:scala-library + @@ -87,7 +101,7 @@ - + com.fasterxml.jackson ${spark.shade.packageName}.com.fasterxml.jackson diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 626f023a5b99..ff2d5c52730b 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.1.0-SNAPSHOT + 2.1.2-SNAPSHOT ../../pom.xml @@ -39,6 +39,18 @@ org.apache.spark spark-tags_${scala.binary.version} + + + + org.apache.spark + spark-tags_${scala.binary.version} + test-jar + test + + diff --git a/common/tags/pom.xml b/common/tags/pom.xml index 1c60d510e570..b9bf0342eb60 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.1.0-SNAPSHOT + 2.1.2-SNAPSHOT ../../pom.xml @@ -36,9 +36,9 @@ - org.scalatest - scalatest_${scala.binary.version} - compile + org.scala-lang + scala-library + ${scala.version} diff --git a/common/tags/src/main/java/org/apache/spark/tags/DockerTest.java b/common/tags/src/test/java/org/apache/spark/tags/DockerTest.java similarity index 100% rename from common/tags/src/main/java/org/apache/spark/tags/DockerTest.java rename to common/tags/src/test/java/org/apache/spark/tags/DockerTest.java diff --git a/common/tags/src/main/java/org/apache/spark/tags/ExtendedHiveTest.java b/common/tags/src/test/java/org/apache/spark/tags/ExtendedHiveTest.java similarity index 100% rename from common/tags/src/main/java/org/apache/spark/tags/ExtendedHiveTest.java rename to common/tags/src/test/java/org/apache/spark/tags/ExtendedHiveTest.java diff --git a/common/tags/src/main/java/org/apache/spark/tags/ExtendedYarnTest.java b/common/tags/src/test/java/org/apache/spark/tags/ExtendedYarnTest.java similarity index 100% rename from common/tags/src/main/java/org/apache/spark/tags/ExtendedYarnTest.java rename to common/tags/src/test/java/org/apache/spark/tags/ExtendedYarnTest.java diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index 45af98d94ef9..f8a0e577777e 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark 
spark-parent_2.11 - 2.1.0-SNAPSHOT + 2.1.2-SNAPSHOT ../../pom.xml @@ -39,6 +39,18 @@ org.apache.spark spark-tags_${scala.binary.version} + + + + org.apache.spark + spark-tags_${scala.binary.version} + test-jar + test + + com.twitter chill_${scala.binary.version} diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java index 671b8c747594..ba35cf250e48 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/Platform.java @@ -46,18 +46,22 @@ public final class Platform { private static final boolean unaligned; static { boolean _unaligned; - // use reflection to access unaligned field - try { - Class bitsClass = - Class.forName("java.nio.Bits", false, ClassLoader.getSystemClassLoader()); - Method unalignedMethod = bitsClass.getDeclaredMethod("unaligned"); - unalignedMethod.setAccessible(true); - _unaligned = Boolean.TRUE.equals(unalignedMethod.invoke(null)); - } catch (Throwable t) { - // We at least know x86 and x64 support unaligned access. - String arch = System.getProperty("os.arch", ""); - //noinspection DynamicRegexReplaceableByCompiledPattern - _unaligned = arch.matches("^(i[3-6]86|x86(_64)?|x64|amd64|aarch64)$"); + String arch = System.getProperty("os.arch", ""); + if (arch.equals("ppc64le") || arch.equals("ppc64")) { + // Since java.nio.Bits.unaligned() doesn't return true on ppc (See JDK-8165231), but ppc64 and ppc64le support it + _unaligned = true; + } else { + try { + Class bitsClass = + Class.forName("java.nio.Bits", false, ClassLoader.getSystemClassLoader()); + Method unalignedMethod = bitsClass.getDeclaredMethod("unaligned"); + unalignedMethod.setAccessible(true); + _unaligned = Boolean.TRUE.equals(unalignedMethod.invoke(null)); + } catch (Throwable t) { + // We at least know x86 and x64 support unaligned access. + //noinspection DynamicRegexReplaceableByCompiledPattern + _unaligned = arch.matches("^(i[3-6]86|x86(_64)?|x64|amd64|aarch64)$"); + } } unaligned = _unaligned; } diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/CalendarInterval.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/CalendarInterval.java index 518ed6470a75..a7b0e6f80c2b 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/CalendarInterval.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/CalendarInterval.java @@ -252,6 +252,10 @@ public static long parseSecondNano(String secondNano) throws IllegalArgumentExce public final int months; public final long microseconds; + public final long milliseconds() { + return this.microseconds / MICROS_PER_MILLI; + } + public CalendarInterval(int months, long microseconds) { this.months = months; this.microseconds = microseconds; diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index e09a6b7d93a9..b03e7182b16a 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -816,6 +816,190 @@ public UTF8String translate(Map dict) { return fromString(sb.toString()); } + private int getDigit(byte b) { + if (b >= '0' && b <= '9') { + return b - '0'; + } + throw new NumberFormatException(toString()); + } + + /** + * Parses this UTF8String to long. 
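
The Platform change above short-circuits the architecture check before the reflective probe. Restated on its own (the reflection targets java.nio.Bits, a JDK-internal class, hence the broad Throwable catch and the conservative fallback list):

    import java.lang.reflect.Method;

    public final class UnalignedProbe {

      public static boolean unalignedAccessSupported() {
        String arch = System.getProperty("os.arch", "");
        if (arch.equals("ppc64le") || arch.equals("ppc64")) {
          // java.nio.Bits.unaligned() reports false here (JDK-8165231),
          // but both platforms handle unaligned loads and stores.
          return true;
        }
        try {
          Class<?> bits =
            Class.forName("java.nio.Bits", false, ClassLoader.getSystemClassLoader());
          Method unaligned = bits.getDeclaredMethod("unaligned");
          unaligned.setAccessible(true);
          return Boolean.TRUE.equals(unaligned.invoke(null));
        } catch (Throwable t) {
          // The reflective probe can fail on restricted JDKs; assume support
          // only on architectures known to allow unaligned access.
          return arch.matches("^(i[3-6]86|x86(_64)?|x64|amd64|aarch64)$");
        }
      }

      private UnalignedProbe() {}
    }
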
+ * + * Note that, in this method we accumulate the result in negative format, and convert it to + * positive format at the end, if this string is not started with '-'. This is because min value + * is bigger than max value in digits, e.g. Integer.MAX_VALUE is '2147483647' and + * Integer.MIN_VALUE is '-2147483648'. + * + * This code is mostly copied from LazyLong.parseLong in Hive. + */ + public long toLong() { + if (numBytes == 0) { + throw new NumberFormatException("Empty string"); + } + + byte b = getByte(0); + final boolean negative = b == '-'; + int offset = 0; + if (negative || b == '+') { + offset++; + if (numBytes == 1) { + throw new NumberFormatException(toString()); + } + } + + final byte separator = '.'; + final int radix = 10; + final long stopValue = Long.MIN_VALUE / radix; + long result = 0; + + while (offset < numBytes) { + b = getByte(offset); + offset++; + if (b == separator) { + // We allow decimals and will return a truncated integral in that case. + // Therefore we won't throw an exception here (checking the fractional + // part happens below.) + break; + } + + int digit = getDigit(b); + // We are going to process the new digit and accumulate the result. However, before doing + // this, if the result is already smaller than the stopValue(Long.MIN_VALUE / radix), then + // result * 10 will definitely be smaller than minValue, and we can stop and throw exception. + if (result < stopValue) { + throw new NumberFormatException(toString()); + } + + result = result * radix - digit; + // Since the previous result is less than or equal to stopValue(Long.MIN_VALUE / radix), we + // can just use `result > 0` to check overflow. If result overflows, we should stop and throw + // exception. + if (result > 0) { + throw new NumberFormatException(toString()); + } + } + + // This is the case when we've encountered a decimal separator. The fractional + // part will not change the number, but we will verify that the fractional part + // is well formed. + while (offset < numBytes) { + if (getDigit(getByte(offset)) == -1) { + throw new NumberFormatException(toString()); + } + offset++; + } + + if (!negative) { + result = -result; + if (result < 0) { + throw new NumberFormatException(toString()); + } + } + + return result; + } + + /** + * Parses this UTF8String to int. + * + * Note that, in this method we accumulate the result in negative format, and convert it to + * positive format at the end, if this string is not started with '-'. This is because min value + * is bigger than max value in digits, e.g. Integer.MAX_VALUE is '2147483647' and + * Integer.MIN_VALUE is '-2147483648'. + * + * This code is mostly copied from LazyInt.parseInt in Hive. + * + * Note that, this method is almost same as `toLong`, but we leave it duplicated for performance + * reasons, like Hive does. + */ + public int toInt() { + if (numBytes == 0) { + throw new NumberFormatException("Empty string"); + } + + byte b = getByte(0); + final boolean negative = b == '-'; + int offset = 0; + if (negative || b == '+') { + offset++; + if (numBytes == 1) { + throw new NumberFormatException(toString()); + } + } + + final byte separator = '.'; + final int radix = 10; + final int stopValue = Integer.MIN_VALUE / radix; + int result = 0; + + while (offset < numBytes) { + b = getByte(offset); + offset++; + if (b == separator) { + // We allow decimals and will return a truncated integral in that case. + // Therefore we won't throw an exception here (checking the fractional + // part happens below.) 
+ break; + } + + int digit = getDigit(b); + // We are going to process the new digit and accumulate the result. However, before doing + // this, if the result is already smaller than the stopValue(Integer.MIN_VALUE / radix), then + // result * 10 will definitely be smaller than minValue, and we can stop and throw exception. + if (result < stopValue) { + throw new NumberFormatException(toString()); + } + + result = result * radix - digit; + // Since the previous result is less than or equal to stopValue(Integer.MIN_VALUE / radix), + // we can just use `result > 0` to check overflow. If result overflows, we should stop and + // throw exception. + if (result > 0) { + throw new NumberFormatException(toString()); + } + } + + // This is the case when we've encountered a decimal separator. The fractional + // part will not change the number, but we will verify that the fractional part + // is well formed. + while (offset < numBytes) { + if (getDigit(getByte(offset)) == -1) { + throw new NumberFormatException(toString()); + } + offset++; + } + + if (!negative) { + result = -result; + if (result < 0) { + throw new NumberFormatException(toString()); + } + } + + return result; + } + + public short toShort() { + int intValue = toInt(); + short result = (short) intValue; + if (result != intValue) { + throw new NumberFormatException(toString()); + } + + return result; + } + + public byte toByte() { + int intValue = toInt(); + byte result = (byte) intValue; + if (result != intValue) { + throw new NumberFormatException(toString()); + } + + return result; + } + @Override public String toString() { return new String(getBytes(), StandardCharsets.UTF_8); diff --git a/conf/docker.properties.template b/conf/docker.properties.template index 55cb094b4af4..2ecb4f1464a4 100644 --- a/conf/docker.properties.template +++ b/conf/docker.properties.template @@ -15,6 +15,6 @@ # limitations under the License. 
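
The accumulate-negative trick in the toLong/toInt additions above is worth seeing in isolation. A minimal digits-only variant (illustrative, not Spark code): accumulating toward Integer.MIN_VALUE works because the negative range is one wider than the positive range, so even "-2147483648" never overflows mid-parse.

    public final class NegativeAccumulationParse {

      public static int parseInt(String s) {
        boolean negative = s.startsWith("-");
        int i = negative || s.startsWith("+") ? 1 : 0;
        if (i == s.length()) throw new NumberFormatException(s);

        final int radix = 10;
        final int stopValue = Integer.MIN_VALUE / radix;
        int result = 0;
        for (; i < s.length(); i++) {
          char c = s.charAt(i);
          if (c < '0' || c > '9') throw new NumberFormatException(s);
          if (result < stopValue) throw new NumberFormatException(s);  // next *10 would overflow
          result = result * radix - (c - '0');
          if (result > 0) throw new NumberFormatException(s);          // wrapped past MIN_VALUE
        }
        if (!negative) {
          result = -result;
          if (result < 0) throw new NumberFormatException(s);  // was exactly MIN_VALUE
        }
        return result;
      }
    }
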
# -spark.mesos.executor.docker.image: +spark.mesos.executor.docker.image: spark.mesos.executor.docker.volumes: /usr/local/lib:/host/usr/local/lib:ro spark.mesos.executor.home: /opt/spark diff --git a/core/pom.xml b/core/pom.xml index eac99ab82a2e..bad3655452fb 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.1.0-SNAPSHOT + 2.1.2-SNAPSHOT ../pom.xml @@ -337,6 +337,18 @@ org.apache.spark spark-tags_${scala.binary.version} + + + + org.apache.spark + spark-tags_${scala.binary.version} + test-jar + test + + org.apache.commons commons-crypto diff --git a/core/src/main/java/org/apache/spark/io/NioBufferedFileInputStream.java b/core/src/main/java/org/apache/spark/io/NioBufferedFileInputStream.java index f6d1288cb263..ea5f1a9abf69 100644 --- a/core/src/main/java/org/apache/spark/io/NioBufferedFileInputStream.java +++ b/core/src/main/java/org/apache/spark/io/NioBufferedFileInputStream.java @@ -130,8 +130,10 @@ public synchronized void close() throws IOException { StorageUtils.dispose(byteBuffer); } + //checkstyle.off: NoFinalizer @Override protected void finalize() throws IOException { close(); } + //checkstyle.on: NoFinalizer } diff --git a/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java b/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java index 1a700aa37554..3385d0e6a31c 100644 --- a/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java +++ b/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java @@ -19,6 +19,7 @@ import javax.annotation.concurrent.GuardedBy; import java.io.IOException; +import java.nio.channels.ClosedByInterruptException; import java.util.Arrays; import java.util.BitSet; import java.util.HashSet; @@ -156,6 +157,10 @@ public long acquireExecutionMemory(long required, MemoryConsumer consumer) { break; } } + } catch (ClosedByInterruptException e) { + // This called by user to kill a task (e.g: speculative task). + logger.error("error while calling spill() on " + c, e); + throw new RuntimeException(e.getMessage()); } catch (IOException e) { logger.error("error while calling spill() on " + c, e); throw new OutOfMemoryError("error while calling spill() on " + c + " : " @@ -174,6 +179,10 @@ public long acquireExecutionMemory(long required, MemoryConsumer consumer) { Utils.bytesToString(released), consumer); got += memoryManager.acquireExecutionMemory(required - got, taskAttemptId, mode); } + } catch (ClosedByInterruptException e) { + // This called by user to kill a task (e.g: speculative task). 
+ logger.error("error while calling spill() on " + consumer, e); + throw new RuntimeException(e.getMessage()); } catch (IOException e) { logger.error("error while calling spill() on " + consumer, e); throw new OutOfMemoryError("error while calling spill() on " + consumer + " : " @@ -378,14 +387,14 @@ public long cleanUpAllAllocatedMemory() { for (MemoryConsumer c: consumers) { if (c != null && c.getUsed() > 0) { // In case of failed task, it's normal to see leaked memory - logger.warn("leak " + Utils.bytesToString(c.getUsed()) + " memory from " + c); + logger.debug("unreleased " + Utils.bytesToString(c.getUsed()) + " memory from " + c); } } consumers.clear(); for (MemoryBlock page : pageTable) { if (page != null) { - logger.warn("leak a page: " + page + " in task " + taskAttemptId); + logger.debug("unreleased page: " + page + " in task " + taskAttemptId); memoryManager.tungstenMemoryAllocator().free(page); } } diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriter.java b/core/src/main/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriter.java index f235c434be7b..2fde5c300f07 100644 --- a/core/src/main/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriter.java +++ b/core/src/main/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriter.java @@ -40,6 +40,8 @@ import org.apache.spark.executor.ShuffleWriteMetrics; import org.apache.spark.io.CompressionCodec; import org.apache.spark.io.CompressionCodec$; +import org.apache.commons.io.output.CloseShieldOutputStream; +import org.apache.commons.io.output.CountingOutputStream; import org.apache.spark.memory.TaskMemoryManager; import org.apache.spark.network.util.LimitedInputStream; import org.apache.spark.scheduler.MapStatus; @@ -264,6 +266,7 @@ private long[] mergeSpills(SpillInfo[] spills, File outputFile) throws IOExcepti sparkConf.getBoolean("spark.shuffle.unsafe.fastMergeEnabled", true); final boolean fastMergeIsSupported = !compressionEnabled || CompressionCodec$.MODULE$.supportsConcatenationOfSerializedStreams(compressionCodec); + final boolean encryptionEnabled = blockManager.serializerManager().encryptionEnabled(); try { if (spills.length == 0) { new FileOutputStream(outputFile).close(); // Create an empty file @@ -289,7 +292,7 @@ private long[] mergeSpills(SpillInfo[] spills, File outputFile) throws IOExcepti // Compression is disabled or we are using an IO compression codec that supports // decompression of concatenated compressed streams, so we can perform a fast spill merge // that doesn't need to interpret the spilled bytes. - if (transferToEnabled) { + if (transferToEnabled && !encryptionEnabled) { logger.debug("Using transferTo-based fast merge"); partitionLengths = mergeSpillsWithTransferTo(spills, outputFile); } else { @@ -320,9 +323,9 @@ private long[] mergeSpills(SpillInfo[] spills, File outputFile) throws IOExcepti /** * Merges spill files using Java FileStreams. This code path is slower than the NIO-based merge, * {@link UnsafeShuffleWriter#mergeSpillsWithTransferTo(SpillInfo[], File)}, so it's only used in - * cases where the IO compression codec does not support concatenation of compressed data, or in - * cases where users have explicitly disabled use of {@code transferTo} in order to work around - * kernel bugs. + * cases where the IO compression codec does not support concatenation of compressed data, when + * encryption is enabled, or when users have explicitly disabled use of {@code transferTo} in + * order to work around kernel bugs. * * @param spills the spills to merge. 
* @param outputFile the file to write the merged data to. @@ -337,7 +340,11 @@ private long[] mergeSpillsWithFileStream( final int numPartitions = partitioner.numPartitions(); final long[] partitionLengths = new long[numPartitions]; final InputStream[] spillInputStreams = new FileInputStream[spills.length]; - OutputStream mergedFileOutputStream = null; + + // Use a counting output stream to avoid having to close the underlying file and ask + // the file system for its size after each partition is written. + final CountingOutputStream mergedFileOutputStream = new CountingOutputStream( + new FileOutputStream(outputFile)); boolean threwException = true; try { @@ -345,34 +352,35 @@ private long[] mergeSpillsWithFileStream( spillInputStreams[i] = new FileInputStream(spills[i].file); } for (int partition = 0; partition < numPartitions; partition++) { - final long initialFileLength = outputFile.length(); - mergedFileOutputStream = - new TimeTrackingOutputStream(writeMetrics, new FileOutputStream(outputFile, true)); + final long initialFileLength = mergedFileOutputStream.getByteCount(); + // Shield the underlying output stream from close() calls, so that we can close the higher + // level streams to make sure all data is really flushed and internal state is cleaned. + OutputStream partitionOutput = new CloseShieldOutputStream( + new TimeTrackingOutputStream(writeMetrics, mergedFileOutputStream)); + partitionOutput = blockManager.serializerManager().wrapForEncryption(partitionOutput); if (compressionCodec != null) { - mergedFileOutputStream = compressionCodec.compressedOutputStream(mergedFileOutputStream); + partitionOutput = compressionCodec.compressedOutputStream(partitionOutput); } - for (int i = 0; i < spills.length; i++) { final long partitionLengthInSpill = spills[i].partitionLengths[partition]; if (partitionLengthInSpill > 0) { - InputStream partitionInputStream = null; - boolean innerThrewException = true; + InputStream partitionInputStream = new LimitedInputStream(spillInputStreams[i], + partitionLengthInSpill, false); try { - partitionInputStream = - new LimitedInputStream(spillInputStreams[i], partitionLengthInSpill, false); + partitionInputStream = blockManager.serializerManager().wrapForEncryption( + partitionInputStream); if (compressionCodec != null) { partitionInputStream = compressionCodec.compressedInputStream(partitionInputStream); } - ByteStreams.copy(partitionInputStream, mergedFileOutputStream); - innerThrewException = false; + ByteStreams.copy(partitionInputStream, partitionOutput); } finally { - Closeables.close(partitionInputStream, innerThrewException); + partitionInputStream.close(); } } } - mergedFileOutputStream.flush(); - mergedFileOutputStream.close(); - partitionLengths[partition] = (outputFile.length() - initialFileLength); + partitionOutput.flush(); + partitionOutput.close(); + partitionLengths[partition] = (mergedFileOutputStream.getByteCount() - initialFileLength); } threwException = false; } finally { @@ -414,17 +422,14 @@ private long[] mergeSpillsWithTransferTo(SpillInfo[] spills, File outputFile) th for (int partition = 0; partition < numPartitions; partition++) { for (int i = 0; i < spills.length; i++) { final long partitionLengthInSpill = spills[i].partitionLengths[partition]; - long bytesToTransfer = partitionLengthInSpill; final FileChannel spillInputChannel = spillInputChannels[i]; final long writeStartTime = System.nanoTime(); - while (bytesToTransfer > 0) { - final long actualBytesTransferred = spillInputChannel.transferTo( - 
spillInputChannelPositions[i], - bytesToTransfer, - mergedFileOutputChannel); - spillInputChannelPositions[i] += actualBytesTransferred; - bytesToTransfer -= actualBytesTransferred; - } + Utils.copyFileStreamNIO( + spillInputChannel, + mergedFileOutputChannel, + spillInputChannelPositions[i], + partitionLengthInSpill); + spillInputChannelPositions[i] += partitionLengthInSpill; writeMetrics.incWriteTime(System.nanoTime() - writeStartTime); bytesWrittenToMergedFile += partitionLengthInSpill; partitionLengths[partition] += partitionLengthInSpill; diff --git a/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java b/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java index d2fcdea4f2ce..4bef21b6b4e4 100644 --- a/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java +++ b/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java @@ -170,6 +170,8 @@ public final class BytesToBytesMap extends MemoryConsumer { private long peakMemoryUsedBytes = 0L; + private final int initialCapacity; + private final BlockManager blockManager; private final SerializerManager serializerManager; private volatile MapIterator destructiveIterator = null; @@ -202,6 +204,7 @@ public BytesToBytesMap( throw new IllegalArgumentException("Page size " + pageSizeBytes + " cannot exceed " + TaskMemoryManager.MAXIMUM_PAGE_SIZE_BYTES); } + this.initialCapacity = initialCapacity; allocate(initialCapacity); } @@ -695,7 +698,7 @@ public boolean append(Object kbase, long koff, int klen, Object vbase, long voff if (numKeys == MAX_CAPACITY // The map could be reused from last spill (because of no enough memory to grow), // then we don't try to grow again if hit the `growthThreshold`. - || !canGrowArray && numKeys > growthThreshold) { + || !canGrowArray && numKeys >= growthThreshold) { return false; } @@ -739,7 +742,7 @@ public boolean append(Object kbase, long koff, int klen, Object vbase, long voff longArray.set(pos * 2 + 1, keyHashcode); isDefined = true; - if (numKeys > growthThreshold && longArray.size() < MAX_CAPACITY) { + if (numKeys >= growthThreshold && longArray.size() < MAX_CAPACITY) { try { growAndRehash(); } catch (OutOfMemoryError oom) { @@ -902,12 +905,13 @@ public LongArray getArray() { public void reset() { numKeys = 0; numValues = 0; - longArray.zeroOut(); - + freeArray(longArray); while (dataPages.size() > 0) { MemoryBlock dataPage = dataPages.removeLast(); freePage(dataPage); } + allocate(initialCapacity); + canGrowArray = true; currentPage = null; pageCursor = 0; } diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/RadixSort.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/RadixSort.java index 404361734a55..3dd318471008 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/RadixSort.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/RadixSort.java @@ -17,6 +17,8 @@ package org.apache.spark.util.collection.unsafe.sort; +import com.google.common.primitives.Ints; + import org.apache.spark.unsafe.Platform; import org.apache.spark.unsafe.array.LongArray; @@ -40,14 +42,14 @@ public class RadixSort { * of always copying the data back to position zero for efficiency. 
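
In the mergeSpillsWithTransferTo hunk above, the hand-written loop is replaced by Utils.copyFileStreamNIO. The name comes from the diff; the body below is only a sketch of the invariant such a helper must keep, since FileChannel.transferTo may legally transfer fewer bytes than requested:

    import java.io.IOException;
    import java.nio.channels.FileChannel;

    public final class ChannelCopy {

      // Loop until the requested span has been fully copied; a single
      // transferTo call gives no such guarantee. Production code may also
      // want to check for forward progress to avoid spinning on 0 returns.
      public static void copyFileStreamNIO(
          FileChannel input, FileChannel output, long startPosition, long bytesToCopy)
          throws IOException {
        long position = startPosition;
        long remaining = bytesToCopy;
        while (remaining > 0) {
          long transferred = input.transferTo(position, remaining, output);
          position += transferred;
          remaining -= transferred;
        }
      }
    }
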
*/ public static int sort( - LongArray array, int numRecords, int startByteIndex, int endByteIndex, + LongArray array, long numRecords, int startByteIndex, int endByteIndex, boolean desc, boolean signed) { assert startByteIndex >= 0 : "startByteIndex (" + startByteIndex + ") should >= 0"; assert endByteIndex <= 7 : "endByteIndex (" + endByteIndex + ") should <= 7"; assert endByteIndex > startByteIndex; assert numRecords * 2 <= array.size(); - int inIndex = 0; - int outIndex = numRecords; + long inIndex = 0; + long outIndex = numRecords; if (numRecords > 0) { long[][] counts = getCounts(array, numRecords, startByteIndex, endByteIndex); for (int i = startByteIndex; i <= endByteIndex; i++) { @@ -55,13 +57,13 @@ public static int sort( sortAtByte( array, numRecords, counts[i], i, inIndex, outIndex, desc, signed && i == endByteIndex); - int tmp = inIndex; + long tmp = inIndex; inIndex = outIndex; outIndex = tmp; } } } - return inIndex; + return Ints.checkedCast(inIndex); } /** @@ -78,14 +80,14 @@ public static int sort( * @param signed whether this is a signed (two's complement) sort (only applies to last byte). */ private static void sortAtByte( - LongArray array, int numRecords, long[] counts, int byteIdx, int inIndex, int outIndex, + LongArray array, long numRecords, long[] counts, int byteIdx, long inIndex, long outIndex, boolean desc, boolean signed) { assert counts.length == 256; long[] offsets = transformCountsToOffsets( - counts, numRecords, array.getBaseOffset() + outIndex * 8, 8, desc, signed); + counts, numRecords, array.getBaseOffset() + outIndex * 8L, 8, desc, signed); Object baseObject = array.getBaseObject(); - long baseOffset = array.getBaseOffset() + inIndex * 8; - long maxOffset = baseOffset + numRecords * 8; + long baseOffset = array.getBaseOffset() + inIndex * 8L; + long maxOffset = baseOffset + numRecords * 8L; for (long offset = baseOffset; offset < maxOffset; offset += 8) { long value = Platform.getLong(baseObject, offset); int bucket = (int)((value >>> (byteIdx * 8)) & 0xff); @@ -106,13 +108,13 @@ private static void sortAtByte( * significant byte. If the byte does not need sorting the array will be null. */ private static long[][] getCounts( - LongArray array, int numRecords, int startByteIndex, int endByteIndex) { + LongArray array, long numRecords, int startByteIndex, int endByteIndex) { long[][] counts = new long[8][]; // Optimization: do a fast pre-pass to determine which byte indices we can skip for sorting. // If all the byte values at a particular index are the same we don't need to count it. long bitwiseMax = 0; long bitwiseMin = -1L; - long maxOffset = array.getBaseOffset() + numRecords * 8; + long maxOffset = array.getBaseOffset() + numRecords * 8L; Object baseObject = array.getBaseObject(); for (long offset = array.getBaseOffset(); offset < maxOffset; offset += 8) { long value = Platform.getLong(baseObject, offset); @@ -146,18 +148,18 @@ private static long[][] getCounts( * @return the input counts array. */ private static long[] transformCountsToOffsets( - long[] counts, int numRecords, long outputOffset, int bytesPerRecord, + long[] counts, long numRecords, long outputOffset, long bytesPerRecord, boolean desc, boolean signed) { assert counts.length == 256; int start = signed ? 128 : 0; // output the negative records first (values 129-255). 
if (desc) { - int pos = numRecords; + long pos = numRecords; for (int i = start; i < start + 256; i++) { pos -= counts[i & 0xff]; counts[i & 0xff] = outputOffset + pos * bytesPerRecord; } } else { - int pos = 0; + long pos = 0; for (int i = start; i < start + 256; i++) { long tmp = counts[i & 0xff]; counts[i & 0xff] = outputOffset + pos * bytesPerRecord; @@ -176,8 +178,8 @@ private static long[] transformCountsToOffsets( */ public static int sortKeyPrefixArray( LongArray array, - int startIndex, - int numRecords, + long startIndex, + long numRecords, int startByteIndex, int endByteIndex, boolean desc, @@ -186,8 +188,8 @@ public static int sortKeyPrefixArray( assert endByteIndex <= 7 : "endByteIndex (" + endByteIndex + ") should <= 7"; assert endByteIndex > startByteIndex; assert numRecords * 4 <= array.size(); - int inIndex = startIndex; - int outIndex = startIndex + numRecords * 2; + long inIndex = startIndex; + long outIndex = startIndex + numRecords * 2L; if (numRecords > 0) { long[][] counts = getKeyPrefixArrayCounts( array, startIndex, numRecords, startByteIndex, endByteIndex); @@ -196,13 +198,13 @@ public static int sortKeyPrefixArray( sortKeyPrefixArrayAtByte( array, numRecords, counts[i], i, inIndex, outIndex, desc, signed && i == endByteIndex); - int tmp = inIndex; + long tmp = inIndex; inIndex = outIndex; outIndex = tmp; } } } - return inIndex; + return Ints.checkedCast(inIndex); } /** @@ -210,7 +212,7 @@ public static int sortKeyPrefixArray( * getCounts with some added parameters but that seems to hurt in benchmarks. */ private static long[][] getKeyPrefixArrayCounts( - LongArray array, int startIndex, int numRecords, int startByteIndex, int endByteIndex) { + LongArray array, long startIndex, long numRecords, int startByteIndex, int endByteIndex) { long[][] counts = new long[8][]; long bitwiseMax = 0; long bitwiseMin = -1L; @@ -238,11 +240,11 @@ private static long[][] getKeyPrefixArrayCounts( * Specialization of sortAtByte() for key-prefix arrays. 
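
The int-to-long widening running through RadixSort above keeps all cursor arithmetic in long and narrows only at the public boundary, where Guava's Ints.checkedCast makes the narrowing fail loudly instead of wrapping. A small demonstration of the failure mode (record counts are made up):

    import com.google.common.primitives.Ints;

    public final class SafeNarrowing {
      public static void main(String[] args) {
        long numRecords = 1L << 30;       // a plausible in-memory record count
        long outIndex = numRecords * 2L;  // 2^31: exceeds Integer.MAX_VALUE

        // A plain (int) cast would wrap silently to Integer.MIN_VALUE here;
        // checkedCast turns the latent narrowing bug into an immediate error.
        try {
          int narrowed = Ints.checkedCast(outIndex);
          System.out.println("fits: " + narrowed);
        } catch (IllegalArgumentException e) {
          System.out.println("out of int range: " + outIndex);
        }
      }
    }
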
*/ private static void sortKeyPrefixArrayAtByte( - LongArray array, int numRecords, long[] counts, int byteIdx, int inIndex, int outIndex, + LongArray array, long numRecords, long[] counts, int byteIdx, long inIndex, long outIndex, boolean desc, boolean signed) { assert counts.length == 256; long[] offsets = transformCountsToOffsets( - counts, numRecords, array.getBaseOffset() + outIndex * 8, 16, desc, signed); + counts, numRecords, array.getBaseOffset() + outIndex * 8L, 16, desc, signed); Object baseObject = array.getBaseObject(); long baseOffset = array.getBaseOffset() + inIndex * 8L; long maxOffset = baseOffset + numRecords * 16L; diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java index 2a71e68adafa..5b42843717e9 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java @@ -22,6 +22,8 @@ import org.apache.avro.reflect.Nullable; +import org.apache.spark.TaskContext; +import org.apache.spark.TaskKilledException; import org.apache.spark.memory.MemoryConsumer; import org.apache.spark.memory.TaskMemoryManager; import org.apache.spark.unsafe.Platform; @@ -253,6 +255,7 @@ public final class SortedIterator extends UnsafeSorterIterator implements Clonea private long keyPrefix; private int recordLength; private long currentPageNumber; + private final TaskContext taskContext = TaskContext.get(); private SortedIterator(int numRecords, int offset) { this.numRecords = numRecords; @@ -283,6 +286,14 @@ public boolean hasNext() { @Override public void loadNext() { + // Kill the task in case it has been marked as killed. This logic is from + // InterruptibleIterator, but we inline it here instead of wrapping the iterator in order + // to avoid performance overhead. This check is added here in `loadNext()` instead of in + // `hasNext()` because it's technically possible for the caller to be relying on + // `getNumRecords()` instead of `hasNext()` to know when to stop. 
+ if (taskContext != null && taskContext.isInterrupted()) { + throw new TaskKilledException(); + } // This pointer points to a 4-byte record length, followed by the record's bytes final long recordPointer = array.get(offset + position); currentPageNumber = TaskMemoryManager.decodePageNumber(recordPointer); @@ -322,7 +333,7 @@ public UnsafeSorterIterator getSortedIterator() { if (sortComparator != null) { if (this.radixSortSupport != null) { offset = RadixSort.sortKeyPrefixArray( - array, nullBoundaryPos, (pos - nullBoundaryPos) / 2, 0, 7, + array, nullBoundaryPos, (pos - nullBoundaryPos) / 2L, 0, 7, radixSortSupport.sortDescending(), radixSortSupport.sortSigned()); } else { MemoryBlock unused = new MemoryBlock( diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java index a658e5eb47b7..b6323c624b7b 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java @@ -23,6 +23,8 @@ import com.google.common.io.Closeables; import org.apache.spark.SparkEnv; +import org.apache.spark.TaskContext; +import org.apache.spark.TaskKilledException; import org.apache.spark.io.NioBufferedFileInputStream; import org.apache.spark.serializer.SerializerManager; import org.apache.spark.storage.BlockId; @@ -51,6 +53,7 @@ public final class UnsafeSorterSpillReader extends UnsafeSorterIterator implemen private byte[] arr = new byte[1024 * 1024]; private Object baseObject = arr; private final long baseOffset = Platform.BYTE_ARRAY_OFFSET; + private final TaskContext taskContext = TaskContext.get(); public UnsafeSorterSpillReader( SerializerManager serializerManager, @@ -94,6 +97,14 @@ public boolean hasNext() { @Override public void loadNext() throws IOException { + // Kill the task in case it has been marked as killed. This logic is from + // InterruptibleIterator, but we inline it here instead of wrapping the iterator in order + // to avoid performance overhead. This check is added here in `loadNext()` instead of in + // `hasNext()` because it's technically possible for the caller to be relying on + // `getNumRecords()` instead of `hasNext()` to know when to stop. 
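
Both loadNext() changes in this patch inline the same cooperative cancellation check rather than wrapping the iterator. Reduced to a generic pattern (a plain RuntimeException and an AtomicBoolean stand in for Spark's TaskKilledException and TaskContext.isInterrupted()):

    import java.util.concurrent.atomic.AtomicBoolean;

    final class SpillScanner {
      private final AtomicBoolean killed;  // stand-in for the task's kill flag
      private final int numRecords;
      private int position;

      SpillScanner(AtomicBoolean killed, int numRecords) {
        this.killed = killed;
        this.numRecords = numRecords;
      }

      boolean hasNext() { return position < numRecords; }

      // Poll the kill flag once per record at the top of the hot loading
      // method: cancellation costs one branch per record, with none of the
      // wrapper indirection an InterruptibleIterator would add.
      void loadNext() {
        if (killed.get()) {
          throw new RuntimeException("task killed");
        }
        position++;  // real code would decode the next record here
      }
    }
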
+ if (taskContext != null && taskContext.isInterrupted()) { + throw new TaskKilledException(); + } recordLength = din.readInt(); keyPrefix = din.readLong(); if (recordLength > arr.length) { diff --git a/core/src/main/resources/org/apache/spark/ui/static/executorspage.js b/core/src/main/resources/org/apache/spark/ui/static/executorspage.js index 1df67337ea03..fe5db6aa26b6 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/executorspage.js +++ b/core/src/main/resources/org/apache/spark/ui/static/executorspage.js @@ -411,10 +411,6 @@ $(document).ready(function () { } ], "columnDefs": [ - { - "targets": [ 15 ], - "visible": logsExist(response) - }, { "targets": [ 16 ], "visible": getThreadDumpEnabled() @@ -423,7 +419,8 @@ $(document).ready(function () { "order": [[0, "asc"]] }; - $(selector).DataTable(conf); + var dt = $(selector).DataTable(conf); + dt.column(15).visible(logsExist(response)); $('#active-executors [data-toggle="tooltip"]').tooltip(); var sumSelector = "#summary-execs-table"; diff --git a/core/src/main/resources/org/apache/spark/ui/static/historypage-common.js b/core/src/main/resources/org/apache/spark/ui/static/historypage-common.js new file mode 100644 index 000000000000..55d540d8317a --- /dev/null +++ b/core/src/main/resources/org/apache/spark/ui/static/historypage-common.js @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +$(document).ready(function() { + if ($('#last-updated').length) { + var lastUpdatedMillis = Number($('#last-updated').text()); + var updatedDate = new Date(lastUpdatedMillis); + $('#last-updated').text(updatedDate.toLocaleDateString()+", "+updatedDate.toLocaleTimeString()) + } +}); diff --git a/core/src/main/resources/org/apache/spark/ui/static/historypage-template.html b/core/src/main/resources/org/apache/spark/ui/static/historypage-template.html index 1fd6ef4a7125..c2afa993b2f2 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/historypage-template.html +++ b/core/src/main/resources/org/apache/spark/ui/static/historypage-template.html @@ -39,7 +39,7 @@ Started - + Completed @@ -68,16 +68,16 @@ {{#applications}} - {{id}} + {{id}} {{name}} {{#attempts}} - {{attemptId}} + {{attemptId}} {{startTime}} - {{endTime}} + {{endTime}} {{duration}} {{sparkUser}} {{lastUpdated}} - Download + Download {{/attempts}} {{/applications}} diff --git a/core/src/main/resources/org/apache/spark/ui/static/historypage.js b/core/src/main/resources/org/apache/spark/ui/static/historypage.js index 2a32e18672a2..a4430347a0b2 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/historypage.js +++ b/core/src/main/resources/org/apache/spark/ui/static/historypage.js @@ -114,12 +114,19 @@ $(document).ready(function() { attempt["startTime"] = formatDate(attempt["startTime"]); attempt["endTime"] = formatDate(attempt["endTime"]); attempt["lastUpdated"] = formatDate(attempt["lastUpdated"]); + attempt["log"] = uiRoot + "/api/v1/applications/" + id + "/" + + (attempt.hasOwnProperty("attemptId") ? attempt["attemptId"] + "/" : "") + "logs"; + var app_clone = {"id" : id, "name" : name, "num" : num, "attempts" : [attempt]}; array.push(app_clone); } } - var data = {"applications": array} + var data = { + "uiroot": uiRoot, + "applications": array + } + $.get("static/historypage-template.html", function(template) { historySummary.append(Mustache.render($(template).filter("#history-summary-template").html(),data)); var selector = "#history-summary-table"; @@ -135,6 +142,9 @@ $(document).ready(function() { {name: 'eighth'}, {name: 'ninth'}, ], + "columnDefs": [ + {"searchable": false, "targets": [5]} + ], "autoWidth": false, "order": [[ 4, "desc" ]] }; @@ -161,6 +171,13 @@ $(document).ready(function() { } } + if (requestedIncomplete) { + var completedCells = document.getElementsByClassName("completedColumn"); + for (i = 0; i < completedCells.length; i++) { + completedCells[i].style.display='none'; + } + } + var durationCells = document.getElementsByClassName("durationClass"); for (i = 0; i < durationCells.length; i++) { var timeInMilliseconds = parseInt(durationCells[i].title); diff --git a/core/src/main/resources/org/apache/spark/ui/static/log-view.js b/core/src/main/resources/org/apache/spark/ui/static/log-view.js index 1782b4f209c0..b5c43e5788bc 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/log-view.js +++ b/core/src/main/resources/org/apache/spark/ui/static/log-view.js @@ -51,13 +51,26 @@ function noNewAlert() { window.setTimeout(function () {alert.css("display", "none");}, 4000); } + +function getRESTEndPoint() { + // If the worker is served from the master through a proxy (see doc on spark.ui.reverseProxy), + // we need to retain the leading ../proxy// part of the URL when making REST requests. + // Similar logic is contained in executorspage.js function createRESTEndPoint. 
+ var words = document.baseURI.split('/'); + var ind = words.indexOf("proxy"); + if (ind > 0) { + return words.slice(0, ind + 2).join('/') + "/log"; + } + return "/log" +} + function loadMore() { var offset = Math.max(startByte - byteLength, 0); var moreByteLength = Math.min(byteLength, startByte); $.ajax({ type: "GET", - url: "/log" + baseParams + "&offset=" + offset + "&byteLength=" + moreByteLength, + url: getRESTEndPoint() + baseParams + "&offset=" + offset + "&byteLength=" + moreByteLength, success: function (data) { var oldHeight = $(".log-content")[0].scrollHeight; var newlineIndex = data.indexOf('\n'); @@ -83,14 +96,14 @@ function loadMore() { function loadNew() { $.ajax({ type: "GET", - url: "/log" + baseParams + "&byteLength=0", + url: getRESTEndPoint() + baseParams + "&byteLength=0", success: function (data) { var dataInfo = data.substring(0, data.indexOf('\n')).match(/\d+/g); var newDataLen = dataInfo[2] - totalLogLength; if (newDataLen != 0) { $.ajax({ type: "GET", - url: "/log" + baseParams + "&byteLength=" + newDataLen, + url: getRESTEndPoint() + baseParams + "&byteLength=" + newDataLen, success: function (data) { var newlineIndex = data.indexOf('\n'); var dataInfo = data.substring(0, newlineIndex).match(/\d+/g); diff --git a/core/src/main/resources/org/apache/spark/ui/static/sorttable.js b/core/src/main/resources/org/apache/spark/ui/static/sorttable.js index ff241470f32d..9960d5c34d1f 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/sorttable.js +++ b/core/src/main/resources/org/apache/spark/ui/static/sorttable.js @@ -207,8 +207,8 @@ sorttable = { hasInputs = (typeof node.getElementsByTagName == 'function') && node.getElementsByTagName('input').length; - - if (node.getAttribute("sorttable_customkey") != null) { + + if (node.nodeType == 1 && node.getAttribute("sorttable_customkey") != null) { return node.getAttribute("sorttable_customkey"); } else if (typeof node.textContent != 'undefined' && !hasInputs) { diff --git a/core/src/main/resources/org/apache/spark/ui/static/table.js b/core/src/main/resources/org/apache/spark/ui/static/table.js index 14b06bfe860e..0315ebf5c48a 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/table.js +++ b/core/src/main/resources/org/apache/spark/ui/static/table.js @@ -36,7 +36,7 @@ function toggleThreadStackTrace(threadId, forceAdd) { if (stackTrace.length == 0) { var stackTraceText = $('#' + threadId + "_td_stacktrace").html() var threadCell = $("#thread_" + threadId + "_tr") - threadCell.after("
" +
+        threadCell.after("
" +
             stackTraceText +  "
") } else { if (!forceAdd) { @@ -73,6 +73,7 @@ function onMouseOverAndOut(threadId) { $("#" + threadId + "_td_id").toggleClass("threaddump-td-mouseover"); $("#" + threadId + "_td_name").toggleClass("threaddump-td-mouseover"); $("#" + threadId + "_td_state").toggleClass("threaddump-td-mouseover"); + $("#" + threadId + "_td_locking").toggleClass("threaddump-td-mouseover"); } function onSearchStringChange() { diff --git a/core/src/main/resources/org/apache/spark/ui/static/webui.css b/core/src/main/resources/org/apache/spark/ui/static/webui.css index b157f3e0a407..319a719efaa7 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/webui.css +++ b/core/src/main/resources/org/apache/spark/ui/static/webui.css @@ -246,4 +246,8 @@ a.expandbutton { text-align: center; margin: 0; padding: 4px 0; +} + +.table-cell-width-limited td { + max-width: 600px; } \ No newline at end of file diff --git a/core/src/main/resources/org/apache/spark/ui/static/webui.js b/core/src/main/resources/org/apache/spark/ui/static/webui.js index e37307aa1f70..0fa1fcf25f8b 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/webui.js +++ b/core/src/main/resources/org/apache/spark/ui/static/webui.js @@ -15,6 +15,12 @@ * limitations under the License. */ +var uiRoot = ""; + +function setUIRoot(val) { + uiRoot = val; +} + function collapseTablePageLoad(name, table){ if (window.localStorage.getItem(name) == "true") { // Set it to false so that the click function can revert it diff --git a/core/src/main/scala/org/apache/spark/Accumulator.scala b/core/src/main/scala/org/apache/spark/Accumulator.scala index 9d1f1d59dbce..7bea636c94aa 100644 --- a/core/src/main/scala/org/apache/spark/Accumulator.scala +++ b/core/src/main/scala/org/apache/spark/Accumulator.scala @@ -26,7 +26,7 @@ package org.apache.spark * * An accumulator is created from an initial value `v` by calling * [[SparkContext#accumulator SparkContext.accumulator]]. - * Tasks running on the cluster can then add to it using the [[Accumulable#+= +=]] operator. + * Tasks running on the cluster can then add to it using the `+=` operator. * However, they cannot read its value. Only the driver program can read the accumulator's value, * using its [[#value]] method. * diff --git a/core/src/main/scala/org/apache/spark/ContextCleaner.scala b/core/src/main/scala/org/apache/spark/ContextCleaner.scala index 5678d790e9e7..4d884dec0791 100644 --- a/core/src/main/scala/org/apache/spark/ContextCleaner.scala +++ b/core/src/main/scala/org/apache/spark/ContextCleaner.scala @@ -18,7 +18,8 @@ package org.apache.spark import java.lang.ref.{ReferenceQueue, WeakReference} -import java.util.concurrent.{ConcurrentLinkedQueue, ScheduledExecutorService, TimeUnit} +import java.util.Collections +import java.util.concurrent.{ConcurrentHashMap, ConcurrentLinkedQueue, ScheduledExecutorService, TimeUnit} import scala.collection.JavaConverters._ @@ -58,7 +59,12 @@ private class CleanupTaskWeakReference( */ private[spark] class ContextCleaner(sc: SparkContext) extends Logging { - private val referenceBuffer = new ConcurrentLinkedQueue[CleanupTaskWeakReference]() + /** + * A buffer to ensure that `CleanupTaskWeakReference`s are not garbage collected as long as they + * have not been handled by the reference queue. 
+ */ + private val referenceBuffer = + Collections.newSetFromMap[CleanupTaskWeakReference](new ConcurrentHashMap) private val referenceQueue = new ReferenceQueue[AnyRef] @@ -139,7 +145,7 @@ private[spark] class ContextCleaner(sc: SparkContext) extends Logging { periodicGCService.shutdown() } - /** Register a RDD for cleanup when it is garbage collected. */ + /** Register an RDD for cleanup when it is garbage collected. */ def registerRDDForCleanup(rdd: RDD[_]): Unit = { registerForCleanup(rdd, CleanRDD(rdd.id)) } @@ -176,10 +182,10 @@ private[spark] class ContextCleaner(sc: SparkContext) extends Logging { .map(_.asInstanceOf[CleanupTaskWeakReference]) // Synchronize here to avoid being interrupted on stop() synchronized { - reference.map(_.task).foreach { task => - logDebug("Got cleaning task " + task) - referenceBuffer.remove(reference.get) - task match { + reference.foreach { ref => + logDebug("Got cleaning task " + ref.task) + referenceBuffer.remove(ref) + ref.task match { case CleanRDD(rddId) => doCleanupRDD(rddId, blocking = blockOnCleanupTasks) case CleanShuffle(shuffleId) => diff --git a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala index 1366251d0618..f054a78d5788 100644 --- a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala +++ b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala @@ -331,7 +331,7 @@ private[spark] class ExecutorAllocationManager( val delta = addExecutors(maxNeeded) logDebug(s"Starting timer to add more executors (to " + s"expire in $sustainedSchedulerBacklogTimeoutS seconds)") - addTime += sustainedSchedulerBacklogTimeoutS * 1000 + addTime = now + (sustainedSchedulerBacklogTimeoutS * 1000) delta } else { 0 diff --git a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala index 7f8f0f513134..6f5c31d7ab71 100644 --- a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala +++ b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala @@ -322,7 +322,7 @@ private[spark] class MapOutputTrackerMaster(conf: SparkConf, if (minSizeForBroadcast > maxRpcMessageSize) { val msg = s"spark.shuffle.mapOutput.minSizeForBroadcast ($minSizeForBroadcast bytes) must " + s"be <= spark.rpc.message.maxSize ($maxRpcMessageSize bytes) to prevent sending an rpc " + - "message that is to large." + "message that is too large." logError(msg) throw new IllegalArgumentException(msg) } diff --git a/core/src/main/scala/org/apache/spark/Partitioner.scala b/core/src/main/scala/org/apache/spark/Partitioner.scala index 93dfbc0e6ed6..f83f5278e8b8 100644 --- a/core/src/main/scala/org/apache/spark/Partitioner.scala +++ b/core/src/main/scala/org/apache/spark/Partitioner.scala @@ -101,7 +101,7 @@ class HashPartitioner(partitions: Int) extends Partitioner { * A [[org.apache.spark.Partitioner]] that partitions sortable records by range into roughly * equal ranges. The ranges are determined by sampling the content of the RDD passed in. * - * Note that the actual number of partitions created by the RangePartitioner might not be the same + * @note The actual number of partitions created by the RangePartitioner might not be the same * as the `partitions` parameter, in the case where the number of sampled records is less than * the value of `partitions`. 
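Because `RangePartitioner` picks its bounds by sampling, the note above is easy to reproduce: with only a handful of distinct keys, requesting ten partitions yields fewer. A hedged sketch (local master, object name, and key values are illustrative):

```scala
import org.apache.spark.{RangePartitioner, SparkConf, SparkContext}

object RangePartitionerSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("range-sketch"))
    // Only three distinct keys: asking for 10 partitions cannot produce 10 useful ranges.
    val rdd = sc.parallelize(Seq(1 -> "a", 2 -> "b", 3 -> "c"))
    val partitioner = new RangePartitioner(10, rdd)
    // numPartitions may be less than the 10 requested, per the note above.
    println(partitioner.numPartitions)
    sc.stop()
  }
}
```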
*/ diff --git a/core/src/main/scala/org/apache/spark/SSLOptions.scala b/core/src/main/scala/org/apache/spark/SSLOptions.scala index be19179b00a4..5f14102c3c36 100644 --- a/core/src/main/scala/org/apache/spark/SSLOptions.scala +++ b/core/src/main/scala/org/apache/spark/SSLOptions.scala @@ -150,8 +150,8 @@ private[spark] object SSLOptions extends Logging { * $ - `[ns].enabledAlgorithms` - a comma separated list of ciphers * * For a list of protocols and ciphers supported by particular Java versions, you may go to - * [[https://blogs.oracle.com/java-platform-group/entry/diagnosing_tls_ssl_and_https Oracle - * blog page]]. + * <a href="https://blogs.oracle.com/java-platform-group/entry/diagnosing_tls_ssl_and_https"> + * Oracle blog page</a>. * * You can optionally specify the default configuration. If you do, for each setting which is * missing in SparkConf, the corresponding setting is used from the default configuration. diff --git a/core/src/main/scala/org/apache/spark/SecurityManager.scala b/core/src/main/scala/org/apache/spark/SecurityManager.scala index 199365ad925a..6e12cd16a3a5 100644 --- a/core/src/main/scala/org/apache/spark/SecurityManager.scala +++ b/core/src/main/scala/org/apache/spark/SecurityManager.scala @@ -21,7 +21,6 @@ import java.lang.{Byte => JByte} import java.net.{Authenticator, PasswordAuthentication} import java.security.{KeyStore, SecureRandom} import java.security.cert.X509Certificate -import javax.crypto.KeyGenerator import javax.net.ssl._ import com.google.common.hash.HashCodes @@ -33,7 +32,6 @@ import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ import org.apache.spark.network.sasl.SecretKeyHolder -import org.apache.spark.security.CryptoStreamUtils._ import org.apache.spark.util.Utils /** @@ -185,7 +183,9 @@ import org.apache.spark.util.Utils * setting `spark.ssl.useNodeLocalConf` to `true`. */ -private[spark] class SecurityManager(sparkConf: SparkConf) +private[spark] class SecurityManager( + sparkConf: SparkConf, + val ioEncryptionKey: Option[Array[Byte]] = None) extends Logging with SecretKeyHolder { import SecurityManager._ @@ -415,6 +415,8 @@ private[spark] class SecurityManager(sparkConf: SparkConf) logInfo("Changing acls enabled to: " + aclsOn) } + def getIOEncryptionKey(): Option[Array[Byte]] = ioEncryptionKey + /** * Generates or looks up the secret key. * @@ -559,19 +561,4 @@ private[spark] object SecurityManager { // key used to store the spark secret in the Hadoop UGI val SECRET_LOOKUP_KEY = "sparkCookie" - /** - * Setup the cryptographic key used by IO encryption in credentials. The key is generated using - * [[KeyGenerator]]. The algorithm and key length is specified by the [[SparkConf]]. - */ - def initIOEncryptionKey(conf: SparkConf, credentials: Credentials): Unit = { - if (credentials.getSecretKey(SPARK_IO_TOKEN) == null) { - val keyLen = conf.get(IO_ENCRYPTION_KEY_SIZE_BITS) - val ioKeyGenAlgorithm = conf.get(IO_ENCRYPTION_KEYGEN_ALGORITHM) - val keyGen = KeyGenerator.getInstance(ioKeyGenAlgorithm) - keyGen.init(keyLen) - - val ioKey = keyGen.generateKey() - credentials.addSecretKey(SPARK_IO_TOKEN, ioKey.getEncoded) - } - } } diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index c9c342df82c9..d78b9f1b2968 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -42,10 +42,10 @@ import org.apache.spark.util.Utils * All setter methods in this class support chaining.
For example, you can write * `new SparkConf().setMaster("local").setAppName("My app")`. * - * Note that once a SparkConf object is passed to Spark, it is cloned and can no longer be modified - * by the user. Spark does not support modifying the configuration at runtime. - * * @param loadDefaults whether to also load values from Java system properties + * + * @note Once a SparkConf object is passed to Spark, it is cloned and can no longer be modified + * by the user. Spark does not support modifying the configuration at runtime. */ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Serializable { @@ -262,7 +262,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria /** * Get a time parameter as seconds; throws a NoSuchElementException if it's not set. If no * suffix is provided then seconds are assumed. - * @throws NoSuchElementException + * @throws java.util.NoSuchElementException */ def getTimeAsSeconds(key: String): Long = { Utils.timeStringAsSeconds(get(key)) @@ -279,7 +279,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria /** * Get a time parameter as milliseconds; throws a NoSuchElementException if it's not set. If no * suffix is provided then milliseconds are assumed. - * @throws NoSuchElementException + * @throws java.util.NoSuchElementException */ def getTimeAsMs(key: String): Long = { Utils.timeStringAsMs(get(key)) @@ -296,7 +296,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria /** * Get a size parameter as bytes; throws a NoSuchElementException if it's not set. If no * suffix is provided then bytes are assumed. - * @throws NoSuchElementException + * @throws java.util.NoSuchElementException */ def getSizeAsBytes(key: String): Long = { Utils.byteStringAsBytes(get(key)) @@ -320,7 +320,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria /** * Get a size parameter as Kibibytes; throws a NoSuchElementException if it's not set. If no * suffix is provided then Kibibytes are assumed. - * @throws NoSuchElementException + * @throws java.util.NoSuchElementException */ def getSizeAsKb(key: String): Long = { Utils.byteStringAsKb(get(key)) @@ -337,7 +337,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria /** * Get a size parameter as Mebibytes; throws a NoSuchElementException if it's not set. If no * suffix is provided then Mebibytes are assumed. - * @throws NoSuchElementException + * @throws java.util.NoSuchElementException */ def getSizeAsMb(key: String): Long = { Utils.byteStringAsMb(get(key)) @@ -354,7 +354,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria /** * Get a size parameter as Gibibytes; throws a NoSuchElementException if it's not set. If no * suffix is provided then Gibibytes are assumed. 
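The suffix-aware getters above accept values such as `120s` or `2g` and normalize them to the unit named in the method. A small sketch (the config keys shown are just examples):

```scala
import org.apache.spark.SparkConf

object ConfSuffixSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .set("spark.network.timeout", "120s")    // time value with suffix
      .set("spark.driver.maxResultSize", "2g") // size value with suffix
    println(conf.getTimeAsMs("spark.network.timeout"))      // 120000
    println(conf.getSizeAsMb("spark.driver.maxResultSize")) // 2048
    // Reading a key that was never set throws java.util.NoSuchElementException.
  }
}
```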
- * @throws NoSuchElementException + * @throws java.util.NoSuchElementException */ def getSizeAsGb(key: String): Long = { Utils.byteStringAsGb(get(key)) @@ -378,7 +378,9 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria settings.entrySet().asScala.map(x => (x.getKey, x.getValue)).toArray } - /** Get all parameters that start with `prefix` */ + /** + * Get all parameters that start with `prefix` + */ def getAllWithPrefix(prefix: String): Array[(String, String)] = { getAll.filter { case (k, v) => k.startsWith(prefix) } .map { case (k, v) => (k.substring(prefix.length), v) } diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 4694790c72cd..cf03ae545635 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -19,7 +19,7 @@ package org.apache.spark import java.io._ import java.lang.reflect.Constructor -import java.net.{MalformedURLException, URI} +import java.net.{URI} import java.util.{Arrays, Locale, Properties, ServiceLoader, UUID} import java.util.concurrent.{ConcurrentHashMap, ConcurrentMap} import java.util.concurrent.atomic.{AtomicBoolean, AtomicInteger, AtomicReference} @@ -183,6 +183,8 @@ class SparkContext(config: SparkConf) extends Logging { // log out Spark Version in Spark driver log logInfo(s"Running Spark version $SPARK_VERSION") + warnDeprecatedVersions() + /* ------------------------------------------------------------------------------------- * | Private variables. These variables keep the internal state of the context, and are | | not accessible by the outside world. They're mutable since we want to initialize all | @@ -279,7 +281,7 @@ class SparkContext(config: SparkConf) extends Logging { /** * A default Hadoop Configuration for the Hadoop code (e.g. file systems) that we reuse. * - * '''Note:''' As it will be reused in all Hadoop RDDs, it's better not to modify it unless you + * @note As it will be reused in all Hadoop RDDs, it's better not to modify it unless you * plan to set some global configurations for all Hadoop RDDs. */ def hadoopConfiguration: Configuration = _hadoopConfiguration @@ -346,6 +348,16 @@ class SparkContext(config: SparkConf) extends Logging { value } + private def warnDeprecatedVersions(): Unit = { + val javaVersion = System.getProperty("java.version").split("[+.\\-]+", 3) + if (javaVersion.length >= 2 && javaVersion(1).toInt == 7) { + logWarning("Support for Java 7 is deprecated as of Spark 2.0.0") + } + if (scala.util.Properties.releaseVersion.exists(_.startsWith("2.10"))) { + logWarning("Support for Scala 2.10 is deprecated as of Spark 2.1.0") + } + } + /** Control our logLevel. This overrides any user-defined log settings. * @param logLevel The desired log level as a string. * Valid log levels include: ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN @@ -410,10 +422,6 @@ class SparkContext(config: SparkConf) extends Logging { } if (master == "yarn" && deployMode == "client") System.setProperty("SPARK_YARN_MODE", "true") - if (_conf.get(IO_ENCRYPTION_ENABLED) && !SparkHadoopUtil.get.isYarnMode()) { - throw new SparkException("IO encryption is only supported in YARN mode, please disable it " + - s"by setting ${IO_ENCRYPTION_ENABLED.key} to false") - } // "_jobProgressListener" should be set up before creating SparkEnv because when creating // "SparkEnv", some messages will be posted to "listenerBus" and we should not miss them. 
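The `warnDeprecatedVersions` helper above splits `java.version` on `+`, `.`, or `-` with a limit of 3 and inspects the second component. A standalone sketch of that parsing (the version strings are illustrative):

```scala
object JavaVersionSketch {
  def main(args: Array[String]): Unit = {
    // Same split as warnDeprecatedVersions: limit 3 keeps any build suffix intact.
    def isJava7(version: String): Boolean = {
      val parts = version.split("[+.\\-]+", 3)
      parts.length >= 2 && parts(1).toInt == 7
    }
    println(isJava7("1.7.0_80"))  // true  -> Array("1", "7", "0_80")
    println(isJava7("1.8.0_112")) // false -> Array("1", "8", "0_112")
  }
}
```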
@@ -633,7 +641,7 @@ class SparkContext(config: SparkConf) extends Logging { /** * Get a local property set in this thread, or null if it is missing. See - * [[org.apache.spark.SparkContext.setLocalProperty]]. + * `org.apache.spark.SparkContext.setLocalProperty`. */ def getLocalProperty(key: String): String = Option(localProperties.get).map(_.getProperty(key)).orNull @@ -651,7 +659,7 @@ class SparkContext(config: SparkConf) extends Logging { * Application programmers can use this method to group all those jobs together and give a * group description. Once set, the Spark web UI will associate such jobs with this group. * - * The application can also use [[org.apache.spark.SparkContext.cancelJobGroup]] to cancel all + * The application can also use `org.apache.spark.SparkContext.cancelJobGroup` to cancel all * running jobs in this group. For example, * {{{ * // In the main thread: @@ -688,7 +696,7 @@ class SparkContext(config: SparkConf) extends Logging { * Execute a block of code in a scope such that all new RDDs created in this body will * be part of the same scope. For more detail, see {{org.apache.spark.rdd.RDDOperationScope}}. * - * Note: Return statements are NOT allowed in the given body. + * @note Return statements are NOT allowed in the given body. */ private[spark] def withScope[U](body: => U): U = RDDOperationScope.withScope[U](this)(body) @@ -915,7 +923,7 @@ class SparkContext(config: SparkConf) extends Logging { /** * Load data from a flat binary file, assuming the length of each record is constant. * - * '''Note:''' We ensure that the byte array for each record in the resulting RDD + * @note We ensure that the byte array for each record in the resulting RDD * has the provided record length. * * @param path Directory to the input data files, the path can be comma separated paths as the @@ -936,12 +944,11 @@ class SparkContext(config: SparkConf) extends Logging { classOf[LongWritable], classOf[BytesWritable], conf = conf) - val data = br.map { case (k, v) => - val bytes = v.getBytes + br.map { case (k, v) => + val bytes = v.copyBytes() assert(bytes.length == recordLength, "Byte array does not have correct length") bytes } - data } /** @@ -958,7 +965,7 @@ class SparkContext(config: SparkConf) extends Logging { * @param valueClass Class of the values * @param minPartitions Minimum number of Hadoop Splits to generate. * - * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each + * @note Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD or directly passing it to an aggregation or shuffle * operation will create many references to the same object. * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first @@ -983,7 +990,7 @@ class SparkContext(config: SparkConf) extends Logging { /** Get an RDD for a Hadoop file with an arbitrary InputFormat * - * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each + * @note Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD or directly passing it to an aggregation or shuffle * operation will create many references to the same object. 
* If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first @@ -1022,7 +1029,7 @@ class SparkContext(config: SparkConf) extends Logging { * val file = sparkContext.hadoopFile[LongWritable, Text, TextInputFormat](path, minPartitions) * }}} * - * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each + * @note Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD or directly passing it to an aggregation or shuffle * operation will create many references to the same object. * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first @@ -1046,7 +1053,7 @@ class SparkContext(config: SparkConf) extends Logging { * val file = sparkContext.hadoopFile[LongWritable, Text, TextInputFormat](path) * }}} * - * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each + * @note Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD or directly passing it to an aggregation or shuffle * operation will create many references to the same object. * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first @@ -1072,7 +1079,7 @@ class SparkContext(config: SparkConf) extends Logging { * Get an RDD for a given Hadoop file with an arbitrary new API InputFormat * and extra configuration options to pass to the input format. * - * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each + * @note Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD or directly passing it to an aggregation or shuffle * operation will create many references to the same object. * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first @@ -1112,7 +1119,7 @@ class SparkContext(config: SparkConf) extends Logging { * @param kClass Class of the keys * @param vClass Class of the values * - * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each + * @note Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD or directly passing it to an aggregation or shuffle * operation will create many references to the same object. * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first @@ -1138,7 +1145,7 @@ class SparkContext(config: SparkConf) extends Logging { /** * Get an RDD for a Hadoop SequenceFile with given key and value types. * - * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each + * @note Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD or directly passing it to an aggregation or shuffle * operation will create many references to the same object. * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first @@ -1157,7 +1164,7 @@ class SparkContext(config: SparkConf) extends Logging { /** * Get an RDD for a Hadoop SequenceFile with given key and value types. 
* - * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each + * @note Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD or directly passing it to an aggregation or shuffle * operation will create many references to the same object. * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first @@ -1187,7 +1194,7 @@ class SparkContext(config: SparkConf) extends Logging { * for the appropriate type. In addition, we pass the converter a ClassTag of its type to * allow it to figure out the Writable class to use in the subclass case. * - * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each + * @note Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD or directly passing it to an aggregation or shuffle * operation will create many references to the same object. * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first @@ -1268,7 +1275,7 @@ class SparkContext(config: SparkConf) extends Logging { @deprecated("use AccumulatorV2", "2.0.0") def accumulator[T](initialValue: T, name: String)(implicit param: AccumulatorParam[T]) : Accumulator[T] = { - val acc = new Accumulator(initialValue, param, Some(name)) + val acc = new Accumulator(initialValue, param, Option(name)) cleaner.foreach(_.registerAccumulatorForCleanup(acc.newAcc)) acc } @@ -1297,7 +1304,7 @@ class SparkContext(config: SparkConf) extends Logging { @deprecated("use AccumulatorV2", "2.0.0") def accumulable[R, T](initialValue: R, name: String)(implicit param: AccumulableParam[R, T]) : Accumulable[R, T] = { - val acc = new Accumulable(initialValue, param, Some(name)) + val acc = new Accumulable(initialValue, param, Option(name)) cleaner.foreach(_.registerAccumulatorForCleanup(acc.newAcc)) acc } @@ -1318,19 +1325,21 @@ class SparkContext(config: SparkConf) extends Logging { } /** - * Register the given accumulator. Note that accumulators must be registered before use, or it - * will throw exception. + * Register the given accumulator. + * + * @note Accumulators must be registered before use, or it will throw exception. */ def register(acc: AccumulatorV2[_, _]): Unit = { acc.register(this) } /** - * Register the given accumulator with given name. Note that accumulators must be registered - * before use, or it will throw exception. + * Register the given accumulator with given name. + * + * @note Accumulators must be registered before use, or it will throw exception. */ def register(acc: AccumulatorV2[_, _], name: String): Unit = { - acc.register(this, name = Some(name)) + acc.register(this, name = Option(name)) } /** @@ -1370,7 +1379,7 @@ class SparkContext(config: SparkConf) extends Logging { } /** - * Create and register a [[CollectionAccumulator]], which starts with empty list and accumulates + * Create and register a `CollectionAccumulator`, which starts with empty list and accumulates * inputs by adding them into the list. */ def collectionAccumulator[T]: CollectionAccumulator[T] = { @@ -1380,7 +1389,7 @@ class SparkContext(config: SparkConf) extends Logging { } /** - * Create and register a [[CollectionAccumulator]], which starts with empty list and accumulates + * Create and register a `CollectionAccumulator`, which starts with empty list and accumulates * inputs by adding them into the list. 
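A short usage sketch of the `collectionAccumulator` described above (local master and the accumulator name are illustrative):

```scala
import org.apache.spark.{SparkConf, SparkContext}

object CollectionAccumulatorSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("acc-sketch"))
    // Starts as an empty list; tasks add elements, only the driver reads the value.
    val badRecords = sc.collectionAccumulator[String]("badRecords")
    sc.parallelize(Seq("ok", "bad:1", "ok", "bad:2"))
      .foreach(r => if (r.startsWith("bad")) badRecords.add(r))
    println(badRecords.value) // e.g. [bad:1, bad:2]
    sc.stop()
  }
}
```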
*/ def collectionAccumulator[T](name: String): CollectionAccumulator[T] = { @@ -1538,7 +1547,7 @@ class SparkContext(config: SparkConf) extends Logging { * :: DeveloperApi :: * Request that the cluster manager kill the specified executors. * - * Note: This is an indication to the cluster manager that the application wishes to adjust + * @note This is an indication to the cluster manager that the application wishes to adjust * its resource usage downwards. If the application wishes to replace the executors it kills * through this method with new ones, it should follow up explicitly with a call to * {{SparkContext#requestExecutors}}. @@ -1560,7 +1569,7 @@ class SparkContext(config: SparkConf) extends Logging { * :: DeveloperApi :: * Request that the cluster manager kill the specified executor. * - * Note: This is an indication to the cluster manager that the application wishes to adjust + * @note This is an indication to the cluster manager that the application wishes to adjust * its resource usage downwards. If the application wishes to replace the executor it kills * through this method with a new one, it should follow up explicitly with a call to * {{SparkContext#requestExecutors}}. @@ -1578,7 +1587,7 @@ class SparkContext(config: SparkConf) extends Logging { * this request. This assumes the cluster manager will automatically and eventually * fulfill all missing application resource requests. * - * Note: The replace is by no means guaranteed; another application on the same cluster + * @note The replace is by no means guaranteed; another application on the same cluster * can steal the window of opportunity and acquire this application's resources in the * mean time. * @@ -1627,7 +1636,8 @@ class SparkContext(config: SparkConf) extends Logging { /** * Returns an immutable map of RDDs that have marked themselves as persistent via cache() call. - * Note that this does not necessarily mean the caching or computation was successful. + * + * @note This does not necessarily mean the caching or computation was successful. */ def getPersistentRDDs: Map[Int, RDD[_]] = persistentRdds.toMap @@ -1716,29 +1726,20 @@ class SparkContext(config: SparkConf) extends Logging { key = uri.getScheme match { // A JAR file which exists only on the driver node case null | "file" => - if (master == "yarn" && deployMode == "cluster") { - // In order for this to work in yarn cluster mode the user must specify the - // --addJars option to the client to upload the file into the distributed cache - // of the AM to make it show up in the current working directory. - val fileName = new Path(uri.getPath).getName() - try { - env.rpcEnv.fileServer.addJar(new File(fileName)) - } catch { - case e: Exception => - // For now just log an error but allow to go through so spark examples work. - // The spark examples don't really need the jar distributed since its also - // the app jar. 
- logError("Error adding jar (" + e + "), was the --addJars option used?") - null + try { + val file = new File(uri.getPath) + if (!file.exists()) { + throw new FileNotFoundException(s"Jar ${file.getAbsolutePath} not found") } - } else { - try { - env.rpcEnv.fileServer.addJar(new File(uri.getPath)) - } catch { - case exc: FileNotFoundException => - logError(s"Jar not found at $path") - null + if (file.isDirectory) { + throw new IllegalArgumentException( + s"Directory ${file.getAbsoluteFile} is not allowed for addJar") } + env.rpcEnv.fileServer.addJar(new File(uri.getPath)) + } catch { + case NonFatal(e) => + logError(s"Failed to add $path to Spark environment", e) + null } // A JAR file which exists locally on every worker node case "local" => @@ -1762,8 +1763,31 @@ class SparkContext(config: SparkConf) extends Logging { */ def listJars(): Seq[String] = addedJars.keySet.toSeq - // Shut down the SparkContext. - def stop() { + /** + * When stopping SparkContext inside Spark components, it's easy to cause dead-lock since Spark + * may wait for some internal threads to finish. It's better to use this method to stop + * SparkContext instead. + */ + private[spark] def stopInNewThread(): Unit = { + new Thread("stop-spark-context") { + setDaemon(true) + + override def run(): Unit = { + try { + SparkContext.this.stop() + } catch { + case e: Throwable => + logError(e.getMessage, e) + throw e + } + } + }.start() + } + + /** + * Shut down the SparkContext. + */ + def stop(): Unit = { if (LiveListenerBus.withinListenerThread.value) { throw new SparkException( s"Cannot stop SparkContext within listener thread of ${LiveListenerBus.name}") @@ -1826,6 +1850,9 @@ class SparkContext(config: SparkConf) extends Logging { } SparkEnv.set(null) } + // Clear this `InheritableThreadLocal`, or it will still be inherited in child threads even this + // `SparkContext` is stopped. + localProperties.remove() // Unset YARN mode system env variable, to allow switching between cluster types. System.clearProperty("SPARK_YARN_MODE") SparkContext.clearActiveContext() @@ -2027,7 +2054,7 @@ class SparkContext(config: SparkConf) extends Logging { } /** - * Cancel active jobs for the specified group. See [[org.apache.spark.SparkContext.setJobGroup]] + * Cancel active jobs for the specified group. See `org.apache.spark.SparkContext.setJobGroup` * for more information. */ def cancelJobGroup(groupId: String) { @@ -2045,7 +2072,7 @@ class SparkContext(config: SparkConf) extends Logging { * Cancel a given job if it's scheduled or running. * * @param jobId the job ID to cancel - * @throws InterruptedException if the cancel message cannot be sent + * @note Throws `InterruptedException` if the cancel message cannot be sent */ def cancelJob(jobId: Int) { dagScheduler.cancelJob(jobId) @@ -2055,7 +2082,7 @@ class SparkContext(config: SparkConf) extends Logging { * Cancel a given stage and all jobs associated with it. * * @param stageId the stage ID to cancel - * @throws InterruptedException if the cancel message cannot be sent + * @note Throws `InterruptedException` if the cancel message cannot be sent */ def cancelStage(stageId: Int) { dagScheduler.cancelStage(stageId) @@ -2285,7 +2312,7 @@ object SparkContext extends Logging { * singleton object. Because we can only have one active SparkContext per JVM, * this is useful when applications may wish to share a SparkContext. 
* - * Note: This function cannot be used to create multiple SparkContext instances + * @note This function cannot be used to create multiple SparkContext instances * even if multiple contexts are allowed. */ def getOrCreate(config: SparkConf): SparkContext = { @@ -2310,7 +2337,7 @@ object SparkContext extends Logging { * * This method allows not passing a SparkConf (useful if just retrieving). * - * Note: This function cannot be used to create multiple SparkContext instances + * @note This function cannot be used to create multiple SparkContext instances * even if multiple contexts are allowed. */ def getOrCreate(): SparkContext = { @@ -2322,6 +2349,13 @@ object SparkContext extends Logging { } } + /** Return the current active [[SparkContext]] if any. */ + private[spark] def getActive: Option[SparkContext] = { + SPARK_CONTEXT_CONSTRUCTOR_LOCK.synchronized { + Option(activeContext.get()) + } + } + /** * Called at the beginning of the SparkContext constructor to ensure that no SparkContext is * running. Throws an exception if a running context is detected and logs a warning if another @@ -2550,8 +2584,8 @@ object SparkContext extends Logging { val serviceLoaders = ServiceLoader.load(classOf[ExternalClusterManager], loader).asScala.filter(_.canCreate(url)) if (serviceLoaders.size > 1) { - throw new SparkException(s"Multiple Cluster Managers ($serviceLoaders) registered " + - s"for the url $url:") + throw new SparkException( + s"Multiple external cluster managers registered for the url $url: $serviceLoaders") } serviceLoaders.headOption } diff --git a/core/src/main/scala/org/apache/spark/SparkEnv.scala b/core/src/main/scala/org/apache/spark/SparkEnv.scala index 1ffeb129880f..1296386ac9bd 100644 --- a/core/src/main/scala/org/apache/spark/SparkEnv.scala +++ b/core/src/main/scala/org/apache/spark/SparkEnv.scala @@ -36,6 +36,7 @@ import org.apache.spark.network.netty.NettyBlockTransferService import org.apache.spark.rpc.{RpcEndpoint, RpcEndpointRef, RpcEnv} import org.apache.spark.scheduler.{LiveListenerBus, OutputCommitCoordinator} import org.apache.spark.scheduler.OutputCommitCoordinator.OutputCommitCoordinatorEndpoint +import org.apache.spark.security.CryptoStreamUtils import org.apache.spark.serializer.{JavaSerializer, Serializer, SerializerManager} import org.apache.spark.shuffle.ShuffleManager import org.apache.spark.storage._ @@ -165,15 +166,20 @@ object SparkEnv extends Logging { val bindAddress = conf.get(DRIVER_BIND_ADDRESS) val advertiseAddress = conf.get(DRIVER_HOST_ADDRESS) val port = conf.get("spark.driver.port").toInt + val ioEncryptionKey = if (conf.get(IO_ENCRYPTION_ENABLED)) { + Some(CryptoStreamUtils.createKey(conf)) + } else { + None + } create( conf, SparkContext.DRIVER_IDENTIFIER, bindAddress, advertiseAddress, port, - isDriver = true, - isLocal = isLocal, - numUsableCores = numCores, + isLocal, + numCores, + ioEncryptionKey, listenerBus = listenerBus, mockOutputCommitCoordinator = mockOutputCommitCoordinator ) @@ -189,6 +195,7 @@ object SparkEnv extends Logging { hostname: String, port: Int, numCores: Int, + ioEncryptionKey: Option[Array[Byte]], isLocal: Boolean): SparkEnv = { val env = create( conf, @@ -196,9 +203,9 @@ object SparkEnv extends Logging { hostname, hostname, port, - isDriver = false, - isLocal = isLocal, - numUsableCores = numCores + isLocal, + numCores, + ioEncryptionKey ) SparkEnv.set(env) env @@ -213,18 +220,26 @@ object SparkEnv extends Logging { bindAddress: String, advertiseAddress: String, port: Int, - isDriver: Boolean, isLocal: Boolean, 
numUsableCores: Int, + ioEncryptionKey: Option[Array[Byte]], listenerBus: LiveListenerBus = null, mockOutputCommitCoordinator: Option[OutputCommitCoordinator] = None): SparkEnv = { + val isDriver = executorId == SparkContext.DRIVER_IDENTIFIER + // Listener bus is only used on the driver if (isDriver) { assert(listenerBus != null, "Attempted to create driver SparkEnv with null listener bus!") } - val securityManager = new SecurityManager(conf) + val securityManager = new SecurityManager(conf, ioEncryptionKey) + ioEncryptionKey.foreach { _ => + if (!securityManager.isSaslEncryptionEnabled()) { + logWarning("I/O encryption enabled without RPC encryption: keys will be visible on the " + + "wire.") + } + } val systemName = if (isDriver) driverSystemName else executorSystemName val rpcEnv = RpcEnv.create(systemName, bindAddress, advertiseAddress, port, conf, @@ -270,7 +285,7 @@ object SparkEnv extends Logging { "spark.serializer", "org.apache.spark.serializer.JavaSerializer") logDebug(s"Using serializer: ${serializer.getClass}") - val serializerManager = new SerializerManager(serializer, conf) + val serializerManager = new SerializerManager(serializer, conf, ioEncryptionKey) val closureSerializer = new JavaSerializer(conf) diff --git a/core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala b/core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala index 6550d703bc86..7f75a393bf8f 100644 --- a/core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala +++ b/core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala @@ -20,7 +20,7 @@ package org.apache.spark import java.io.IOException import java.text.NumberFormat import java.text.SimpleDateFormat -import java.util.Date +import java.util.{Date, Locale} import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.fs.Path @@ -67,12 +67,12 @@ class SparkHadoopWriter(jobConf: JobConf) extends Logging with Serializable { def setup(jobid: Int, splitid: Int, attemptid: Int) { setIDs(jobid, splitid, attemptid) - HadoopRDD.addLocalConfiguration(new SimpleDateFormat("yyyyMMddHHmmss").format(now), + HadoopRDD.addLocalConfiguration(new SimpleDateFormat("yyyyMMddHHmmss", Locale.US).format(now), jobid, splitID, attemptID, conf.value) } def open() { - val numfmt = NumberFormat.getInstance() + val numfmt = NumberFormat.getInstance(Locale.US) numfmt.setMinimumIntegerDigits(5) numfmt.setGroupingUsed(false) @@ -162,7 +162,7 @@ class SparkHadoopWriter(jobConf: JobConf) extends Logging with Serializable { private[spark] object SparkHadoopWriter { def createJobID(time: Date, id: Int): JobID = { - val formatter = new SimpleDateFormat("yyyyMMddHHmmss") + val formatter = new SimpleDateFormat("yyyyMMddHHmmss", Locale.US) val jobtrackerID = formatter.format(time) new JobID(jobtrackerID, id) } diff --git a/core/src/main/scala/org/apache/spark/SparkStatusTracker.scala b/core/src/main/scala/org/apache/spark/SparkStatusTracker.scala index 52c4656c271b..22a553e68439 100644 --- a/core/src/main/scala/org/apache/spark/SparkStatusTracker.scala +++ b/core/src/main/scala/org/apache/spark/SparkStatusTracker.scala @@ -112,7 +112,7 @@ class SparkStatusTracker private[spark] (sc: SparkContext) { */ def getExecutorInfos: Array[SparkExecutorInfo] = { val executorIdToRunningTasks: Map[String, Int] = - sc.taskScheduler.asInstanceOf[TaskSchedulerImpl].runningTasksByExecutors() + sc.taskScheduler.asInstanceOf[TaskSchedulerImpl].runningTasksByExecutors sc.getExecutorStorageStatus.map { status => val bmId = status.blockManagerId diff --git 
a/core/src/main/scala/org/apache/spark/TaskContext.scala b/core/src/main/scala/org/apache/spark/TaskContext.scala index 27abccf5ac2a..0fd777ed1282 100644 --- a/core/src/main/scala/org/apache/spark/TaskContext.scala +++ b/core/src/main/scala/org/apache/spark/TaskContext.scala @@ -164,7 +164,7 @@ abstract class TaskContext extends Serializable { /** * Get a local property set upstream in the driver, or null if it is missing. See also - * [[org.apache.spark.SparkContext.setLocalProperty]]. + * `org.apache.spark.SparkContext.setLocalProperty`. */ def getLocalProperty(key: String): String @@ -174,7 +174,7 @@ abstract class TaskContext extends Serializable { /** * ::DeveloperApi:: * Returns all metrics sources with the given name which are associated with the instance - * which runs the task. For more information see [[org.apache.spark.metrics.MetricsSystem!]]. + * which runs the task. For more information see `org.apache.spark.metrics.MetricsSystem`. */ @DeveloperApi def getMetricsSources(sourceName: String): Seq[Source] diff --git a/core/src/main/scala/org/apache/spark/TaskEndReason.scala b/core/src/main/scala/org/apache/spark/TaskEndReason.scala index 7ca3c103dbf5..7745387dbceb 100644 --- a/core/src/main/scala/org/apache/spark/TaskEndReason.scala +++ b/core/src/main/scala/org/apache/spark/TaskEndReason.scala @@ -65,7 +65,7 @@ sealed trait TaskFailedReason extends TaskEndReason { /** * :: DeveloperApi :: - * A [[org.apache.spark.scheduler.ShuffleMapTask]] that completed successfully earlier, but we + * A `org.apache.spark.scheduler.ShuffleMapTask` that completed successfully earlier, but we * lost the executor before the stage completed. This means Spark needs to reschedule the task * to be re-executed on a different executor. */ diff --git a/core/src/main/scala/org/apache/spark/TestUtils.scala b/core/src/main/scala/org/apache/spark/TestUtils.scala index 871b9d1ad575..5cdc4eeeccbc 100644 --- a/core/src/main/scala/org/apache/spark/TestUtils.scala +++ b/core/src/main/scala/org/apache/spark/TestUtils.scala @@ -18,19 +18,23 @@ package org.apache.spark import java.io.{ByteArrayInputStream, File, FileInputStream, FileOutputStream} -import java.net.{URI, URL} +import java.net.{HttpURLConnection, URI, URL} import java.nio.charset.StandardCharsets import java.nio.file.Paths +import java.security.SecureRandom +import java.security.cert.X509Certificate import java.util.Arrays import java.util.concurrent.{CountDownLatch, TimeUnit} import java.util.jar.{JarEntry, JarOutputStream} +import javax.net.ssl._ +import javax.servlet.http.HttpServletResponse +import javax.tools.{JavaFileObject, SimpleJavaFileObject, ToolProvider} import scala.collection.JavaConverters._ import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import com.google.common.io.{ByteStreams, Files} -import javax.tools.{JavaFileObject, SimpleJavaFileObject, ToolProvider} import org.apache.spark.executor.TaskMetrics import org.apache.spark.scheduler._ @@ -182,11 +186,60 @@ private[spark] object TestUtils { assert(spillListener.numSpilledStages == 0, s"expected $identifier to not spill, but did") } + /** + * Returns the response code and url (if redirected) from an HTTP(S) URL. 
+ */ + def httpResponseCodeAndURL( + url: URL, + method: String = "GET", + headers: Seq[(String, String)] = Nil): (Int, Option[String]) = { + val connection = url.openConnection().asInstanceOf[HttpURLConnection] + connection.setRequestMethod(method) + headers.foreach { case (k, v) => connection.setRequestProperty(k, v) } + + // Disable cert and host name validation for HTTPS tests. + if (connection.isInstanceOf[HttpsURLConnection]) { + val sslCtx = SSLContext.getInstance("SSL") + val trustManager = new X509TrustManager { + override def getAcceptedIssuers(): Array[X509Certificate] = null + override def checkClientTrusted(x509Certificates: Array[X509Certificate], s: String) {} + override def checkServerTrusted(x509Certificates: Array[X509Certificate], s: String) {} + } + val verifier = new HostnameVerifier() { + override def verify(hostname: String, session: SSLSession): Boolean = true + } + sslCtx.init(null, Array(trustManager), new SecureRandom()) + connection.asInstanceOf[HttpsURLConnection].setSSLSocketFactory(sslCtx.getSocketFactory()) + connection.asInstanceOf[HttpsURLConnection].setHostnameVerifier(verifier) + connection.setInstanceFollowRedirects(false) + } + + try { + connection.connect() + if (connection.getResponseCode == HttpServletResponse.SC_FOUND) { + (connection.getResponseCode, Option(connection.getHeaderField("Location"))) + } else { + (connection.getResponseCode(), None) + } + } finally { + connection.disconnect() + } + } + + /** + * Returns the response code from an HTTP(S) URL. + */ + def httpResponseCode( + url: URL, + method: String = "GET", + headers: Seq[(String, String)] = Nil): Int = { + httpResponseCodeAndURL(url, method, headers)._1 + } } /** - * A [[SparkListener]] that detects whether spills have occurred in Spark jobs. + * A `SparkListener` that detects whether spills have occurred in Spark jobs. */ private class SpillListener extends SparkListener { private val stageIdToTaskMetrics = new mutable.HashMap[Int, ArrayBuffer[TaskMetrics]] diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaDoubleRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaDoubleRDD.scala index 0026fc9dad51..b71af0d42cdb 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaDoubleRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaDoubleRDD.scala @@ -45,7 +45,9 @@ class JavaDoubleRDD(val srdd: RDD[scala.Double]) import JavaDoubleRDD.fromRDD - /** Persist this RDD with the default storage level (`MEMORY_ONLY`). */ + /** + * Persist this RDD with the default storage level (`MEMORY_ONLY`). + */ def cache(): JavaDoubleRDD = fromRDD(srdd.cache()) /** @@ -153,7 +155,7 @@ class JavaDoubleRDD(val srdd: RDD[scala.Double]) * Return the intersection of this RDD and another one. The output will not contain any duplicate * elements, even if the input RDDs did. * - * Note that this method performs a shuffle internally. + * @note This method performs a shuffle internally. */ def intersection(other: JavaDoubleRDD): JavaDoubleRDD = fromRDD(srdd.intersection(other.srdd)) @@ -256,7 +258,7 @@ class JavaDoubleRDD(val srdd: RDD[scala.Double]) * e.g 1<=x<10 , 10<=x<20, 20<=x<50 * And on the input of 1 and 50 we would have a histogram of 1,0,0 * - * Note: if your histogram is evenly spaced (e.g. [0, 10, 20, 30]) this can be switched + * @note If your histogram is evenly spaced (e.g. [0, 10, 20, 30]) this can be switched * from an O(log n) insertion to O(1) per element. (where n = # buckets) if you set evenBuckets * to true. 
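The even-buckets fast path described above can be exercised through `histogram` on an RDD of doubles; a hedged sketch (local master and values are illustrative):

```scala
import org.apache.spark.{SparkConf, SparkContext}

object HistogramSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("hist-sketch"))
    val values = sc.parallelize(Seq(1.0, 5.0, 15.0, 25.0, 29.0))
    // Evenly spaced buckets [0,10), [10,20), [20,30]: evenBuckets = true enables
    // O(1) bucket lookup per element instead of a binary search.
    val counts = values.histogram(Array(0.0, 10.0, 20.0, 30.0), evenBuckets = true)
    println(counts.mkString(", ")) // 2, 1, 2
    sc.stop()
  }
}
```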
* buckets must be sorted and not contain any duplicates. diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala index 1c95bc4bfcaa..766aea213a97 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala @@ -54,7 +54,9 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) // Common RDD functions - /** Persist this RDD with the default storage level (`MEMORY_ONLY`). */ + /** + * Persist this RDD with the default storage level (`MEMORY_ONLY`). + */ def cache(): JavaPairRDD[K, V] = new JavaPairRDD[K, V](rdd.cache()) /** @@ -206,7 +208,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) * Return the intersection of this RDD and another one. The output will not contain any duplicate * elements, even if the input RDDs did. * - * Note that this method performs a shuffle internally. + * @note This method performs a shuffle internally. */ def intersection(other: JavaPairRDD[K, V]): JavaPairRDD[K, V] = new JavaPairRDD[K, V](rdd.intersection(other.rdd)) @@ -223,9 +225,9 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) /** * Generic function to combine the elements for each key using a custom set of aggregation * functions. Turns a JavaPairRDD[(K, V)] into a result of type JavaPairRDD[(K, C)], for a - * "combined type" C. Note that V and C can be different -- for example, one might group an - * RDD of type (Int, Int) into an RDD of type (Int, List[Int]). Users provide three - * functions: + * "combined type" C. + * + * Users provide three functions: * * - `createCombiner`, which turns a V into a C (e.g., creates a one-element list) * - `mergeValue`, to merge a V into a C (e.g., adds it to the end of a list) @@ -234,6 +236,9 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) * In addition, users can control the partitioning of the output RDD, the serializer that is use * for the shuffle, and whether to perform map-side aggregation (if a mapper can produce multiple * items with the same key). + * + * @note V and C can be different -- for example, one might group an RDD of type (Int, Int) into + * an RDD of type (Int, List[Int]). */ def combineByKey[C](createCombiner: JFunction[V, C], mergeValue: JFunction2[C, V, C], @@ -255,9 +260,9 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) /** * Generic function to combine the elements for each key using a custom set of aggregation * functions. Turns a JavaPairRDD[(K, V)] into a result of type JavaPairRDD[(K, C)], for a - * "combined type" C. Note that V and C can be different -- for example, one might group an - * RDD of type (Int, Int) into an RDD of type (Int, List[Int]). Users provide three - * functions: + * "combined type" C. + * + * Users provide three functions: * * - `createCombiner`, which turns a V into a C (e.g., creates a one-element list) * - `mergeValue`, to merge a V into a C (e.g., adds it to the end of a list) @@ -265,6 +270,9 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) * * In addition, users can control the partitioning of the output RDD. This method automatically * uses map-side aggregation in shuffling the RDD. + * + * @note V and C can be different -- for example, one might group an RDD of type (Int, Int) into + * an RDD of type (Int, List[Int]). */ def combineByKey[C](createCombiner: JFunction[V, C], mergeValue: JFunction2[C, V, C], @@ -398,8 +406,8 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) * Group the values for each key in the RDD into a single sequence. 
Allows controlling the * partitioning of the resulting key-value pair RDD by passing a Partitioner. * - * Note: If you are grouping in order to perform an aggregation (such as a sum or average) over - * each key, using [[JavaPairRDD.reduceByKey]] or [[JavaPairRDD.combineByKey]] + * @note If you are grouping in order to perform an aggregation (such as a sum or average) over + * each key, using `JavaPairRDD.reduceByKey` or `JavaPairRDD.combineByKey` * will provide much better performance. */ def groupByKey(partitioner: Partitioner): JavaPairRDD[K, JIterable[V]] = @@ -409,8 +417,8 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) * Group the values for each key in the RDD into a single sequence. Hash-partitions the * resulting RDD with into `numPartitions` partitions. * - * Note: If you are grouping in order to perform an aggregation (such as a sum or average) over - * each key, using [[JavaPairRDD.reduceByKey]] or [[JavaPairRDD.combineByKey]] + * @note If you are grouping in order to perform an aggregation (such as a sum or average) over + * each key, using `JavaPairRDD.reduceByKey` or `JavaPairRDD.combineByKey` * will provide much better performance. */ def groupByKey(numPartitions: Int): JavaPairRDD[K, JIterable[V]] = @@ -448,13 +456,17 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) fromRDD(rdd.subtractByKey(other)) } - /** Return an RDD with the pairs from `this` whose keys are not in `other`. */ + /** + * Return an RDD with the pairs from `this` whose keys are not in `other`. + */ def subtractByKey[W](other: JavaPairRDD[K, W], numPartitions: Int): JavaPairRDD[K, V] = { implicit val ctag: ClassTag[W] = fakeClassTag fromRDD(rdd.subtractByKey(other, numPartitions)) } - /** Return an RDD with the pairs from `this` whose keys are not in `other`. */ + /** + * Return an RDD with the pairs from `this` whose keys are not in `other`. + */ def subtractByKey[W](other: JavaPairRDD[K, W], p: Partitioner): JavaPairRDD[K, V] = { implicit val ctag: ClassTag[W] = fakeClassTag fromRDD(rdd.subtractByKey(other, p)) @@ -539,8 +551,8 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) * Group the values for each key in the RDD into a single sequence. Hash-partitions the * resulting RDD with the existing partitioner/parallelism level. * - * Note: If you are grouping in order to perform an aggregation (such as a sum or average) over - * each key, using [[JavaPairRDD.reduceByKey]] or [[JavaPairRDD.combineByKey]] + * @note If you are grouping in order to perform an aggregation (such as a sum or average) over + * each key, using `JavaPairRDD.reduceByKey` or `JavaPairRDD.combineByKey` * will provide much better performance. */ def groupByKey(): JavaPairRDD[K, JIterable[V]] = diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala index 20d6c9341bf7..41b5cab601c3 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala @@ -34,7 +34,9 @@ class JavaRDD[T](val rdd: RDD[T])(implicit val classTag: ClassTag[T]) // Common RDD functions - /** Persist this RDD with the default storage level (`MEMORY_ONLY`). */ + /** + * Persist this RDD with the default storage level (`MEMORY_ONLY`). + */ def cache(): JavaRDD[T] = wrapRDD(rdd.cache()) /** @@ -98,24 +100,32 @@ class JavaRDD[T](val rdd: RDD[T])(implicit val classTag: ClassTag[T]) def repartition(numPartitions: Int): JavaRDD[T] = rdd.repartition(numPartitions) /** - * Return a sampled subset of this RDD. 
+ * Return a sampled subset of this RDD with a random seed. * * @param withReplacement can elements be sampled multiple times (replaced when sampled out) * @param fraction expected size of the sample as a fraction of this RDD's size * without replacement: probability that each element is chosen; fraction must be [0, 1] - * with replacement: expected number of times each element is chosen; fraction must be >= 0 + * with replacement: expected number of times each element is chosen; fraction must be greater + * than or equal to 0 + * + * @note This is NOT guaranteed to provide exactly the fraction of the count + * of the given `RDD`. */ def sample(withReplacement: Boolean, fraction: Double): JavaRDD[T] = sample(withReplacement, fraction, Utils.random.nextLong) /** - * Return a sampled subset of this RDD. + * Return a sampled subset of this RDD, with a user-supplied seed. * * @param withReplacement can elements be sampled multiple times (replaced when sampled out) * @param fraction expected size of the sample as a fraction of this RDD's size * without replacement: probability that each element is chosen; fraction must be [0, 1] - * with replacement: expected number of times each element is chosen; fraction must be >= 0 + * with replacement: expected number of times each element is chosen; fraction must be greater + * than or equal to 0 * @param seed seed for the random number generator + * + * @note This is NOT guaranteed to provide exactly the fraction of the count + * of the given `RDD`. */ def sample(withReplacement: Boolean, fraction: Double, seed: Long): JavaRDD[T] = wrapRDD(rdd.sample(withReplacement, fraction, seed)) @@ -153,7 +163,7 @@ class JavaRDD[T](val rdd: RDD[T])(implicit val classTag: ClassTag[T]) * Return the intersection of this RDD and another one. The output will not contain any duplicate * elements, even if the input RDDs did. * - * Note that this method performs a shuffle internally. + * @note This method performs a shuffle internally. */ def intersection(other: JavaRDD[T]): JavaRDD[T] = wrapRDD(rdd.intersection(other.rdd)) @@ -161,7 +171,7 @@ class JavaRDD[T](val rdd: RDD[T])(implicit val classTag: ClassTag[T]) * Return an RDD with the elements from `this` that are not in `other`. * * Uses `this` partitioner/partition size, because even if `other` is huge, the resulting - * RDD will be <= us. + * RDD will be less than or equal to us. */ def subtract(other: JavaRDD[T]): JavaRDD[T] = wrapRDD(rdd.subtract(other)) diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala index a37c52cbaf21..eda16d957cc5 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala @@ -47,7 +47,8 @@ private[spark] abstract class AbstractJavaRDDLike[T, This <: JavaRDDLike[T, This /** * Defines operations common to several Java RDD implementations. - * Note that this trait is not intended to be implemented by user code. + * + * @note This trait is not intended to be implemented by user code. 
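The sampling note above is observable directly: `fraction` sets the expected sample size, not an exact one, so counts differ across seeds. A sketch (local master and seeds are illustrative):

```scala
import org.apache.spark.{SparkConf, SparkContext}

object SampleSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("sample-sketch"))
    val rdd = sc.parallelize(1 to 1000)
    // fraction = 0.1 is an expected size, not a guarantee: counts vary run to run.
    println(rdd.sample(withReplacement = false, fraction = 0.1, seed = 42L).count())
    println(rdd.sample(withReplacement = false, fraction = 0.1, seed = 43L).count())
    sc.stop()
  }
}
```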
*/ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable { def wrapRDD(rdd: RDD[T]): This diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala index 4e50c2686dd5..9481156bc93a 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala @@ -238,7 +238,9 @@ class JavaSparkContext(val sc: SparkContext) * }}} * * Do - * `JavaPairRDD rdd = sparkContext.dataStreamFiles("hdfs://a-hdfs-path")`, + * {{{ + * JavaPairRDD rdd = sparkContext.dataStreamFiles("hdfs://a-hdfs-path") + * }}} * * then `rdd` contains * {{{ @@ -270,7 +272,9 @@ class JavaSparkContext(val sc: SparkContext) * }}} * * Do - * `JavaPairRDD rdd = sparkContext.dataStreamFiles("hdfs://a-hdfs-path")`, + * {{{ + * JavaPairRDD rdd = sparkContext.dataStreamFiles("hdfs://a-hdfs-path") + * }}}, * * then `rdd` contains * {{{ @@ -298,7 +302,7 @@ class JavaSparkContext(val sc: SparkContext) /** * Get an RDD for a Hadoop SequenceFile with given key and value types. * - * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each + * @note Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD will create many references to the same object. * If you plan to directly cache Hadoop writable objects, you should first copy them using * a `map` function. @@ -316,7 +320,7 @@ class JavaSparkContext(val sc: SparkContext) /** * Get an RDD for a Hadoop SequenceFile. * - * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each + * @note Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD will create many references to the same object. * If you plan to directly cache Hadoop writable objects, you should first copy them using * a `map` function. @@ -366,7 +370,7 @@ class JavaSparkContext(val sc: SparkContext) * @param valueClass Class of the values * @param minPartitions Minimum number of Hadoop Splits to generate. * - * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each + * @note Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD will create many references to the same object. * If you plan to directly cache Hadoop writable objects, you should first copy them using * a `map` function. @@ -396,7 +400,7 @@ class JavaSparkContext(val sc: SparkContext) * @param keyClass Class of the keys * @param valueClass Class of the values * - * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each + * @note Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD will create many references to the same object. * If you plan to directly cache Hadoop writable objects, you should first copy them using * a `map` function. @@ -416,7 +420,7 @@ class JavaSparkContext(val sc: SparkContext) /** * Get an RDD for a Hadoop file with an arbitrary InputFormat. * - * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each + * @note Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD will create many references to the same object. 
* If you plan to directly cache Hadoop writable objects, you should first copy them using * a `map` function. @@ -437,7 +441,7 @@ class JavaSparkContext(val sc: SparkContext) /** * Get an RDD for a Hadoop file with an arbitrary InputFormat * - * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each + * @note Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD will create many references to the same object. * If you plan to directly cache Hadoop writable objects, you should first copy them using * a `map` function. @@ -458,7 +462,7 @@ class JavaSparkContext(val sc: SparkContext) * Get an RDD for a given Hadoop file with an arbitrary new API InputFormat * and extra configuration options to pass to the input format. * - * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each + * @note Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD will create many references to the same object. * If you plan to directly cache Hadoop writable objects, you should first copy them using * a `map` function. @@ -487,7 +491,7 @@ class JavaSparkContext(val sc: SparkContext) * @param kClass Class of the keys * @param vClass Class of the values * - * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each + * @note Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD will create many references to the same object. * If you plan to directly cache Hadoop writable objects, you should first copy them using * a `map` function. @@ -694,7 +698,7 @@ class JavaSparkContext(val sc: SparkContext) /** * Returns the Hadoop configuration used for the Hadoop code (e.g. file systems) we reuse. * - * '''Note:''' As it will be reused in all Hadoop RDDs, it's better not to modify it unless you + * @note As it will be reused in all Hadoop RDDs, it's better not to modify it unless you * plan to set some global configurations for all Hadoop RDDs. */ def hadoopConfiguration(): Configuration = { @@ -749,7 +753,7 @@ class JavaSparkContext(val sc: SparkContext) /** * Get a local property set in this thread, or null if it is missing. See - * [[org.apache.spark.api.java.JavaSparkContext.setLocalProperty]]. + * `org.apache.spark.api.java.JavaSparkContext.setLocalProperty`. */ def getLocalProperty(key: String): String = sc.getLocalProperty(key) @@ -769,7 +773,7 @@ class JavaSparkContext(val sc: SparkContext) * Application programmers can use this method to group all those jobs together and give a * group description. Once set, the Spark web UI will associate such jobs with this group. * - * The application can also use [[org.apache.spark.api.java.JavaSparkContext.cancelJobGroup]] + * The application can also use `org.apache.spark.api.java.JavaSparkContext.cancelJobGroup` * to cancel all running jobs in this group. For example, * {{{ * // In the main thread: @@ -802,7 +806,7 @@ class JavaSparkContext(val sc: SparkContext) /** * Cancel active jobs for the specified group. See - * [[org.apache.spark.api.java.JavaSparkContext.setJobGroup]] for more information. + * `org.apache.spark.api.java.JavaSparkContext.setJobGroup` for more information. 
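The Writable-reuse caveat repeated through these hunks comes down to copying before caching, as in this minimal sketch (`sc` is assumed to be a Scala SparkContext; the path is illustrative):
{{{
import org.apache.hadoop.io.Text

// Copy the reused Writables into immutable Strings before caching; without
// the map, every cached record would alias the same mutated Text instances.
val safe = sc.sequenceFile("hdfs://a-hdfs-path", classOf[Text], classOf[Text])
  .map { case (k, v) => (k.toString, v.toString) }
  .cache()
}}}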
*/ def cancelJobGroup(groupId: String): Unit = sc.cancelJobGroup(groupId) @@ -811,7 +815,8 @@ class JavaSparkContext(val sc: SparkContext) /** * Returns a Java map of JavaRDDs that have marked themselves as persistent via cache() call. - * Note that this does not necessarily mean the caching or computation was successful. + * + * @note This does not necessarily mean the caching or computation was successful. */ def getPersistentRDDs: JMap[java.lang.Integer, JavaRDD[_]] = { sc.getPersistentRDDs.mapValues(s => JavaRDD.fromRDD(s)) diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaSparkStatusTracker.scala b/core/src/main/scala/org/apache/spark/api/java/JavaSparkStatusTracker.scala index 99ca3c77cced..6aa290ecd7bb 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaSparkStatusTracker.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaSparkStatusTracker.scala @@ -31,7 +31,7 @@ import org.apache.spark.{SparkContext, SparkJobInfo, SparkStageInfo} * will provide information for the last `spark.ui.retainedStages` stages and * `spark.ui.retainedJobs` jobs. * - * NOTE: this class's constructor should be considered private and may be subject to change. + * @note This class's constructor should be considered private and may be subject to change. */ class JavaSparkStatusTracker private[spark] (sc: SparkContext) { diff --git a/core/src/main/scala/org/apache/spark/api/r/JVMObjectTracker.scala b/core/src/main/scala/org/apache/spark/api/r/JVMObjectTracker.scala new file mode 100644 index 000000000000..3432700f1160 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/api/r/JVMObjectTracker.scala @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.api.r + +import java.util.concurrent.atomic.AtomicInteger +import java.util.concurrent.ConcurrentHashMap + +/** JVM object ID wrapper */ +private[r] case class JVMObjectId(id: String) { + require(id != null, "Object ID cannot be null.") +} + +/** + * Counter that tracks JVM objects returned to R. + * This is useful for referencing these objects in RPC calls. + */ +private[r] class JVMObjectTracker { + + private[this] val objMap = new ConcurrentHashMap[JVMObjectId, Object]() + private[this] val objCounter = new AtomicInteger() + + /** + * Returns the JVM object associated with the input key or None if not found. + */ + final def get(id: JVMObjectId): Option[Object] = this.synchronized { + if (objMap.containsKey(id)) { + Some(objMap.get(id)) + } else { + None + } + } + + /** + * Returns the JVM object associated with the input key or throws an exception if not found. 
+ */ + @throws[NoSuchElementException]("if key does not exist.") + final def apply(id: JVMObjectId): Object = { + get(id).getOrElse( + throw new NoSuchElementException(s"$id does not exist.") + ) + } + + /** + * Adds a JVM object to track and returns assigned ID, which is unique within this tracker. + */ + final def addAndGetId(obj: Object): JVMObjectId = { + val id = JVMObjectId(objCounter.getAndIncrement().toString) + objMap.put(id, obj) + id + } + + /** + * Removes and returns a JVM object with the specific ID from the tracker, or None if not found. + */ + final def remove(id: JVMObjectId): Option[Object] = this.synchronized { + if (objMap.containsKey(id)) { + Some(objMap.remove(id)) + } else { + None + } + } + + /** + * Number of JVM objects being tracked. + */ + final def size: Int = objMap.size() + + /** + * Clears the tracker. + */ + final def clear(): Unit = objMap.clear() +} diff --git a/core/src/main/scala/org/apache/spark/api/r/RBackend.scala b/core/src/main/scala/org/apache/spark/api/r/RBackend.scala index 550746c552d0..2d1152a03644 100644 --- a/core/src/main/scala/org/apache/spark/api/r/RBackend.scala +++ b/core/src/main/scala/org/apache/spark/api/r/RBackend.scala @@ -22,7 +22,7 @@ import java.net.{InetAddress, InetSocketAddress, ServerSocket} import java.util.concurrent.TimeUnit import io.netty.bootstrap.ServerBootstrap -import io.netty.channel.{ChannelFuture, ChannelInitializer, ChannelOption, EventLoopGroup} +import io.netty.channel.{ChannelFuture, ChannelInitializer, EventLoopGroup} import io.netty.channel.nio.NioEventLoopGroup import io.netty.channel.socket.SocketChannel import io.netty.channel.socket.nio.NioServerSocketChannel @@ -42,6 +42,9 @@ private[spark] class RBackend { private[this] var bootstrap: ServerBootstrap = null private[this] var bossGroup: EventLoopGroup = null + /** Tracks JVM objects returned to R for this RBackend instance. 
*/ + private[r] val jvmObjectTracker = new JVMObjectTracker + def init(): Int = { val conf = new SparkConf() val backendConnectionTimeout = conf.getInt( @@ -94,6 +97,7 @@ private[spark] class RBackend { bootstrap.childGroup().shutdownGracefully() } bootstrap = null + jvmObjectTracker.clear() } } diff --git a/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala b/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala index 9f5afa29d6d2..cfd37ac54ba2 100644 --- a/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala +++ b/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala @@ -20,7 +20,6 @@ package org.apache.spark.api.r import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream} import java.util.concurrent.TimeUnit -import scala.collection.mutable.HashMap import scala.language.existentials import io.netty.channel.{ChannelHandlerContext, SimpleChannelInboundHandler} @@ -62,7 +61,7 @@ private[r] class RBackendHandler(server: RBackend) assert(numArgs == 1) writeInt(dos, 0) - writeObject(dos, args(0)) + writeObject(dos, args(0), server.jvmObjectTracker) case "stopBackend" => writeInt(dos, 0) writeType(dos, "void") @@ -72,9 +71,9 @@ private[r] class RBackendHandler(server: RBackend) val t = readObjectType(dis) assert(t == 'c') val objToRemove = readString(dis) - JVMObjectTracker.remove(objToRemove) + server.jvmObjectTracker.remove(JVMObjectId(objToRemove)) writeInt(dos, 0) - writeObject(dos, null) + writeObject(dos, null, server.jvmObjectTracker) } catch { case e: Exception => logError(s"Removing $objId failed", e) @@ -143,12 +142,8 @@ private[r] class RBackendHandler(server: RBackend) val cls = if (isStatic) { Utils.classForName(objId) } else { - JVMObjectTracker.get(objId) match { - case None => throw new IllegalArgumentException("Object not found " + objId) - case Some(o) => - obj = o - o.getClass - } + obj = server.jvmObjectTracker(JVMObjectId(objId)) + obj.getClass } val args = readArgs(numArgs, dis) @@ -173,7 +168,7 @@ private[r] class RBackendHandler(server: RBackend) // Write status bit writeInt(dos, 0) - writeObject(dos, ret.asInstanceOf[AnyRef]) + writeObject(dos, ret.asInstanceOf[AnyRef], server.jvmObjectTracker) } else if (methodName == "") { // methodName should be "" for constructor val ctors = cls.getConstructors @@ -193,7 +188,7 @@ private[r] class RBackendHandler(server: RBackend) val obj = ctors(index.get).newInstance(args : _*) writeInt(dos, 0) - writeObject(dos, obj.asInstanceOf[AnyRef]) + writeObject(dos, obj.asInstanceOf[AnyRef], server.jvmObjectTracker) } else { throw new IllegalArgumentException("invalid method " + methodName + " for object " + objId) } @@ -210,7 +205,7 @@ private[r] class RBackendHandler(server: RBackend) // Read a number of arguments from the data input stream def readArgs(numArgs: Int, dis: DataInputStream): Array[java.lang.Object] = { (0 until numArgs).map { _ => - readObject(dis) + readObject(dis, server.jvmObjectTracker) }.toArray } @@ -286,37 +281,4 @@ private[r] class RBackendHandler(server: RBackend) } } -/** - * Helper singleton that tracks JVM objects returned to R. - * This is useful for referencing these objects in RPC calls. - */ -private[r] object JVMObjectTracker { - - // TODO: This map should be thread-safe if we want to support multiple - // connections at the same time - private[this] val objMap = new HashMap[String, Object] - - // TODO: We support only one connection now, so an integer is fine. - // Investigate using use atomic integer in the future. 
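A sketch of the call pattern for the new per-backend tracker introduced above, which replaces the singleton removed here (the class is `private[r]`, so this compiles only inside that package; the payload object is a placeholder):
{{{
val tracker = new JVMObjectTracker
val id = tracker.addAndGetId(new Object)    // register; returns a unique JVMObjectId
val obj: Object = tracker(id)               // apply(): throws NoSuchElementException if absent
val maybe: Option[Object] = tracker.get(id) // get(): None if absent
tracker.remove(id)                          // stop tracking; returns the removed object
}}}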
- private[this] var objCounter: Int = 0 - - def getObject(id: String): Object = { - objMap(id) - } - - def get(id: String): Option[Object] = { - objMap.get(id) - } - - def put(obj: Object): String = { - val objId = objCounter.toString - objCounter = objCounter + 1 - objMap.put(objId, obj) - objId - } - def remove(id: String): Option[Object] = { - objMap.remove(id) - } - -} diff --git a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala index a1a5eb8cf55e..72ae0340aa3d 100644 --- a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala @@ -17,6 +17,7 @@ package org.apache.spark.api.r +import java.io.File import java.util.{Map => JMap} import scala.collection.JavaConverters._ @@ -127,6 +128,14 @@ private[r] object RRDD { sparkConf.setExecutorEnv(name.toString, value.toString) } + if (sparkEnvirMap.containsKey("spark.r.sql.derby.temp.dir") && + System.getProperty("derby.stream.error.file") == null) { + // This must be set before SparkContext is instantiated. + System.setProperty("derby.stream.error.file", + Seq(sparkEnvirMap.get("spark.r.sql.derby.temp.dir").toString, "derby.log") + .mkString(File.separator)) + } + val jsc = new JavaSparkContext(sparkConf) jars.foreach { jar => jsc.addJar(jar) diff --git a/core/src/main/scala/org/apache/spark/api/r/RRunner.scala b/core/src/main/scala/org/apache/spark/api/r/RRunner.scala index 7ef64723d959..88118392003e 100644 --- a/core/src/main/scala/org/apache/spark/api/r/RRunner.scala +++ b/core/src/main/scala/org/apache/spark/api/r/RRunner.scala @@ -152,7 +152,7 @@ private[spark] class RRunner[U]( dataOut.writeInt(mode) if (isDataFrame) { - SerDe.writeObject(dataOut, colNames) + SerDe.writeObject(dataOut, colNames, jvmObjectTracker = null) } if (!iter.hasNext) { @@ -347,6 +347,8 @@ private[r] object RRunner { pb.environment().put("SPARKR_RLIBDIR", rLibDir.mkString(",")) pb.environment().put("SPARKR_WORKER_PORT", port.toString) pb.environment().put("SPARKR_BACKEND_CONNECTION_TIMEOUT", rConnectionTimeout.toString) + pb.environment().put("SPARKR_SPARKFILES_ROOT_DIR", SparkFiles.getRootDirectory()) + pb.environment().put("SPARKR_IS_RUNNING_ON_WORKER", "TRUE") pb.redirectErrorStream(true) // redirect stderr into stdout val proc = pb.start() val errThread = startStdoutThread(proc) diff --git a/core/src/main/scala/org/apache/spark/api/r/RUtils.scala b/core/src/main/scala/org/apache/spark/api/r/RUtils.scala index 77825e75e513..fdd8cf62f0e5 100644 --- a/core/src/main/scala/org/apache/spark/api/r/RUtils.scala +++ b/core/src/main/scala/org/apache/spark/api/r/RUtils.scala @@ -84,7 +84,6 @@ private[spark] object RUtils { } } else { // Otherwise, assume the package is local - // TODO: support this for Mesos val sparkRPkgPath = localSparkRPackagePath.getOrElse { throw new SparkException("SPARK_HOME not set. 
Can't locate SparkR package.") } diff --git a/core/src/main/scala/org/apache/spark/api/r/SerDe.scala b/core/src/main/scala/org/apache/spark/api/r/SerDe.scala index 550e075a9512..dad928cdcfd0 100644 --- a/core/src/main/scala/org/apache/spark/api/r/SerDe.scala +++ b/core/src/main/scala/org/apache/spark/api/r/SerDe.scala @@ -28,13 +28,20 @@ import scala.collection.mutable.WrappedArray * Utility functions to serialize, deserialize objects to / from R */ private[spark] object SerDe { - type ReadObject = (DataInputStream, Char) => Object - type WriteObject = (DataOutputStream, Object) => Boolean + type SQLReadObject = (DataInputStream, Char) => Object + type SQLWriteObject = (DataOutputStream, Object) => Boolean - var sqlSerDe: (ReadObject, WriteObject) = _ + private[this] var sqlReadObject: SQLReadObject = _ + private[this] var sqlWriteObject: SQLWriteObject = _ - def registerSqlSerDe(sqlSerDe: (ReadObject, WriteObject)): Unit = { - this.sqlSerDe = sqlSerDe + def setSQLReadObject(value: SQLReadObject): this.type = { + sqlReadObject = value + this + } + + def setSQLWriteObject(value: SQLWriteObject): this.type = { + sqlWriteObject = value + this } // Type mapping from R to Java @@ -56,32 +63,33 @@ private[spark] object SerDe { dis.readByte().toChar } - def readObject(dis: DataInputStream): Object = { + def readObject(dis: DataInputStream, jvmObjectTracker: JVMObjectTracker): Object = { val dataType = readObjectType(dis) - readTypedObject(dis, dataType) + readTypedObject(dis, dataType, jvmObjectTracker) } def readTypedObject( dis: DataInputStream, - dataType: Char): Object = { + dataType: Char, + jvmObjectTracker: JVMObjectTracker): Object = { dataType match { case 'n' => null case 'i' => new java.lang.Integer(readInt(dis)) case 'd' => new java.lang.Double(readDouble(dis)) case 'b' => new java.lang.Boolean(readBoolean(dis)) case 'c' => readString(dis) - case 'e' => readMap(dis) + case 'e' => readMap(dis, jvmObjectTracker) case 'r' => readBytes(dis) - case 'a' => readArray(dis) - case 'l' => readList(dis) + case 'a' => readArray(dis, jvmObjectTracker) + case 'l' => readList(dis, jvmObjectTracker) case 'D' => readDate(dis) case 't' => readTime(dis) - case 'j' => JVMObjectTracker.getObject(readString(dis)) + case 'j' => jvmObjectTracker(JVMObjectId(readString(dis))) case _ => - if (sqlSerDe == null || sqlSerDe._1 == null) { + if (sqlReadObject == null) { throw new IllegalArgumentException (s"Invalid type $dataType") } else { - val obj = (sqlSerDe._1)(dis, dataType) + val obj = sqlReadObject(dis, dataType) if (obj == null) { throw new IllegalArgumentException (s"Invalid type $dataType") } else { @@ -181,28 +189,28 @@ private[spark] object SerDe { } // All elements of an array must be of the same type - def readArray(dis: DataInputStream): Array[_] = { + def readArray(dis: DataInputStream, jvmObjectTracker: JVMObjectTracker): Array[_] = { val arrType = readObjectType(dis) arrType match { case 'i' => readIntArr(dis) case 'c' => readStringArr(dis) case 'd' => readDoubleArr(dis) case 'b' => readBooleanArr(dis) - case 'j' => readStringArr(dis).map(x => JVMObjectTracker.getObject(x)) + case 'j' => readStringArr(dis).map(x => jvmObjectTracker(JVMObjectId(x))) case 'r' => readBytesArr(dis) case 'a' => val len = readInt(dis) - (0 until len).map(_ => readArray(dis)).toArray + (0 until len).map(_ => readArray(dis, jvmObjectTracker)).toArray case 'l' => val len = readInt(dis) - (0 until len).map(_ => readList(dis)).toArray + (0 until len).map(_ => readList(dis, jvmObjectTracker)).toArray case _ => - if (sqlSerDe 
== null || sqlSerDe._1 == null) { + if (sqlReadObject == null) { throw new IllegalArgumentException (s"Invalid array type $arrType") } else { val len = readInt(dis) (0 until len).map { _ => - val obj = (sqlSerDe._1)(dis, arrType) + val obj = sqlReadObject(dis, arrType) if (obj == null) { throw new IllegalArgumentException (s"Invalid array type $arrType") } else { @@ -215,17 +223,19 @@ private[spark] object SerDe { // Each element of a list can be of different type. They are all represented // as Object on JVM side - def readList(dis: DataInputStream): Array[Object] = { + def readList(dis: DataInputStream, jvmObjectTracker: JVMObjectTracker): Array[Object] = { val len = readInt(dis) - (0 until len).map(_ => readObject(dis)).toArray + (0 until len).map(_ => readObject(dis, jvmObjectTracker)).toArray } - def readMap(in: DataInputStream): java.util.Map[Object, Object] = { + def readMap( + in: DataInputStream, + jvmObjectTracker: JVMObjectTracker): java.util.Map[Object, Object] = { val len = readInt(in) if (len > 0) { // Keys is an array of String - val keys = readArray(in).asInstanceOf[Array[Object]] - val values = readList(in) + val keys = readArray(in, jvmObjectTracker).asInstanceOf[Array[Object]] + val values = readList(in, jvmObjectTracker) keys.zip(values).toMap.asJava } else { @@ -272,7 +282,11 @@ private[spark] object SerDe { } } - private def writeKeyValue(dos: DataOutputStream, key: Object, value: Object): Unit = { + private def writeKeyValue( + dos: DataOutputStream, + key: Object, + value: Object, + jvmObjectTracker: JVMObjectTracker): Unit = { if (key == null) { throw new IllegalArgumentException("Key in map can't be null.") } else if (!key.isInstanceOf[String]) { @@ -280,10 +294,10 @@ private[spark] object SerDe { } writeString(dos, key.asInstanceOf[String]) - writeObject(dos, value) + writeObject(dos, value, jvmObjectTracker) } - def writeObject(dos: DataOutputStream, obj: Object): Unit = { + def writeObject(dos: DataOutputStream, obj: Object, jvmObjectTracker: JVMObjectTracker): Unit = { if (obj == null) { writeType(dos, "void") } else { @@ -373,14 +387,14 @@ private[spark] object SerDe { case v: Array[Object] => writeType(dos, "list") writeInt(dos, v.length) - v.foreach(elem => writeObject(dos, elem)) + v.foreach(elem => writeObject(dos, elem, jvmObjectTracker)) // Handle Properties // This must be above the case java.util.Map below. 
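That ordering note is load-bearing: `java.util.Properties` extends `Hashtable`, so it is itself a `java.util.Map`, and a `Map` case listed first would swallow it. A standalone illustration:
{{{
val p = new java.util.Properties()
val tag = p match {
  case _: java.util.Properties => "jobj" // reached only because it is listed first
  case _: java.util.Map[_, _]  => "map"
}
assert(tag == "jobj")
}}}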
// (Properties implements Map and will be serialized as map otherwise) case v: java.util.Properties => writeType(dos, "jobj") - writeJObj(dos, value) + writeJObj(dos, value, jvmObjectTracker) // Handle map case v: java.util.Map[_, _] => @@ -392,19 +406,21 @@ private[spark] object SerDe { val key = entry.getKey val value = entry.getValue - writeKeyValue(dos, key.asInstanceOf[Object], value.asInstanceOf[Object]) + writeKeyValue( + dos, key.asInstanceOf[Object], value.asInstanceOf[Object], jvmObjectTracker) } case v: scala.collection.Map[_, _] => writeType(dos, "map") writeInt(dos, v.size) - v.foreach { case (key, value) => - writeKeyValue(dos, key.asInstanceOf[Object], value.asInstanceOf[Object]) + v.foreach { case (k1, v1) => + writeKeyValue(dos, k1.asInstanceOf[Object], v1.asInstanceOf[Object], jvmObjectTracker) } case _ => - if (sqlSerDe == null || sqlSerDe._2 == null || !(sqlSerDe._2)(dos, value)) { + val sqlWriteSucceeded = sqlWriteObject != null && sqlWriteObject(dos, value) + if (!sqlWriteSucceeded) { writeType(dos, "jobj") - writeJObj(dos, value) + writeJObj(dos, value, jvmObjectTracker) } } } @@ -447,9 +463,9 @@ private[spark] object SerDe { out.write(value) } - def writeJObj(out: DataOutputStream, value: Object): Unit = { - val objId = JVMObjectTracker.put(value) - writeString(out, objId) + def writeJObj(out: DataOutputStream, value: Object, jvmObjectTracker: JVMObjectTracker): Unit = { + val JVMObjectId(id) = jvmObjectTracker.addAndGetId(value) + writeString(out, id) } def writeIntArr(out: DataOutputStream, value: Array[Int]): Unit = { diff --git a/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala b/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala index e8d6d587b482..22d01c47e645 100644 --- a/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala +++ b/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala @@ -19,6 +19,7 @@ package org.apache.spark.broadcast import java.io._ import java.nio.ByteBuffer +import java.util.zip.Adler32 import scala.collection.JavaConverters._ import scala.reflect.ClassTag @@ -77,6 +78,7 @@ private[spark] class TorrentBroadcast[T: ClassTag](obj: T, id: Long) } // Note: use getSizeAsKb (not bytes) to maintain compatibility if no units are provided blockSize = conf.getSizeAsKb("spark.broadcast.blockSize", "4m").toInt * 1024 + checksumEnabled = conf.getBoolean("spark.broadcast.checksum", true) } setConf(SparkEnv.get.conf) @@ -85,10 +87,27 @@ private[spark] class TorrentBroadcast[T: ClassTag](obj: T, id: Long) /** Total number of blocks this broadcast variable contains. */ private val numBlocks: Int = writeBlocks(obj) + /** Whether to generate checksum for blocks or not. */ + private var checksumEnabled: Boolean = false + /** The checksum for all the blocks. */ + private var checksums: Array[Int] = _ + override protected def getValue() = { _value } + private def calcChecksum(block: ByteBuffer): Int = { + val adler = new Adler32() + if (block.hasArray) { + adler.update(block.array, block.arrayOffset + block.position, block.limit - block.position) + } else { + val bytes = new Array[Byte](block.remaining()) + block.duplicate.get(bytes) + adler.update(bytes) + } + adler.getValue.toInt + } + /** * Divide the object into multiple blocks and put those blocks in the block manager. 
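A standalone version of the `calcChecksum` helper above, taking only the generic (non-array-backed) path; `duplicate()` keeps the caller's buffer position untouched:
{{{
import java.nio.ByteBuffer
import java.util.zip.Adler32

def checksum(block: ByteBuffer): Int = {
  val adler = new Adler32()
  val bytes = new Array[Byte](block.remaining())
  block.duplicate().get(bytes) // read without moving the original position
  adler.update(bytes)
  adler.getValue.toInt         // Adler-32 fits in 32 bits
}
}}}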
* @@ -105,7 +124,13 @@ private[spark] class TorrentBroadcast[T: ClassTag](obj: T, id: Long) } val blocks = TorrentBroadcast.blockifyObject(value, blockSize, SparkEnv.get.serializer, compressionCodec) + if (checksumEnabled) { + checksums = new Array[Int](blocks.length) + } blocks.zipWithIndex.foreach { case (block, i) => + if (checksumEnabled) { + checksums(i) = calcChecksum(block) + } val pieceId = BroadcastBlockId(id, "piece" + i) val bytes = new ChunkedByteBuffer(block.duplicate()) if (!blockManager.putBytes(pieceId, bytes, MEMORY_AND_DISK_SER, tellMaster = true)) { @@ -135,6 +160,13 @@ private[spark] class TorrentBroadcast[T: ClassTag](obj: T, id: Long) case None => bm.getRemoteBytes(pieceId) match { case Some(b) => + if (checksumEnabled) { + val sum = calcChecksum(b.chunks(0)) + if (sum != checksums(pid)) { + throw new SparkException(s"corrupt remote block $pieceId of $broadcastId:" + + s" $sum != ${checksums(pid)}") + } + } // We found the block from remote executors/driver's BlockManager, so put the block // in this executor's BlockManager. if (!bm.putBytes(pieceId, b, StorageLevel.MEMORY_AND_DISK_SER, tellMaster = true)) { @@ -175,11 +207,15 @@ private[spark] class TorrentBroadcast[T: ClassTag](obj: T, id: Long) TorrentBroadcast.synchronized { setConf(SparkEnv.get.conf) val blockManager = SparkEnv.get.blockManager - blockManager.getLocalValues(broadcastId).map(_.data.next()) match { - case Some(x) => - releaseLock(broadcastId) - x.asInstanceOf[T] - + blockManager.getLocalValues(broadcastId) match { + case Some(blockResult) => + if (blockResult.data.hasNext) { + val x = blockResult.data.next().asInstanceOf[T] + releaseLock(broadcastId) + x + } else { + throw new SparkException(s"Failed to get locally stored broadcast data: $broadcastId") + } case None => logInfo("Started reading broadcast variable " + id) val startTimeMs = System.currentTimeMillis() diff --git a/core/src/main/scala/org/apache/spark/deploy/Client.scala b/core/src/main/scala/org/apache/spark/deploy/Client.scala index ee276e1b7113..a4de3d7eaf45 100644 --- a/core/src/main/scala/org/apache/spark/deploy/Client.scala +++ b/core/src/main/scala/org/apache/spark/deploy/Client.scala @@ -221,7 +221,9 @@ object Client { val conf = new SparkConf() val driverArgs = new ClientArguments(args) - conf.set("spark.rpc.askTimeout", "10") + if (!conf.contains("spark.rpc.askTimeout")) { + conf.set("spark.rpc.askTimeout", "10s") + } Logger.getRootLogger.setLevel(driverArgs.logLevel) val rpcEnv = diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala index 3f54ecc17ac3..1b3e9521c9bd 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala @@ -21,7 +21,7 @@ import java.io.IOException import java.lang.reflect.Method import java.security.PrivilegedExceptionAction import java.text.DateFormat -import java.util.{Arrays, Comparator, Date} +import java.util.{Arrays, Comparator, Date, Locale} import scala.collection.JavaConverters._ import scala.util.control.NonFatal @@ -357,7 +357,7 @@ class SparkHadoopUtil extends Logging { * @return a printable string value. 
*/ private[spark] def tokenToString(token: Token[_ <: TokenIdentifier]): String = { - val df = DateFormat.getDateTimeInstance(DateFormat.SHORT, DateFormat.SHORT) + val df = DateFormat.getDateTimeInstance(DateFormat.SHORT, DateFormat.SHORT, Locale.US) val buffer = new StringBuilder(128) buffer.append(token.toString) try { @@ -373,7 +373,7 @@ class SparkHadoopUtil extends Logging { } } catch { case e: IOException => - logDebug("Failed to decode $token: $e", e) + logDebug(s"Failed to decode $token: $e", e) } buffer.toString } diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 5c052286099f..443f1f5b084c 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -322,7 +322,7 @@ object SparkSubmit { } // Require all R files to be local - if (args.isR && !isYarnCluster) { + if (args.isR && !isYarnCluster && !isMesosCluster) { if (Utils.nonLocalPaths(args.primaryResource).nonEmpty) { printErrorAndExit(s"Only local R files are supported: ${args.primaryResource}") } @@ -330,9 +330,6 @@ object SparkSubmit { // The following modes are not supported or applicable (clusterManager, deployMode) match { - case (MESOS, CLUSTER) if args.isR => - printErrorAndExit("Cluster deploy mode is currently not supported for R " + - "applications on Mesos clusters.") case (STANDALONE, CLUSTER) if args.isPython => printErrorAndExit("Cluster deploy mode is currently not supported for python " + "applications on standalone clusters.") @@ -410,9 +407,9 @@ object SparkSubmit { printErrorAndExit("Distributing R packages with standalone cluster is not supported.") } - // TODO: Support SparkR with mesos cluster - if (args.isR && clusterManager == MESOS) { - printErrorAndExit("SparkR is not supported for Mesos cluster.") + // TODO: Support distributing R packages with mesos cluster + if (args.isR && clusterManager == MESOS && !RUtils.rPackages.isEmpty) { + printErrorAndExit("Distributing R packages with mesos cluster is not supported.") } // If we're running an R app, set the main class to our specific R runner @@ -488,12 +485,17 @@ object SparkSubmit { // In client mode, launch the application main class directly // In addition, add the main application jar and any added jars (if any) to the classpath - if (deployMode == CLIENT) { + // Also add the main application jar and any added jars to classpath in case YARN client + // requires these jars. 
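The `Locale.US` argument added to `tokenToString` above (and to the master and worker `createDateFormat` later in this patch) pins formatting to a single locale; without it the JVM's default locale leaks into the output, as this small demonstration shows:
{{{
import java.text.SimpleDateFormat
import java.util.{Date, Locale}

val d = new Date(0L)
val us = new SimpleDateFormat("EEE MMM dd yyyy", Locale.US).format(d)
val fr = new SimpleDateFormat("EEE MMM dd yyyy", Locale.FRANCE).format(d)
println(s"$us vs $fr") // e.g. "Thu Jan 01 1970" vs "jeu. janv. 01 1970"
}}}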
+ if (deployMode == CLIENT || isYarnCluster) { childMainClass = args.mainClass if (isUserJar(args.primaryResource)) { childClasspath += args.primaryResource } if (args.jars != null) { childClasspath ++= args.jars.split(",") } + } + + if (deployMode == CLIENT) { if (args.childArgs != null) { childArgs ++= args.childArgs } } @@ -598,6 +600,9 @@ object SparkSubmit { if (args.pyFiles != null) { sysProps("spark.submit.pyFiles") = args.pyFiles } + } else if (args.isR) { + // Second argument is main class + childArgs += (args.primaryResource, "") } else { childArgs += (args.primaryResource, args.mainClass) } diff --git a/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala index 06530ff83646..6d8758a3d3b1 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala @@ -74,6 +74,30 @@ private[history] case class LoadedAppUI( private[history] abstract class ApplicationHistoryProvider { + /** + * Returns the count of application event logs that the provider is currently still processing. + * History Server UI can use this to indicate to a user that the application listing on the UI + * can be expected to list additional known applications once the processing of these + * application event logs completes. + * + * A History Provider that does not have a notion of count of event logs that may be pending + * for processing need not override this method. + * + * @return Count of application event logs that are currently under process + */ + def getEventLogsUnderProcess(): Int = { + 0 + } + + /** + * Returns the time the history provider last updated the application history information + * + * @return 0 if this is undefined or unsupported, otherwise the last updated time in millis + */ + def getLastUpdatedTime(): Long = { + 0 + } + /** * Returns a list of applications available for the history server to show. 
* diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index dfc1aad64c81..2bd019c53aae 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -19,7 +19,7 @@ package org.apache.spark.deploy.history import java.io.{FileNotFoundException, IOException, OutputStream} import java.util.UUID -import java.util.concurrent.{Executors, ExecutorService, TimeUnit} +import java.util.concurrent.{Executors, ExecutorService, Future, TimeUnit} import java.util.zip.{ZipEntry, ZipOutputStream} import scala.collection.mutable @@ -97,6 +97,13 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) .map { d => Utils.resolveURI(d).toString } .getOrElse(DEFAULT_LOG_DIR) + private val HISTORY_UI_ACLS_ENABLE = conf.getBoolean("spark.history.ui.acls.enable", false) + private val HISTORY_UI_ADMIN_ACLS = conf.get("spark.history.ui.admin.acls", "") + private val HISTORY_UI_ADMIN_ACLS_GROUPS = conf.get("spark.history.ui.admin.acls.groups", "") + logInfo(s"History server ui acls " + (if (HISTORY_UI_ACLS_ENABLE) "enabled" else "disabled") + + "; users with admin permissions: " + HISTORY_UI_ADMIN_ACLS.toString + + "; groups with admin permissions: " + HISTORY_UI_ADMIN_ACLS_GROUPS.toString) + private val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf) private val fs = Utils.getHadoopFileSystem(logDir, hadoopConf) @@ -108,7 +115,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) // The modification time of the newest log detected during the last scan. Currently only // used for logging msgs (logs are re-scanned based on file size, rather than modtime) - private var lastScanTime = -1L + private val lastScanTime = new java.util.concurrent.atomic.AtomicLong(-1) // Mapping of application IDs to their metadata, in descending end time order. Apps are inserted // into the map in order, so the LinkedHashMap maintains the correct ordering. @@ -120,6 +127,8 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) // List of application logs to be deleted by event log cleaner. private var attemptsToClean = new mutable.ListBuffer[FsApplicationAttemptInfo] + private val pendingReplayTasksCount = new java.util.concurrent.atomic.AtomicInteger(0) + /** * Return a runnable that performs the given operation on the event logs. * This operation is expected to be executed periodically.
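The new `pendingReplayTasksCount` gauge is maintained around the replay executor in `checkForLogs` below: queue the tasks, bump the counter by the batch size, then decrement in a `finally` so a failed replay cannot leave the gauge stuck. Distilled into a sketch (names are illustrative):
{{{
import java.util.concurrent.{Executors, Future}
import java.util.concurrent.atomic.AtomicInteger
import scala.collection.mutable

val pending = new AtomicInteger(0) // surfaced as "event logs under process"
val pool = Executors.newFixedThreadPool(2)

def process(work: Seq[Runnable]): Unit = {
  val tasks = mutable.ListBuffer[Future[_]]()
  work.foreach(w => tasks += pool.submit(w))
  pending.addAndGet(tasks.size)
  tasks.foreach { t =>
    try t.get()
    catch { case e: Exception => println(s"replay failed: $e") }
    finally pending.decrementAndGet() // always restore the gauge
  }
}
}}}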
@@ -226,6 +235,10 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) applications.get(appId) } + override def getEventLogsUnderProcess(): Int = pendingReplayTasksCount.get() + + override def getLastUpdatedTime(): Long = lastScanTime.get() + override def getAppUI(appId: String, attemptId: Option[String]): Option[LoadedAppUI] = { try { applications.get(appId).flatMap { appInfo => @@ -244,13 +257,14 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) val appListener = replay(fileStatus, isApplicationCompleted(fileStatus), replayBus) if (appListener.appId.isDefined) { - val uiAclsEnabled = conf.getBoolean("spark.history.ui.acls.enable", false) - ui.getSecurityManager.setAcls(uiAclsEnabled) + ui.getSecurityManager.setAcls(HISTORY_UI_ACLS_ENABLE) // make sure to set admin acls before view acls so they are properly picked up - ui.getSecurityManager.setAdminAcls(appListener.adminAcls.getOrElse("")) - ui.getSecurityManager.setViewAcls(attempt.sparkUser, - appListener.viewAcls.getOrElse("")) - ui.getSecurityManager.setAdminAclsGroups(appListener.adminAclsGroups.getOrElse("")) + val adminAcls = HISTORY_UI_ADMIN_ACLS + "," + appListener.adminAcls.getOrElse("") + ui.getSecurityManager.setAdminAcls(adminAcls) + ui.getSecurityManager.setViewAcls(attempt.sparkUser, appListener.viewAcls.getOrElse("")) + val adminAclsGroups = HISTORY_UI_ADMIN_ACLS_GROUPS + "," + + appListener.adminAclsGroups.getOrElse("") + ui.getSecurityManager.setAdminAclsGroups(adminAclsGroups) ui.getSecurityManager.setViewAclsGroups(appListener.viewAclsGroups.getOrElse("")) Some(LoadedAppUI(ui, updateProbe(appId, attemptId, attempt.fileSize))) } else { @@ -329,26 +343,43 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) if (logInfos.nonEmpty) { logDebug(s"New/updated attempts found: ${logInfos.size} ${logInfos.map(_.getPath)}") } - logInfos.map { file => - replayExecutor.submit(new Runnable { + + var tasks = mutable.ListBuffer[Future[_]]() + + try { + for (file <- logInfos) { + tasks += replayExecutor.submit(new Runnable { override def run(): Unit = mergeApplicationListing(file) }) } - .foreach { task => - try { - // Wait for all tasks to finish. This makes sure that checkForLogs - // is not scheduled again while some tasks are already running in - // the replayExecutor. - task.get() - } catch { - case e: InterruptedException => - throw e - case e: Exception => - logError("Exception while merging application listings", e) - } + } catch { + // let the iteration over logInfos break, since an exception on + // replayExecutor.submit (..) indicates the ExecutorService is unable + // to take any more submissions at this time + + case e: Exception => + logError(s"Exception while submitting event log for replay", e) + } + + pendingReplayTasksCount.addAndGet(tasks.size) + + tasks.foreach { task => + try { + // Wait for all tasks to finish. This makes sure that checkForLogs + // is not scheduled again while some tasks are already running in + // the replayExecutor. 
+ task.get() + } catch { + case e: InterruptedException => + throw e + case e: Exception => + logError("Exception while merging application listings", e) + } finally { + pendingReplayTasksCount.decrementAndGet() } + } - lastScanTime = newLastScanTime + lastScanTime.set(newLastScanTime) } catch { case e: Exception => logError("Exception in checking for event log updates", e) } @@ -365,7 +396,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) } catch { case e: Exception => logError("Exception encountered when attempting to update last scan time", e) - lastScanTime + lastScanTime.get() } finally { if (!fs.delete(path, true)) { logWarning(s"Error deleting ${path}") @@ -640,9 +671,9 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) false } - // For testing. private[history] def isFsInSafeMode(dfs: DistributedFileSystem): Boolean = { - dfs.setSafeMode(HdfsConstants.SafeModeAction.SAFEMODE_GET) + /* true to check only for Active NNs status */ + dfs.setSafeMode(HdfsConstants.SafeModeAction.SAFEMODE_GET, true) } /** diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala index 96b9ecf43b14..af1471763340 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala @@ -26,17 +26,35 @@ import org.apache.spark.ui.{UIUtils, WebUIPage} private[history] class HistoryPage(parent: HistoryServer) extends WebUIPage("") { def render(request: HttpServletRequest): Seq[Node] = { + // stripXSS is called first to remove suspicious characters used in XSS attacks val requestedIncomplete = - Option(request.getParameter("showIncomplete")).getOrElse("false").toBoolean + Option(UIUtils.stripXSS(request.getParameter("showIncomplete"))).getOrElse("false").toBoolean val allAppsSize = parent.getApplicationList().count(_.completed != requestedIncomplete) + val eventLogsUnderProcessCount = parent.getEventLogsUnderProcess() + val lastUpdatedTime = parent.getLastUpdatedTime() val providerConfig = parent.getProviderConfig() val content = +
<ul class="unstyled"> {providerConfig.map { case (k, v) => <li><strong>{k}:</strong> {v}</li> }} </ul>
+ {
+ if (eventLogsUnderProcessCount > 0) {
+ <p>There are {eventLogsUnderProcessCount} event log(s) currently being
+ processed which may result in additional applications getting listed on this page.
+ Refresh the page to view updates.</p>
+ }
+ }
+
+ {
+ if (lastUpdatedTime > 0) {
+ <p>Last updated: {lastUpdatedTime}</p>
+ }
+ }
+
+ { if (allAppsSize > 0) { ++ @@ -46,6 +64,8 @@ } else if (requestedIncomplete) { <h4>No incomplete applications found!</h4>
+ } else if (eventLogsUnderProcessCount > 0) {
+ <h4>No completed applications found!</h4>
} else { <h4>No completed applications found!</h4>
++ parent.emptyListingHtml } diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala index 3175b36b3e56..b02992af7b04 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala @@ -179,6 +179,14 @@ class HistoryServer( provider.getListing() } + def getEventLogsUnderProcess(): Int = { + provider.getEventLogsUnderProcess() + } + + def getLastUpdatedTime(): Long = { + provider.getLastUpdatedTime() + } + def getApplicationInfoList: Iterator[ApplicationInfo] = { getApplicationList().map(ApplicationsListResource.appHistoryInfoToPublicAppInfo) } @@ -261,7 +269,7 @@ object HistoryServer extends Logging { Utils.initDaemon(log) new HistoryServerArguments(conf, argStrings) initSecurity() - val securityManager = new SecurityManager(conf) + val securityManager = createSecurityManager(conf) val providerName = conf.getOption("spark.history.provider") .getOrElse(classOf[FsHistoryProvider].getName()) @@ -281,6 +289,24 @@ object HistoryServer extends Logging { while(true) { Thread.sleep(Int.MaxValue) } } + /** + * Create a security manager. + * This turns off security in the SecurityManager, so that the History Server can start + * in a Spark cluster where security is enabled. + * @param config configuration for the SecurityManager constructor + * @return the security manager for use in constructing the History Server. + */ + private[history] def createSecurityManager(config: SparkConf): SecurityManager = { + if (config.getBoolean("spark.acls.enable", config.getBoolean("spark.ui.acls.enable", false))) { + logInfo("Either spark.acls.enable or spark.ui.acls.enable is configured, clearing it and " + + "only using spark.history.ui.acl.enable") + config.set("spark.acls.enable", "false") + config.set("spark.ui.acls.enable", "false") + } + + new SecurityManager(config) + } + def initSecurity() { // If we are accessing HDFS and it has security enabled (Kerberos), we have to login // from a keytab file so that we can access HDFS beyond the kerberos ticket expiration. diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerArguments.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerArguments.scala index 2eddb5ff5447..080ba12c2f0d 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerArguments.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerArguments.scala @@ -24,7 +24,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.util.Utils /** - * Command-line parser for the master. + * Command-line parser for the [[HistoryServer]]. 
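The new `createSecurityManager` helper above, in miniature (it is `private[history]`, so this sketch compiles only in that package; values are illustrative): the generic ACL switches are cleared so that only `spark.history.ui.acls.enable` governs access to the rendered UIs:
{{{
import org.apache.spark.SparkConf

val conf = new SparkConf()
  .set("spark.acls.enable", "true") // would otherwise gate the whole server
  .set("spark.history.ui.acls.enable", "true")
val securityManager = HistoryServer.createSecurityManager(conf)
assert(conf.get("spark.acls.enable") == "false")    // cleared by the helper
assert(conf.get("spark.ui.acls.enable") == "false") // likewise
}}}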
*/ private[history] class HistoryServerArguments(conf: SparkConf, args: Array[String]) extends Logging { diff --git a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala index 8c91aa15167c..4618e6117a4f 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala @@ -18,7 +18,7 @@ package org.apache.spark.deploy.master import java.text.SimpleDateFormat -import java.util.Date +import java.util.{Date, Locale} import java.util.concurrent.{ScheduledFuture, TimeUnit} import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet} @@ -51,7 +51,8 @@ private[deploy] class Master( private val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf) - private def createDateFormat = new SimpleDateFormat("yyyyMMddHHmmss") // For application IDs + // For application IDs + private def createDateFormat = new SimpleDateFormat("yyyyMMddHHmmss", Locale.US) private val WORKER_TIMEOUT_MS = conf.getLong("spark.worker.timeout", 60) * 1000 private val RETAINED_APPLICATIONS = conf.getInt("spark.deploy.retainedApplications", 200) diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala index 18cff3125d6b..57778701ff9e 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala @@ -33,7 +33,8 @@ private[ui] class ApplicationPage(parent: MasterWebUI) extends WebUIPage("app") /** Executor details for a particular application */ def render(request: HttpServletRequest): Seq[Node] = { - val appId = request.getParameter("appId") + // stripXSS is called first to remove suspicious characters used in XSS attacks + val appId = UIUtils.stripXSS(request.getParameter("appId")) val state = master.askWithRetry[MasterStateResponse](RequestMasterState) val app = state.activeApps.find(_.id == appId) .getOrElse(state.completedApps.find(_.id == appId).orNull) diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala index 3fb860582cc1..d26e99cda31a 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala @@ -57,8 +57,10 @@ private[ui] class MasterPage(parent: MasterWebUI) extends WebUIPage("") { private def handleKillRequest(request: HttpServletRequest, action: String => Unit): Unit = { if (parent.killEnabled && parent.master.securityMgr.checkModifyPermissions(request.getRemoteUser)) { - val killFlag = Option(request.getParameter("terminate")).getOrElse("false").toBoolean - val id = Option(request.getParameter("id")) + // stripXSS is called first to remove suspicious characters used in XSS attacks + val killFlag = + Option(UIUtils.stripXSS(request.getParameter("terminate"))).getOrElse("false").toBoolean + val id = Option(UIUtils.stripXSS(request.getParameter("id"))) if (id.isDefined && killFlag) { action(id.get) } diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala b/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala index e878c10183f6..58a181128eb4 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala @@ 
-57,7 +57,8 @@ private[deploy] class DriverRunner( @volatile private[worker] var finalException: Option[Exception] = None // Timeout to wait for when trying to terminate a driver. - private val DRIVER_TERMINATE_TIMEOUT_MS = 10 * 1000 + private val DRIVER_TERMINATE_TIMEOUT_MS = + conf.getTimeAsMs("spark.worker.driverTerminateTimeout", "10s") // Decoupled for testing def setClock(_clock: Clock): Unit = { diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala index 0bedd9a20a96..0940f3c55844 100755 --- a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala @@ -20,7 +20,7 @@ package org.apache.spark.deploy.worker import java.io.File import java.io.IOException import java.text.SimpleDateFormat -import java.util.{Date, UUID} +import java.util.{Date, Locale, UUID} import java.util.concurrent._ import java.util.concurrent.{Future => JFuture, ScheduledFuture => JScheduledFuture} @@ -68,7 +68,7 @@ private[deploy] class Worker( ThreadUtils.newDaemonSingleThreadExecutor("worker-cleanup-thread")) // For worker and executor IDs - private def createDateFormat = new SimpleDateFormat("yyyyMMddHHmmss") + private def createDateFormat = new SimpleDateFormat("yyyyMMddHHmmss", Locale.US) // Send a heartbeat every (heartbeat timeout) / 4 milliseconds private val HEARTBEAT_MILLIS = conf.getLong("spark.worker.timeout", 60) * 1000 / 4 @@ -187,8 +187,7 @@ private[deploy] class Worker( webUi = new WorkerWebUI(this, workDir, webUiPort) webUi.bind() - val scheme = if (webUi.sslOptions.enabled) "https" else "http" - workerWebUiUrl = s"$scheme://$publicAddress:${webUi.boundPort}" + workerWebUiUrl = s"http://$publicAddress:${webUi.boundPort}" registerWithMaster() metricsSystem.registerSource(workerSource) diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/ui/LogPage.scala b/core/src/main/scala/org/apache/spark/deploy/worker/ui/LogPage.scala index 465c214362b2..63c0d7d418b6 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/ui/LogPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/ui/LogPage.scala @@ -35,13 +35,16 @@ private[ui] class LogPage(parent: WorkerWebUI) extends WebUIPage("logPage") with private val supportedLogTypes = Set("stderr", "stdout") private val defaultBytes = 100 * 1024 + // stripXSS is called first to remove suspicious characters used in XSS attacks def renderLog(request: HttpServletRequest): String = { - val appId = Option(request.getParameter("appId")) - val executorId = Option(request.getParameter("executorId")) - val driverId = Option(request.getParameter("driverId")) - val logType = request.getParameter("logType") - val offset = Option(request.getParameter("offset")).map(_.toLong) - val byteLength = Option(request.getParameter("byteLength")).map(_.toInt).getOrElse(defaultBytes) + val appId = Option(UIUtils.stripXSS(request.getParameter("appId"))) + val executorId = Option(UIUtils.stripXSS(request.getParameter("executorId"))) + val driverId = Option(UIUtils.stripXSS(request.getParameter("driverId"))) + val logType = UIUtils.stripXSS(request.getParameter("logType")) + val offset = Option(UIUtils.stripXSS(request.getParameter("offset"))).map(_.toLong) + val byteLength = + Option(UIUtils.stripXSS(request.getParameter("byteLength"))).map(_.toInt) + .getOrElse(defaultBytes) val logDir = (appId, executorId, driverId) match { case (Some(a), Some(e), None) => @@ -57,13 +60,16 @@ private[ui] class 
LogPage(parent: WorkerWebUI) extends WebUIPage("logPage") with pre + logText } + // stripXSS is called first to remove suspicious characters used in XSS attacks def render(request: HttpServletRequest): Seq[Node] = { - val appId = Option(request.getParameter("appId")) - val executorId = Option(request.getParameter("executorId")) - val driverId = Option(request.getParameter("driverId")) - val logType = request.getParameter("logType") - val offset = Option(request.getParameter("offset")).map(_.toLong) - val byteLength = Option(request.getParameter("byteLength")).map(_.toInt).getOrElse(defaultBytes) + val appId = Option(UIUtils.stripXSS(request.getParameter("appId"))) + val executorId = Option(UIUtils.stripXSS(request.getParameter("executorId"))) + val driverId = Option(UIUtils.stripXSS(request.getParameter("driverId"))) + val logType = UIUtils.stripXSS(request.getParameter("logType")) + val offset = Option(UIUtils.stripXSS(request.getParameter("offset"))).map(_.toLong) + val byteLength = + Option(UIUtils.stripXSS(request.getParameter("byteLength"))).map(_.toInt) + .getOrElse(defaultBytes) val (logDir, params, pageName) = (appId, executorId, driverId) match { case (Some(a), Some(e), None) => diff --git a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala index 7eec4ae64f29..92a27902c669 100644 --- a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala +++ b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala @@ -200,8 +200,8 @@ private[spark] object CoarseGrainedExecutorBackend extends Logging { new SecurityManager(executorConf), clientMode = true) val driver = fetcher.setupEndpointRefByURI(driverUrl) - val props = driver.askWithRetry[Seq[(String, String)]](RetrieveSparkProps) ++ - Seq[(String, String)](("spark.app.id", appId)) + val cfg = driver.askWithRetry[SparkAppConfig](RetrieveSparkAppConfig) + val props = cfg.sparkProperties ++ Seq[(String, String)](("spark.app.id", appId)) fetcher.shutdown() // Create SparkEnv using properties we fetched from the driver. @@ -221,7 +221,7 @@ private[spark] object CoarseGrainedExecutorBackend extends Logging { } val env = SparkEnv.createExecutorEnv( - driverConf, executorId, hostname, port, cores, isLocal = false) + driverConf, executorId, hostname, port, cores, cfg.ioEncryptionKey, isLocal = false) env.rpcEnv.setupEndpoint("Executor", new CoarseGrainedExecutorBackend( env.rpcEnv, driverUrl, executorId, hostname, cores, userClassPath, env)) diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index 9501dd9cd8e9..3346f6dd1f97 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -84,6 +84,16 @@ private[spark] class Executor( // Start worker thread pool private val threadPool = ThreadUtils.newDaemonCachedThreadPool("Executor task launch worker") private val executorSource = new ExecutorSource(threadPool, executorId) + // Pool used for threads that supervise task killing / cancellation + private val taskReaperPool = ThreadUtils.newDaemonCachedThreadPool("Task reaper") + // For tasks which are in the process of being killed, this map holds the most recently created + // TaskReaper. 
All accesses to this map should be synchronized on the map itself (this isn't + // a ConcurrentHashMap because we use the synchronization for purposes other than simply guarding + // the integrity of the map's internal state). The purpose of this map is to prevent the creation + // of a separate TaskReaper for every killTask() of a given task. Instead, this map allows us to + // track whether an existing TaskReaper fulfills the role of a TaskReaper that we would otherwise + // create. The map key is a task id. + private val taskReaperForTask: HashMap[Long, TaskReaper] = HashMap[Long, TaskReaper]() if (!isLocal) { env.metricsSystem.registerSource(executorSource) @@ -93,6 +103,9 @@ private[spark] class Executor( // Whether to load classes in user jars before those in Spark jars private val userClassPathFirst = conf.getBoolean("spark.executor.userClassPathFirst", false) + // Whether to monitor killed / interrupted tasks + private val taskReaperEnabled = conf.getBoolean("spark.task.reaper.enabled", false) + // Create our ClassLoader // do this after SparkEnv creation so can access the SecurityManager private val urlClassLoader = createClassLoader() @@ -148,9 +161,27 @@ private[spark] class Executor( } def killTask(taskId: Long, interruptThread: Boolean): Unit = { - val tr = runningTasks.get(taskId) - if (tr != null) { - tr.kill(interruptThread) + val taskRunner = runningTasks.get(taskId) + if (taskRunner != null) { + if (taskReaperEnabled) { + val maybeNewTaskReaper: Option[TaskReaper] = taskReaperForTask.synchronized { + val shouldCreateReaper = taskReaperForTask.get(taskId) match { + case None => true + case Some(existingReaper) => interruptThread && !existingReaper.interruptThread + } + if (shouldCreateReaper) { + val taskReaper = new TaskReaper(taskRunner, interruptThread = interruptThread) + taskReaperForTask(taskId) = taskReaper + Some(taskReaper) + } else { + None + } + } + // Execute the TaskReaper from outside of the synchronized block. + maybeNewTaskReaper.foreach(taskReaperPool.execute) + } else { + taskRunner.kill(interruptThread = interruptThread) + } } } @@ -161,12 +192,7 @@ private[spark] class Executor( * @param interruptThread whether to interrupt the task thread */ def killAllTasks(interruptThread: Boolean) : Unit = { - // kill all the running tasks - for (taskRunner <- runningTasks.values().asScala) { - if (taskRunner != null) { - taskRunner.kill(interruptThread) - } - } + runningTasks.keys().asScala.foreach(t => killTask(t, interruptThread = interruptThread)) } def stop(): Unit = { @@ -192,13 +218,21 @@ private[spark] class Executor( serializedTask: ByteBuffer) extends Runnable { + val threadName = s"Executor task launch worker for task $taskId" + /** Whether this task has been killed. */ @volatile private var killed = false + @volatile private var threadId: Long = -1 + + def getThreadId: Long = threadId + /** Whether this task has been finished. */ @GuardedBy("TaskRunner.this") private var finished = false + def isFinished: Boolean = synchronized { finished } + /** How much the JVM process has spent in GC when the task starts to run. */ @volatile var startGCTime: Long = _ @@ -229,9 +263,15 @@ private[spark] class Executor( // ClosedByInterruptException during execBackend.statusUpdate which causes // Executor to crash Thread.interrupted() + // Notify any waiting TaskReapers. Generally there will only be one reaper per task but there + // is a rare corner-case where one task can have two reapers in case cancel(interrupt=False) + // is followed by cancel(interrupt=True). 
Thus we use notifyAll() to avoid a lost wakeup: + notifyAll() } override def run(): Unit = { + threadId = Thread.currentThread.getId + Thread.currentThread.setName(threadName) val threadMXBean = ManagementFactory.getThreadMXBean val taskMemoryManager = new TaskMemoryManager(env.memoryManager, taskId) val deserializeStartTime = System.currentTimeMillis() @@ -431,6 +471,117 @@ private[spark] class Executor( } } + /** + * Supervises the killing / cancellation of a task by sending the interrupted flag, optionally + * sending a Thread.interrupt(), and monitoring the task until it finishes. + * + * Spark's current task cancellation / task killing mechanism is "best effort" because some tasks + * may not be interruptible or may not respond to their "killed" flags being set. If a significant + * fraction of a cluster's task slots are occupied by tasks that have been marked as killed but + * remain running then this can lead to a situation where new jobs and tasks are starved of + * resources that are being used by these zombie tasks. + * + * The TaskReaper was introduced in SPARK-18761 as a mechanism to monitor and clean up zombie + * tasks. For backwards-compatibility / backportability this component is disabled by default + * and must be explicitly enabled by setting `spark.task.reaper.enabled=true`. + * + * A TaskReaper is created for a particular task when that task is killed / cancelled. Typically + * a task will have only one TaskReaper, but it's possible for a task to have up to two reapers + * in case kill is called twice with different values for the `interrupt` parameter. + * + * Once created, a TaskReaper will run until its supervised task has finished running. If the + * TaskReaper has not been configured to kill the JVM after a timeout (i.e. if + * `spark.task.reaper.killTimeout < 0`) then this implies that the TaskReaper may run indefinitely + * if the supervised task never exits. + */ + private class TaskReaper( + taskRunner: TaskRunner, + val interruptThread: Boolean) + extends Runnable { + + private[this] val taskId: Long = taskRunner.taskId + + private[this] val killPollingIntervalMs: Long = + conf.getTimeAsMs("spark.task.reaper.pollingInterval", "10s") + + private[this] val killTimeoutMs: Long = conf.getTimeAsMs("spark.task.reaper.killTimeout", "-1") + + private[this] val takeThreadDump: Boolean = + conf.getBoolean("spark.task.reaper.threadDump", true) + + override def run(): Unit = { + val startTimeMs = System.currentTimeMillis() + def elapsedTimeMs = System.currentTimeMillis() - startTimeMs + def timeoutExceeded(): Boolean = killTimeoutMs > 0 && elapsedTimeMs > killTimeoutMs + try { + // Only attempt to kill the task once. If interruptThread = false then a second kill + // attempt would be a no-op and if interruptThread = true then it may not be safe or + // effective to interrupt multiple times: + taskRunner.kill(interruptThread = interruptThread) + // Monitor the killed task until it exits. The synchronization logic here is complicated + // because we don't want to synchronize on the taskRunner while possibly taking a thread + // dump, but we also need to be careful to avoid races between checking whether the task + // has finished and wait()ing for it to finish. + var finished: Boolean = false + while (!finished && !timeoutExceeded()) { + taskRunner.synchronized { + // We need to synchronize on the TaskRunner while checking whether the task has + // finished in order to avoid a race where the task is marked as finished right after + // we check and before we call wait().
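The reaper is opt-in and is tuned entirely through the four properties read above. A minimal sketch of enabling it on a `SparkConf` (the values are illustrative, not recommendations):

```scala
import org.apache.spark.SparkConf

// Sketch: enable and tune the TaskReaper via the properties read above.
val conf = new SparkConf()
  .set("spark.task.reaper.enabled", "true")         // default: false
  .set("spark.task.reaper.pollingInterval", "10s")  // how often to re-check a killed task
  .set("spark.task.reaper.killTimeout", "120s")     // default "-1": never kill the JVM
  .set("spark.task.reaper.threadDump", "true")      // log a thread dump on each poll
```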
+ if (taskRunner.isFinished) { + finished = true + } else { + taskRunner.wait(killPollingIntervalMs) + } + } + if (taskRunner.isFinished) { + finished = true + } else { + logWarning(s"Killed task $taskId is still running after $elapsedTimeMs ms") + if (takeThreadDump) { + try { + Utils.getThreadDumpForThread(taskRunner.getThreadId).foreach { thread => + if (thread.threadName == taskRunner.threadName) { + logWarning(s"Thread dump from task $taskId:\n${thread.stackTrace}") + } + } + } catch { + case NonFatal(e) => + logWarning("Exception thrown while obtaining thread dump: ", e) + } + } + } + } + + if (!taskRunner.isFinished && timeoutExceeded()) { + if (isLocal) { + logError(s"Killed task $taskId could not be stopped within $killTimeoutMs ms; " + + "not killing JVM because we are running in local mode.") + } else { + // In non-local-mode, the exception thrown here will bubble up to the uncaught exception + // handler and cause the executor JVM to exit. + throw new SparkException( + s"Killing executor JVM because killed task $taskId could not be stopped within " + + s"$killTimeoutMs ms.") + } + } + } finally { + // Clean up entries in the taskReaperForTask map. + taskReaperForTask.synchronized { + taskReaperForTask.get(taskId).foreach { taskReaperInMap => + if (taskReaperInMap eq this) { + taskReaperForTask.remove(taskId) + } else { + // This must have been a TaskReaper where interruptThread == false where a subsequent + // killTask() call for the same task had interruptThread == true and overwrote the + // map entry. + } + } + } + } + } + } + /** * Create a ClassLoader for use in tasks, adding any JARs specified by the user or any classes * created by the interpreter to the search path diff --git a/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala b/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala index f66510b6f977..59404e08895a 100644 --- a/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala +++ b/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala @@ -27,6 +27,9 @@ import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext} import org.apache.hadoop.mapreduce.lib.input.{CombineFileInputFormat, CombineFileRecordReader, CombineFileSplit} +import org.apache.spark.internal.config +import org.apache.spark.SparkContext + /** * A general format for reading whole files in as streams, byte arrays, * or other functions to be added @@ -40,9 +43,14 @@ private[spark] abstract class StreamFileInputFormat[T] * Allow minPartitions set by end-user in order to keep compatibility with old Hadoop API * which is set through setMaxSplitSize */ - def setMinPartitions(context: JobContext, minPartitions: Int) { - val totalLen = listStatus(context).asScala.filterNot(_.isDirectory).map(_.getLen).sum - val maxSplitSize = math.ceil(totalLen / math.max(minPartitions, 1.0)).toLong + def setMinPartitions(sc: SparkContext, context: JobContext, minPartitions: Int) { + val defaultMaxSplitBytes = sc.getConf.get(config.FILES_MAX_PARTITION_BYTES) + val openCostInBytes = sc.getConf.get(config.FILES_OPEN_COST_IN_BYTES) + val defaultParallelism = sc.defaultParallelism + val files = listStatus(context).asScala + val totalBytes = files.filterNot(_.isDirectory).map(_.getLen + openCostInBytes).sum + val bytesPerCore = totalBytes / defaultParallelism + val maxSplitSize = Math.min(defaultMaxSplitBytes, Math.max(openCostInBytes, bytesPerCore)) super.setMaxSplitSize(maxSplitSize) } diff --git 
a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index 497ca92c7bc6..f4844dee62ef 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -198,12 +198,26 @@ package object config { .createWithDefault(0) private[spark] val DRIVER_BLOCK_MANAGER_PORT = ConfigBuilder("spark.driver.blockManager.port") - .doc("Port to use for the block managed on the driver.") + .doc("Port to use for the block manager on the driver.") .fallbackConf(BLOCK_MANAGER_PORT) private[spark] val IGNORE_CORRUPT_FILES = ConfigBuilder("spark.files.ignoreCorruptFiles") .doc("Whether to ignore corrupt files. If true, the Spark jobs will continue to run when " + - "encountering corrupt files and contents that have been read will still be returned.") + "encountering corrupted or non-existing files and contents that have been read will still " + + "be returned.") .booleanConf .createWithDefault(false) + + private[spark] val FILES_MAX_PARTITION_BYTES = ConfigBuilder("spark.files.maxPartitionBytes") + .doc("The maximum number of bytes to pack into a single partition when reading files.") + .longConf + .createWithDefault(128 * 1024 * 1024) + + private[spark] val FILES_OPEN_COST_IN_BYTES = ConfigBuilder("spark.files.openCostInBytes") + .doc("The estimated cost to open a file, measured by the number of bytes that could be scanned in" + + " the same time. This is used when putting multiple files into a partition. It's better to" + + " over-estimate; then the partitions with small files will be faster than partitions with" + + " bigger files.") + .longConf + .createWithDefault(4 * 1024 * 1024) } diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala new file mode 100644 index 000000000000..afd2250c93a8 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.internal.io + +import org.apache.hadoop.mapreduce._ + +import org.apache.spark.util.Utils + + +/** + * An interface to define how a single Spark job commits its outputs. Three notes: + * + * 1. Implementations must be serializable, as the committer instance instantiated on the driver + * will be used for tasks on executors. + * 2. Implementations should have a constructor with either 2 or 3 arguments: + * (jobId: String, path: String) or (jobId: String, path: String, isAppend: Boolean). + * 3. A committer should not be reused across multiple Spark jobs. + * + * The proper call sequence is: + * + * 1.
Driver calls setupJob. + * 2. As part of each task's execution, executor calls setupTask and then commitTask + * (or abortTask if task failed). + * 3. When all necessary tasks have completed successfully, the driver calls commitJob. If the job + * failed to execute (e.g. too many failed tasks), the job should call abortJob. + */ +abstract class FileCommitProtocol { + import FileCommitProtocol._ + + /** + * Sets up a job. Must be called on the driver before any other methods can be invoked. + */ + def setupJob(jobContext: JobContext): Unit + + /** + * Commits a job after the writes succeed. Must be called on the driver. + */ + def commitJob(jobContext: JobContext, taskCommits: Seq[TaskCommitMessage]): Unit + + /** + * Aborts a job after the writes fail. Must be called on the driver. + * + * Calling this function is a best-effort attempt, because it is possible that the driver + * just crashes (or is killed) before it can call abort. + */ + def abortJob(jobContext: JobContext): Unit + + /** + * Sets up a task within a job. + * Must be called before any other task-related methods can be invoked. + */ + def setupTask(taskContext: TaskAttemptContext): Unit + + /** + * Notifies the commit protocol to add a new file, and gets back the full path that should be + * used. Must be called on the executors when running tasks. + * + * Note that the returned temp file may have an arbitrary path. The commit protocol only + * promises that the file will be at the location specified by the arguments after job commit. + * + * A full file path consists of the following parts: + * 1. the base path + * 2. some sub-directory within the base path, used to specify partitioning + * 3. file prefix, usually some unique job id with the task id + * 4. bucket id + * 5. source specific file extension, e.g. ".snappy.parquet" + * + * The "dir" parameter specifies 2, and "ext" parameter specifies both 4 and 5, and the rest + * are left to the commit protocol implementation to decide. + * + * Important: it is the caller's responsibility to add uniquely identifying content to "ext" + * if a task is going to write out multiple files to the same dir. The file commit protocol only + * guarantees that files written by different tasks will not conflict. + */ + def newTaskTempFile(taskContext: TaskAttemptContext, dir: Option[String], ext: String): String + + /** + * Similar to newTaskTempFile(), but allows files to be committed to an absolute output location. + * Depending on the implementation, there may be weaker guarantees around adding files this way. + * + * Important: it is the caller's responsibility to add uniquely identifying content to "ext" + * if a task is going to write out multiple files to the same dir. The file commit protocol only + * guarantees that files written by different tasks will not conflict. + */ + def newTaskTempFileAbsPath( + taskContext: TaskAttemptContext, absoluteDir: String, ext: String): String + + /** + * Commits a task after the writes succeed. Must be called on the executors when running tasks. + */ + def commitTask(taskContext: TaskAttemptContext): TaskCommitMessage + + /** + * Aborts a task after the writes have failed. Must be called on the executors when running tasks. + * + * Calling this function is a best-effort attempt, because it is possible that the executor + * just crashes (or is killed) before it can call abort.
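Read together, the contract above amounts to the following choreography. A sketch only, assuming a concrete `committer: FileCommitProtocol` plus Hadoop `jobCtx: JobContext` and `taskCtx: TaskAttemptContext` values in scope (the partition directory and extension are illustrative):

```scala
// Sketch of the documented call sequence; jobCtx, taskCtx and committer assumed.
committer.setupJob(jobCtx)                                 // 1. driver, before anything else
committer.setupTask(taskCtx)                               // 2. per task, on executors
val file = committer.newTaskTempFile(taskCtx, Some("dt=2016-11-01"), ".snappy.parquet")
val msg  = committer.commitTask(taskCtx)                   //    or abortTask(taskCtx) on failure
committer.commitJob(jobCtx, Seq(msg))                      // 3. driver, after all tasks succeed
```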
+ */ + def abortTask(taskContext: TaskAttemptContext): Unit +} + + +object FileCommitProtocol { + class TaskCommitMessage(val obj: Any) extends Serializable + + object EmptyTaskCommitMessage extends TaskCommitMessage(null) + + /** + * Instantiates a FileCommitProtocol using the given className. + */ + def instantiate(className: String, jobId: String, outputPath: String, isAppend: Boolean) + : FileCommitProtocol = { + val clazz = Utils.classForName(className).asInstanceOf[Class[FileCommitProtocol]] + + // First try the one with argument (jobId: String, outputPath: String, isAppend: Boolean). + // If that doesn't exist, try the one with (jobId: String, outputPath: String). + try { + val ctor = clazz.getDeclaredConstructor(classOf[String], classOf[String], classOf[Boolean]) + ctor.newInstance(jobId, outputPath, isAppend.asInstanceOf[java.lang.Boolean]) + } catch { + case _: NoSuchMethodException => + val ctor = clazz.getDeclaredConstructor(classOf[String], classOf[String]) + ctor.newInstance(jobId, outputPath) + } + } +} diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala new file mode 100644 index 000000000000..c99b75e52325 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.internal.io + +import java.util.{Date, UUID} + +import scala.collection.mutable + +import org.apache.hadoop.fs.Path +import org.apache.hadoop.mapreduce._ +import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter +import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl + +import org.apache.spark.SparkHadoopWriter +import org.apache.spark.internal.Logging +import org.apache.spark.mapred.SparkHadoopMapRedUtil + +/** + * A [[FileCommitProtocol]] implementation backed by an underlying Hadoop OutputCommitter + * (from the newer mapreduce API, not the old mapred API). + * + * Unlike Hadoop's OutputCommitter, this implementation is serializable. + */ +class HadoopMapReduceCommitProtocol(jobId: String, path: String) + extends FileCommitProtocol with Serializable with Logging { + + import FileCommitProtocol._ + + /** OutputCommitter from Hadoop is not serializable so marking it transient. */ + @transient private var committer: OutputCommitter = _ + + /** + * Tracks files staged by this task for absolute output paths. These outputs are not managed by + * the Hadoop OutputCommitter, so we must move these to their final locations on job commit. + * + * The mapping is from the temp output path to the final desired output path of the file.
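The reflection in `instantiate` above means a custom protocol only has to expose one of the two documented constructor shapes. A hypothetical subclass relying on the 2-argument fallback:

```scala
// Hypothetical: MyProtocol has no (jobId, path, isAppend) constructor, so
// instantiate() falls back to the (jobId, path) one.
class MyProtocol(jobId: String, path: String)
  extends HadoopMapReduceCommitProtocol(jobId, path)

val committer = FileCommitProtocol.instantiate(
  classOf[MyProtocol].getName, jobId = "0", outputPath = "/tmp/out", isAppend = false)
```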
+ */ + @transient private var addedAbsPathFiles: mutable.Map[String, String] = null + + /** + * The staging directory for all files committed with absolute output paths. + */ + private def absPathStagingDir: Path = new Path(path, "_temporary-" + jobId) + + protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { + context.getOutputFormatClass.newInstance().getOutputCommitter(context) + } + + override def newTaskTempFile( + taskContext: TaskAttemptContext, dir: Option[String], ext: String): String = { + val filename = getFilename(taskContext, ext) + + val stagingDir: String = committer match { + // For FileOutputCommitter it has its own staging path called "work path". + case f: FileOutputCommitter => Option(f.getWorkPath.toString).getOrElse(path) + case _ => path + } + + dir.map { d => + new Path(new Path(stagingDir, d), filename).toString + }.getOrElse { + new Path(stagingDir, filename).toString + } + } + + override def newTaskTempFileAbsPath( + taskContext: TaskAttemptContext, absoluteDir: String, ext: String): String = { + val filename = getFilename(taskContext, ext) + val absOutputPath = new Path(absoluteDir, filename).toString + + // Include a UUID here to prevent file collisions for one task writing to different dirs. + // In principle we could include hash(absoluteDir) instead but this is simpler. + val tmpOutputPath = new Path( + absPathStagingDir, UUID.randomUUID().toString() + "-" + filename).toString + + addedAbsPathFiles(tmpOutputPath) = absOutputPath + tmpOutputPath + } + + private def getFilename(taskContext: TaskAttemptContext, ext: String): String = { + // The file name looks like part-r-00000-2dd664f9-d2c4-4ffe-878f-c6c70c1fb0cb_00003.gz.parquet + // Note that %05d does not truncate the split number, so if we have more than 100000 tasks, + // the file name is fine and won't overflow. 
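For concreteness, the naming scheme described in the comment above, evaluated with illustrative values (the job id and extension are made up):

```scala
// Worked example of the f-interpolated name built just below.
val jobId = "2dd664f9-d2c4-4ffe-878f-c6c70c1fb0cb"
val split = 3
val ext = ".gz.parquet"
f"part-$split%05d-$jobId$ext"
// => part-00003-2dd664f9-d2c4-4ffe-878f-c6c70c1fb0cb.gz.parquet
// %05d pads rather than truncates, so split = 123456 still yields part-123456-...
```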
+ val split = taskContext.getTaskAttemptID.getTaskID.getId + f"part-$split%05d-$jobId$ext" + } + + override def setupJob(jobContext: JobContext): Unit = { + // Setup IDs + val jobId = SparkHadoopWriter.createJobID(new Date, 0) + val taskId = new TaskID(jobId, TaskType.MAP, 0) + val taskAttemptId = new TaskAttemptID(taskId, 0) + + // Set up the configuration object + jobContext.getConfiguration.set("mapred.job.id", jobId.toString) + jobContext.getConfiguration.set("mapred.tip.id", taskAttemptId.getTaskID.toString) + jobContext.getConfiguration.set("mapred.task.id", taskAttemptId.toString) + jobContext.getConfiguration.setBoolean("mapred.task.is.map", true) + jobContext.getConfiguration.setInt("mapred.task.partition", 0) + + val taskAttemptContext = new TaskAttemptContextImpl(jobContext.getConfiguration, taskAttemptId) + committer = setupCommitter(taskAttemptContext) + committer.setupJob(jobContext) + } + + override def commitJob(jobContext: JobContext, taskCommits: Seq[TaskCommitMessage]): Unit = { + committer.commitJob(jobContext) + val filesToMove = taskCommits.map(_.obj.asInstanceOf[Map[String, String]]) + .foldLeft(Map[String, String]())(_ ++ _) + logDebug(s"Committing files staged for absolute locations $filesToMove") + val fs = absPathStagingDir.getFileSystem(jobContext.getConfiguration) + for ((src, dst) <- filesToMove) { + fs.rename(new Path(src), new Path(dst)) + } + fs.delete(absPathStagingDir, true) + } + + override def abortJob(jobContext: JobContext): Unit = { + committer.abortJob(jobContext, JobStatus.State.FAILED) + val fs = absPathStagingDir.getFileSystem(jobContext.getConfiguration) + fs.delete(absPathStagingDir, true) + } + + override def setupTask(taskContext: TaskAttemptContext): Unit = { + committer = setupCommitter(taskContext) + committer.setupTask(taskContext) + addedAbsPathFiles = mutable.Map[String, String]() + } + + override def commitTask(taskContext: TaskAttemptContext): TaskCommitMessage = { + val attemptId = taskContext.getTaskAttemptID + SparkHadoopMapRedUtil.commitTask( + committer, taskContext, attemptId.getJobID.getId, attemptId.getTaskID.getId) + new TaskCommitMessage(addedAbsPathFiles.toMap) + } + + override def abortTask(taskContext: TaskAttemptContext): Unit = { + committer.abortTask(taskContext) + // best effort cleanup of other staged files + for ((src, _) <- addedAbsPathFiles) { + val tmp = new Path(src) + tmp.getFileSystem(taskContext.getConfiguration).delete(tmp, false) + } + } +} diff --git a/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala b/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala index ae014becef75..2e991ce394c4 100644 --- a/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala +++ b/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala @@ -32,9 +32,8 @@ import org.apache.spark.util.Utils * CompressionCodec allows the customization of choosing different compression implementations * to be used in block storage. * - * Note: The wire protocol for a codec is not guaranteed compatible across versions of Spark. - * This is intended for use as an internal compression utility within a single - * Spark application. + * @note The wire protocol for a codec is not guaranteed compatible across versions of Spark. + * This is intended for use as an internal compression utility within a single Spark application. */ @DeveloperApi trait CompressionCodec { @@ -103,9 +102,9 @@ private[spark] object CompressionCodec { * LZ4 implementation of [[org.apache.spark.io.CompressionCodec]]. 
* Block size can be configured by `spark.io.compression.lz4.blockSize`. * - * Note: The wire protocol for this codec is not guaranteed to be compatible across versions - * of Spark. This is intended for use as an internal compression utility within a single Spark - * application. + * @note The wire protocol for this codec is not guaranteed to be compatible across versions + * of Spark. This is intended for use as an internal compression utility within a single Spark + * application. */ @DeveloperApi class LZ4CompressionCodec(conf: SparkConf) extends CompressionCodec { @@ -123,9 +122,9 @@ class LZ4CompressionCodec(conf: SparkConf) extends CompressionCodec { * :: DeveloperApi :: * LZF implementation of [[org.apache.spark.io.CompressionCodec]]. * - * Note: The wire protocol for this codec is not guaranteed to be compatible across versions - * of Spark. This is intended for use as an internal compression utility within a single Spark - * application. + * @note The wire protocol for this codec is not guaranteed to be compatible across versions + * of Spark. This is intended for use as an internal compression utility within a single Spark + * application. */ @DeveloperApi class LZFCompressionCodec(conf: SparkConf) extends CompressionCodec { @@ -143,9 +142,9 @@ class LZFCompressionCodec(conf: SparkConf) extends CompressionCodec { * Snappy implementation of [[org.apache.spark.io.CompressionCodec]]. * Block size can be configured by `spark.io.compression.snappy.blockSize`. * - * Note: The wire protocol for this codec is not guaranteed to be compatible across versions - * of Spark. This is intended for use as an internal compression utility within a single Spark - * application. + * @note The wire protocol for this codec is not guaranteed to be compatible across versions + * of Spark. This is intended for use as an internal compression utility within a single Spark + * application. */ @DeveloperApi class SnappyCompressionCodec(conf: SparkConf) extends CompressionCodec { @@ -173,7 +172,7 @@ private final object SnappyCompressionCodec { } /** - * Wrapper over [[SnappyOutputStream]] which guards against write-after-close and double-close + * Wrapper over `SnappyOutputStream` which guards against write-after-close and double-close * issues. See SPARK-7660 for more details. This wrapping can be removed if we upgrade to a version * of snappy-java that contains the fix for https://github.com/xerial/snappy-java/issues/107. */ diff --git a/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala b/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala index 3f7cfd9d2c11..99ec78633ab7 100644 --- a/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala +++ b/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala @@ -85,6 +85,17 @@ object HiveCatalogMetrics extends Source { */ val METRIC_FILE_CACHE_HITS = metricRegistry.counter(MetricRegistry.name("fileCacheHits")) + /** + * Tracks the total number of Hive client calls (e.g. to lookup a table). + */ + val METRIC_HIVE_CLIENT_CALLS = metricRegistry.counter(MetricRegistry.name("hiveClientCalls")) + + /** + * Tracks the total number of Spark jobs launched for parallel file listing. + */ + val METRIC_PARALLEL_LISTING_JOB_COUNT = metricRegistry.counter( + MetricRegistry.name("parallelListingJobCount")) + /** * Resets the values of all metrics to zero. This is useful in tests. 
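The two new counters follow the same pattern as the existing ones; a sketch of how a caller and a test assertion might use the static helpers (usage illustrative, not from the patch):

```scala
import org.apache.spark.metrics.source.HiveCatalogMetrics

// Illustrative: callers go through the static helpers to stay decoupled from
// the codahale classes; tests can read the counters back directly.
HiveCatalogMetrics.incrementHiveClientCalls(1)
HiveCatalogMetrics.incrementParallelListingJobCount(1)
assert(HiveCatalogMetrics.METRIC_HIVE_CLIENT_CALLS.getCount() == 1)
```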
*/ @@ -92,10 +103,14 @@ object HiveCatalogMetrics extends Source { METRIC_PARTITIONS_FETCHED.dec(METRIC_PARTITIONS_FETCHED.getCount()) METRIC_FILES_DISCOVERED.dec(METRIC_FILES_DISCOVERED.getCount()) METRIC_FILE_CACHE_HITS.dec(METRIC_FILE_CACHE_HITS.getCount()) + METRIC_HIVE_CLIENT_CALLS.dec(METRIC_HIVE_CLIENT_CALLS.getCount()) + METRIC_PARALLEL_LISTING_JOB_COUNT.dec(METRIC_PARALLEL_LISTING_JOB_COUNT.getCount()) } // clients can use these to avoid classloader issues with the codahale classes def incrementFetchedPartitions(n: Int): Unit = METRIC_PARTITIONS_FETCHED.inc(n) def incrementFilesDiscovered(n: Int): Unit = METRIC_FILES_DISCOVERED.inc(n) def incrementFileCacheHits(n: Int): Unit = METRIC_FILE_CACHE_HITS.inc(n) + def incrementHiveClientCalls(n: Int): Unit = METRIC_HIVE_CLIENT_CALLS.inc(n) + def incrementParallelListingJobCount(n: Int): Unit = METRIC_PARALLEL_LISTING_JOB_COUNT.inc(n) } diff --git a/core/src/main/scala/org/apache/spark/network/BlockDataManager.scala b/core/src/main/scala/org/apache/spark/network/BlockDataManager.scala index 8f83668d7902..b3f8bfe8b1d4 100644 --- a/core/src/main/scala/org/apache/spark/network/BlockDataManager.scala +++ b/core/src/main/scala/org/apache/spark/network/BlockDataManager.scala @@ -46,5 +46,5 @@ trait BlockDataManager { /** * Release locks acquired by [[putBlockData()]] and [[getBlockData()]]. */ - def releaseLock(blockId: BlockId): Unit + def releaseLock(blockId: BlockId, taskAttemptId: Option[Long]): Unit } diff --git a/core/src/main/scala/org/apache/spark/partial/BoundedDouble.scala b/core/src/main/scala/org/apache/spark/partial/BoundedDouble.scala index ab6aba6fc7d6..8f579c5a3033 100644 --- a/core/src/main/scala/org/apache/spark/partial/BoundedDouble.scala +++ b/core/src/main/scala/org/apache/spark/partial/BoundedDouble.scala @@ -28,7 +28,7 @@ class BoundedDouble(val mean: Double, val confidence: Double, val low: Double, v this.mean.hashCode ^ this.confidence.hashCode ^ this.low.hashCode ^ this.high.hashCode /** - * Note that consistent with Double, any NaN value will make equality false + * @note Consistent with Double, any NaN value will make equality false */ override def equals(that: Any): Boolean = that match { diff --git a/core/src/main/scala/org/apache/spark/rdd/BinaryFileRDD.scala b/core/src/main/scala/org/apache/spark/rdd/BinaryFileRDD.scala index 41832e835474..50d977a92da5 100644 --- a/core/src/main/scala/org/apache/spark/rdd/BinaryFileRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/BinaryFileRDD.scala @@ -26,7 +26,7 @@ import org.apache.spark.{Partition, SparkContext} import org.apache.spark.input.StreamFileInputFormat private[spark] class BinaryFileRDD[T]( - sc: SparkContext, + @transient private val sc: SparkContext, inputFormatClass: Class[_ <: StreamFileInputFormat[T]], keyClass: Class[String], valueClass: Class[T], @@ -43,7 +43,7 @@ private[spark] class BinaryFileRDD[T]( case _ => } val jobContext = new JobContextImpl(conf, jobId) - inputFormat.setMinPartitions(jobContext, minPartitions) + inputFormat.setMinPartitions(sc, jobContext, minPartitions) val rawSplits = inputFormat.getSplits(jobContext).toArray val result = new Array[Partition](rawSplits.size) for (i <- 0 until rawSplits.size) { diff --git a/core/src/main/scala/org/apache/spark/rdd/CoGroupedRDD.scala b/core/src/main/scala/org/apache/spark/rdd/CoGroupedRDD.scala index 2381f54ee3f0..a091f06b4ed7 100644 --- a/core/src/main/scala/org/apache/spark/rdd/CoGroupedRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/CoGroupedRDD.scala @@ -66,14 +66,14 @@ 
private[spark] class CoGroupPartition( /** * :: DeveloperApi :: - * A RDD that cogroups its parents. For each key k in parent RDDs, the resulting RDD contains a + * An RDD that cogroups its parents. For each key k in parent RDDs, the resulting RDD contains a * tuple with the list of values for that key. * - * Note: This is an internal API. We recommend users use RDD.cogroup(...) instead of - * instantiating this directly. - * * @param rdds parent RDDs. * @param part partitioner used to partition the shuffle output + * + * @note This is an internal API. We recommend users use RDD.cogroup(...) instead of + * instantiating this directly. */ @DeveloperApi class CoGroupedRDD[K: ClassTag]( diff --git a/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala index a05a770b40c5..14331dfd0c98 100644 --- a/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala @@ -152,13 +152,13 @@ class DoubleRDDFunctions(self: RDD[Double]) extends Logging with Serializable { /** * Compute a histogram using the provided buckets. The buckets are all open - * to the right except for the last which is closed + * to the right except for the last which is closed. * e.g. for the array * [1, 10, 20, 50] the buckets are [1, 10) [10, 20) [20, 50] - * e.g 1<=x<10 , 10<=x<20, 20<=x<=50 + * e.g. {@code 1<=x<10, 10<=x<20, 20<=x<=50} * And on the input of 1 and 50 we would have a histogram of 1, 0, 1 * - * Note: if your histogram is evenly spaced (e.g. [0, 10, 20, 30]) this can be switched + * @note If your histogram is evenly spaced (e.g. [0, 10, 20, 30]) this can be switched * from an O(log n) insertion to O(1) per element. (where n = # buckets) if you set evenBuckets * to true. * buckets must be sorted and not contain any duplicates. diff --git a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala index e1cf3938de09..b56ebf4df06e 100644 --- a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala @@ -19,7 +19,7 @@ package org.apache.spark.rdd import java.io.IOException import java.text.SimpleDateFormat -import java.util.Date +import java.util.{Date, Locale} import scala.collection.immutable.Map import scala.reflect.ClassTag @@ -84,9 +84,6 @@ private[spark] class HadoopPartition(rddId: Int, override val index: Int, s: Inp * An RDD that provides core functionality for reading data stored in Hadoop (e.g., files in HDFS, * sources in HBase, or S3), using the older MapReduce API (`org.apache.hadoop.mapred`). * - * Note: Instantiating this class directly is not recommended, please use - * [[org.apache.spark.SparkContext.hadoopRDD()]] - * * @param sc The SparkContext to associate the RDD with. * @param broadcastedConf A general Hadoop Configuration, or a subclass of it. If the enclosed * variable references an instance of JobConf, then that JobConf will be used for the Hadoop job. @@ -97,6 +94,9 @@ private[spark] class HadoopPartition(rddId: Int, override val index: Int, s: Inp * @param keyClass Class of the key associated with the inputFormatClass. * @param valueClass Class of the value associated with the inputFormatClass. * @param minPartitions Minimum number of HadoopRDD partitions (Hadoop Splits) to generate.
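The relocated @note just below this parameter list points users at the `SparkContext` entry point instead of the constructor. A minimal sketch of that route (input path and formats are illustrative; `sc` is an existing `SparkContext`):

```scala
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.{FileInputFormat, JobConf, TextInputFormat}

// Illustrative: prefer sc.hadoopRDD(...) over instantiating HadoopRDD directly.
val jobConf = new JobConf(sc.hadoopConfiguration)
FileInputFormat.setInputPaths(jobConf, "/data/input")
val lines = sc.hadoopRDD(jobConf, classOf[TextInputFormat],
  classOf[LongWritable], classOf[Text], minPartitions = 4)
```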
+ * + * @note Instantiating this class directly is not recommended, please use + * `org.apache.spark.SparkContext.hadoopRDD()` */ @DeveloperApi class HadoopRDD[K, V]( @@ -210,12 +210,12 @@ class HadoopRDD[K, V]( override def compute(theSplit: Partition, context: TaskContext): InterruptibleIterator[(K, V)] = { val iter = new NextIterator[(K, V)] { - val split = theSplit.asInstanceOf[HadoopPartition] + private val split = theSplit.asInstanceOf[HadoopPartition] logInfo("Input split: " + split.inputSplit) - val jobConf = getJobConf() + private val jobConf = getJobConf() - val inputMetrics = context.taskMetrics().inputMetrics - val existingBytesRead = inputMetrics.bytesRead + private val inputMetrics = context.taskMetrics().inputMetrics + private val existingBytesRead = inputMetrics.bytesRead // Sets the thread local variable for the file's name split.inputSplit.value match { @@ -225,7 +225,7 @@ class HadoopRDD[K, V]( // Find a function that will return the FileSystem bytes read by this thread. Do this before // creating RecordReader, because RecordReader's constructor might read some bytes - val getBytesReadCallback: Option[() => Long] = split.inputSplit.value match { + private val getBytesReadCallback: Option[() => Long] = split.inputSplit.value match { case _: FileSplit | _: CombineFileSplit => SparkHadoopUtil.get.getFSBytesReadOnThreadCallback() case _ => None @@ -235,28 +235,39 @@ class HadoopRDD[K, V]( // If we do a coalesce, however, we are likely to compute multiple partitions in the same // task and in the same thread, in which case we need to avoid override values written by // previous partitions (SPARK-13071). - def updateBytesRead(): Unit = { + private def updateBytesRead(): Unit = { getBytesReadCallback.foreach { getBytesRead => inputMetrics.setBytesRead(existingBytesRead + getBytesRead()) } } - var reader: RecordReader[K, V] = null - val inputFormat = getInputFormat(jobConf) - HadoopRDD.addLocalConfiguration(new SimpleDateFormat("yyyyMMddHHmmss").format(createTime), + private var reader: RecordReader[K, V] = null + private val inputFormat = getInputFormat(jobConf) + HadoopRDD.addLocalConfiguration( + new SimpleDateFormat("yyyyMMddHHmmss", Locale.US).format(createTime), context.stageId, theSplit.index, context.attemptNumber, jobConf) - reader = inputFormat.getRecordReader(split.inputSplit.value, jobConf, Reporter.NULL) + reader = + try { + inputFormat.getRecordReader(split.inputSplit.value, jobConf, Reporter.NULL) + } catch { + case e: IOException if ignoreCorruptFiles => + logWarning(s"Skipped the rest content in the corrupted file: ${split.inputSplit}", e) + finished = true + null + } // Register an on-task-completion callback to close the input stream. 
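The lenient record-reader construction above is gated by the `spark.files.ignoreCorruptFiles` flag defined earlier in this patch (default false). A sketch of opting in:

```scala
import org.apache.spark.SparkConf

// Sketch: with the flag set, a corrupt split is logged and skipped
// (finished = true) instead of failing the task.
val conf = new SparkConf().set("spark.files.ignoreCorruptFiles", "true")
```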
context.addTaskCompletionListener{ context => closeIfNeeded() } - val key: K = reader.createKey() - val value: V = reader.createValue() + private val key: K = if (reader == null) null.asInstanceOf[K] else reader.createKey() + private val value: V = if (reader == null) null.asInstanceOf[V] else reader.createValue() override def getNext(): (K, V) = { try { finished = !reader.next(key, value) } catch { - case e: IOException if ignoreCorruptFiles => finished = true + case e: IOException if ignoreCorruptFiles => + logWarning(s"Skipped the rest content in the corrupted file: ${split.inputSplit}", e) + finished = true } if (!finished) { inputMetrics.incRecordsRead(1) diff --git a/core/src/main/scala/org/apache/spark/rdd/InputFileNameHolder.scala b/core/src/main/scala/org/apache/spark/rdd/InputFileNameHolder.scala index f40d4c8e0a4d..960c91a154db 100644 --- a/core/src/main/scala/org/apache/spark/rdd/InputFileNameHolder.scala +++ b/core/src/main/scala/org/apache/spark/rdd/InputFileNameHolder.scala @@ -22,6 +22,8 @@ import org.apache.spark.unsafe.types.UTF8String /** * This holds file names of the current Spark task. This is used in HadoopRDD, * FileScanRDD, NewHadoopRDD and InputFileName function in Spark SQL. + * + * The returned value should never be null; it should be an empty string when the name is unknown. */ private[spark] object InputFileNameHolder { /** @@ -32,9 +34,15 @@ private[spark] object InputFileNameHolder { override protected def initialValue(): UTF8String = UTF8String.fromString("") } + /** + * Returns the current file name, or an empty string if it is unknown. + */ def getInputFileName(): UTF8String = inputFileName.get() - private[spark] def setInputFileName(file: String) = inputFileName.set(UTF8String.fromString(file)) + private[spark] def setInputFileName(file: String) = { + require(file != null, "The input file name cannot be null") + inputFileName.set(UTF8String.fromString(file)) + } private[spark] def unsetInputFileName(): Unit = inputFileName.remove() diff --git a/core/src/main/scala/org/apache/spark/rdd/JdbcRDD.scala b/core/src/main/scala/org/apache/spark/rdd/JdbcRDD.scala index 0970b9807167..aab46b8954bf 100644 --- a/core/src/main/scala/org/apache/spark/rdd/JdbcRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/JdbcRDD.scala @@ -41,7 +41,10 @@ private[spark] class JdbcPartition(idx: Int, val lower: Long, val upper: Long) e * The RDD takes care of closing the connection. * @param sql the text of the query.
* The query must contain two ? placeholders for parameters used to partition the results. - * E.g. "select title, author from books where ? <= id and id <= ?" + * For example, + * {{{ + * select title, author from books where ? <= id and id <= ? + * }}} * @param lowerBound the minimum value of the first placeholder * @param upperBound the maximum value of the second placeholder * The lower and upper bounds are inclusive. diff --git a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala index baf31fb65887..6168d979032a 100644 --- a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala @@ -19,7 +19,7 @@ package org.apache.spark.rdd import java.io.IOException import java.text.SimpleDateFormat -import java.util.Date +import java.util.{Date, Locale} import scala.reflect.ClassTag @@ -57,13 +57,13 @@ private[spark] class NewHadoopPartition( * An RDD that provides core functionality for reading data stored in Hadoop (e.g., files in HDFS, * sources in HBase, or S3), using the new MapReduce API (`org.apache.hadoop.mapreduce`). * - * Note: Instantiating this class directly is not recommended, please use - * [[org.apache.spark.SparkContext.newAPIHadoopRDD()]] - * * @param sc The SparkContext to associate the RDD with. * @param inputFormatClass Storage format of the data to be read. * @param keyClass Class of the key associated with the inputFormatClass. * @param valueClass Class of the value associated with the inputFormatClass. + * + * @note Instantiating this class directly is not recommended, please use + * `org.apache.spark.SparkContext.newAPIHadoopRDD()` */ @DeveloperApi class NewHadoopRDD[K, V]( @@ -79,7 +79,7 @@ class NewHadoopRDD[K, V]( // private val serializableConf = new SerializableWritable(_conf) private val jobTrackerId: String = { - val formatter = new SimpleDateFormat("yyyyMMddHHmmss") + val formatter = new SimpleDateFormat("yyyyMMddHHmmss", Locale.US) formatter.format(new Date()) } @@ -132,12 +132,12 @@ class NewHadoopRDD[K, V]( override def compute(theSplit: Partition, context: TaskContext): InterruptibleIterator[(K, V)] = { val iter = new Iterator[(K, V)] { - val split = theSplit.asInstanceOf[NewHadoopPartition] + private val split = theSplit.asInstanceOf[NewHadoopPartition] logInfo("Input split: " + split.serializableHadoopSplit) - val conf = getConf + private val conf = getConf - val inputMetrics = context.taskMetrics().inputMetrics - val existingBytesRead = inputMetrics.bytesRead + private val inputMetrics = context.taskMetrics().inputMetrics + private val existingBytesRead = inputMetrics.bytesRead // Sets the thread local variable for the file's name split.serializableHadoopSplit.value match { @@ -147,46 +147,62 @@ class NewHadoopRDD[K, V]( // Find a function that will return the FileSystem bytes read by this thread. Do this before // creating RecordReader, because RecordReader's constructor might read some bytes - val getBytesReadCallback: Option[() => Long] = split.serializableHadoopSplit.value match { - case _: FileSplit | _: CombineFileSplit => - SparkHadoopUtil.get.getFSBytesReadOnThreadCallback() - case _ => None - } + private val getBytesReadCallback: Option[() => Long] = + split.serializableHadoopSplit.value match { + case _: FileSplit | _: CombineFileSplit => + SparkHadoopUtil.get.getFSBytesReadOnThreadCallback() + case _ => None + } // For Hadoop 2.5+, we get our input bytes from thread-local Hadoop FileSystem statistics. 
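Stepping back to the `JdbcRDD` contract spelled out above: the two `?` placeholders receive each partition's bounds. A self-contained sketch, with the JDBC URL and schema purely illustrative:

```scala
import java.sql.DriverManager
import org.apache.spark.rdd.JdbcRDD

// Illustrative: the two '?' placeholders are bound to each partition's id range.
val books = new JdbcRDD(
  sc,
  () => DriverManager.getConnection("jdbc:h2:mem:testdb"),
  "select title, author from books where ? <= id and id <= ?",
  lowerBound = 1, upperBound = 1000, numPartitions = 10,
  rs => (rs.getString(1), rs.getString(2)))
```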
// If we do a coalesce, however, we are likely to compute multiple partitions in the same // task and in the same thread, in which case we need to avoid override values written by // previous partitions (SPARK-13071). - def updateBytesRead(): Unit = { + private def updateBytesRead(): Unit = { getBytesReadCallback.foreach { getBytesRead => inputMetrics.setBytesRead(existingBytesRead + getBytesRead()) } } - val format = inputFormatClass.newInstance + private val format = inputFormatClass.newInstance format match { case configurable: Configurable => configurable.setConf(conf) case _ => } - val attemptId = new TaskAttemptID(jobTrackerId, id, TaskType.MAP, split.index, 0) - val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId) - private var reader = format.createRecordReader( - split.serializableHadoopSplit.value, hadoopAttemptContext) - reader.initialize(split.serializableHadoopSplit.value, hadoopAttemptContext) + private val attemptId = new TaskAttemptID(jobTrackerId, id, TaskType.MAP, split.index, 0) + private val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId) + private var finished = false + private var reader = + try { + val _reader = format.createRecordReader( + split.serializableHadoopSplit.value, hadoopAttemptContext) + _reader.initialize(split.serializableHadoopSplit.value, hadoopAttemptContext) + _reader + } catch { + case e: IOException if ignoreCorruptFiles => + logWarning( + s"Skipped the rest content in the corrupted file: ${split.serializableHadoopSplit}", + e) + finished = true + null + } // Register an on-task-completion callback to close the input stream. context.addTaskCompletionListener(context => close()) - var havePair = false - var finished = false - var recordsSinceMetricsUpdate = 0 + private var havePair = false + private var recordsSinceMetricsUpdate = 0 override def hasNext: Boolean = { if (!finished && !havePair) { try { finished = !reader.nextKeyValue } catch { - case e: IOException if ignoreCorruptFiles => finished = true + case e: IOException if ignoreCorruptFiles => + logWarning( + s"Skipped the rest content in the corrupted file: ${split.serializableHadoopSplit}", + e) + finished = true } if (finished) { // Close and release the reader here; close() will also be called when the task diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index 068f4ed8ad74..dc123e23b781 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -19,7 +19,7 @@ package org.apache.spark.rdd import java.nio.ByteBuffer import java.text.SimpleDateFormat -import java.util.{Date, HashMap => JHashMap} +import java.util.{Date, HashMap => JHashMap, Locale} import scala.collection.{mutable, Map} import scala.collection.JavaConverters._ @@ -59,8 +59,8 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) * :: Experimental :: * Generic function to combine the elements for each key using a custom set of aggregation * functions. Turns an RDD[(K, V)] into a result of type RDD[(K, C)], for a "combined type" C - * Note that V and C can be different -- for example, one might group an RDD of type - * (Int, Int) into an RDD of type (Int, Seq[Int]). 
Users provide three functions: + * + * Users provide three functions: * * - `createCombiner`, which turns a V into a C (e.g., creates a one-element list) * - `mergeValue`, to merge a V into a C (e.g., adds it to the end of a list) @@ -68,6 +68,9 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) * * In addition, users can control the partitioning of the output RDD, and whether to perform * map-side aggregation (if a mapper can produce multiple items with the same key). + * + * @note V and C can be different -- for example, one might group an RDD of type + * (Int, Int) into an RDD of type (Int, Seq[Int]). */ @Experimental def combineByKeyWithClassTag[C]( @@ -363,7 +366,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) /** * Count the number of elements for each key, collecting the results to a local Map. * - * Note that this method should only be used if the resulting map is expected to be small, as + * @note This method should only be used if the resulting map is expected to be small, as * the whole thing is loaded into the driver's memory. * To handle very large results, consider using rdd.mapValues(_ => 1L).reduceByKey(_ + _), which * returns an RDD[T, Long] instead of a map. @@ -398,9 +401,9 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available * here. * - * The relative accuracy is approximately `1.054 / sqrt(2^p)`. Setting a nonzero `sp > p` - * would trigger sparse representation of registers, which may reduce the memory consumption - * and increase accuracy when the cardinality is small. + * The relative accuracy is approximately `1.054 / sqrt(2^p)`. Setting a nonzero (`sp` is + * greater than `p`) would trigger sparse representation of registers, which may reduce the + * memory consumption and increase accuracy when the cardinality is small. * * @param p The precision value for the normal set. * `p` must be a value between 4 and `sp` if `sp` is not zero (32 max). @@ -490,11 +493,11 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) * The ordering of elements within each group is not guaranteed, and may even differ * each time the resulting RDD is evaluated. * - * Note: This operation may be very expensive. If you are grouping in order to perform an - * aggregation (such as a sum or average) over each key, using [[PairRDDFunctions.aggregateByKey]] - * or [[PairRDDFunctions.reduceByKey]] will provide much better performance. + * @note This operation may be very expensive. If you are grouping in order to perform an + * aggregation (such as a sum or average) over each key, using `PairRDDFunctions.aggregateByKey` + * or `PairRDDFunctions.reduceByKey` will provide much better performance. * - * Note: As currently implemented, groupByKey must be able to hold all the key-value pairs for any + * @note As currently implemented, groupByKey must be able to hold all the key-value pairs for any * key in memory. If a key has too many values, it can result in an [[OutOfMemoryError]]. */ def groupByKey(partitioner: Partitioner): RDD[(K, Iterable[V])] = self.withScope { @@ -514,11 +517,11 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) * resulting RDD with into `numPartitions` partitions. The ordering of elements within * each group is not guaranteed, and may even differ each time the resulting RDD is evaluated. * - * Note: This operation may be very expensive. 
If you are grouping in order to perform an - * aggregation (such as a sum or average) over each key, using [[PairRDDFunctions.aggregateByKey]] - * or [[PairRDDFunctions.reduceByKey]] will provide much better performance. + * @note This operation may be very expensive. If you are grouping in order to perform an + * aggregation (such as a sum or average) over each key, using `PairRDDFunctions.aggregateByKey` + * or `PairRDDFunctions.reduceByKey` will provide much better performance. * - * Note: As currently implemented, groupByKey must be able to hold all the key-value pairs for any + * @note As currently implemented, groupByKey must be able to hold all the key-value pairs for any * key in memory. If a key has too many values, it can result in an [[OutOfMemoryError]]. */ def groupByKey(numPartitions: Int): RDD[(K, Iterable[V])] = self.withScope { @@ -635,9 +638,9 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) * within each group is not guaranteed, and may even differ each time the resulting RDD is * evaluated. * - * Note: This operation may be very expensive. If you are grouping in order to perform an - * aggregation (such as a sum or average) over each key, using [[PairRDDFunctions.aggregateByKey]] - * or [[PairRDDFunctions.reduceByKey]] will provide much better performance. + * @note This operation may be very expensive. If you are grouping in order to perform an + * aggregation (such as a sum or average) over each key, using `PairRDDFunctions.aggregateByKey` + * or `PairRDDFunctions.reduceByKey` will provide much better performance. */ def groupByKey(): RDD[(K, Iterable[V])] = self.withScope { groupByKey(defaultPartitioner(self)) @@ -907,20 +910,24 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) * Return an RDD with the pairs from `this` whose keys are not in `other`. * * Uses `this` partitioner/partition size, because even if `other` is huge, the resulting - * RDD will be <= us. + * RDD will be less than or equal to us. */ def subtractByKey[W: ClassTag](other: RDD[(K, W)]): RDD[(K, V)] = self.withScope { subtractByKey(other, self.partitioner.getOrElse(new HashPartitioner(self.partitions.length))) } - /** Return an RDD with the pairs from `this` whose keys are not in `other`. */ + /** + * Return an RDD with the pairs from `this` whose keys are not in `other`. + */ def subtractByKey[W: ClassTag]( other: RDD[(K, W)], numPartitions: Int): RDD[(K, V)] = self.withScope { subtractByKey(other, new HashPartitioner(numPartitions)) } - /** Return an RDD with the pairs from `this` whose keys are not in `other`. */ + /** + * Return an RDD with the pairs from `this` whose keys are not in `other`. + */ def subtractByKey[W: ClassTag](other: RDD[(K, W)], p: Partitioner): RDD[(K, V)] = self.withScope { new SubtractedRDD[K, V, W](self, other, p) } @@ -1016,7 +1023,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) * Output the RDD to any Hadoop-supported file system, using a Hadoop `OutputFormat` class * supporting the key and value types K and V in this RDD. * - * Note that, we should make sure our tasks are idempotent when speculation is enabled, i.e. do + * @note We should make sure our tasks are idempotent when speculation is enabled, i.e. do * not use output committer that writes data directly. * There is an example in https://issues.apache.org/jira/browse/SPARK-10063 to show the bad * result of using direct output committer with speculation enabled. @@ -1070,7 +1077,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) * output paths required (e.g. 
a table name to write to) in the same way as it would be * configured for a Hadoop MapReduce job. * - * Note that, we should make sure our tasks are idempotent when speculation is enabled, i.e. do + * @note We should make sure our tasks are idempotent when speculation is enabled, i.e. do * not use output committer that writes data directly. * There is an example in https://issues.apache.org/jira/browse/SPARK-10063 to show the bad * result of using direct output committer with speculation enabled. @@ -1079,7 +1086,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) // Rename this as hadoopConf internally to avoid shadowing (see SPARK-2038). val hadoopConf = conf val job = NewAPIHadoopJob.getInstance(hadoopConf) - val formatter = new SimpleDateFormat("yyyyMMddHHmmss") + val formatter = new SimpleDateFormat("yyyyMMddHHmmss", Locale.US) val jobtrackerID = formatter.format(new Date()) val stageId = self.id val jobConfiguration = job.getConfiguration diff --git a/core/src/main/scala/org/apache/spark/rdd/PartitionPruningRDD.scala b/core/src/main/scala/org/apache/spark/rdd/PartitionPruningRDD.scala index 0c6ddda52cee..ce75a16031a3 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PartitionPruningRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PartitionPruningRDD.scala @@ -48,7 +48,7 @@ private[spark] class PruneDependency[T](rdd: RDD[T], partitionFilterFunc: Int => /** * :: DeveloperApi :: - * A RDD used to prune RDD partitions/partitions so we can avoid launching tasks on + * An RDD used to prune RDD partitions/partitions so we can avoid launching tasks on * all partitions. An example use case: If we know the RDD is partitioned by range, * and the execution DAG has a filter on the key, we can avoid launching tasks * on partitions that don't have the range covering the key. diff --git a/core/src/main/scala/org/apache/spark/rdd/PartitionwiseSampledRDD.scala b/core/src/main/scala/org/apache/spark/rdd/PartitionwiseSampledRDD.scala index 3b1acacf409b..6a89ea878646 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PartitionwiseSampledRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PartitionwiseSampledRDD.scala @@ -32,7 +32,7 @@ class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long) } /** - * A RDD sampled from its parent RDD partition-wise. For each partition of the parent RDD, + * An RDD sampled from its parent RDD partition-wise. For each partition of the parent RDD, * a user-specified [[org.apache.spark.util.random.RandomSampler]] instance is used to obtain * a random sample of the records in the partition. The random seeds assigned to the samplers * are guaranteed to have different values. diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index db535de9e9bb..199a3770ac4a 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -70,8 +70,8 @@ import org.apache.spark.util.random.{BernoulliCellSampler, BernoulliSampler, Poi * All of the scheduling and execution in Spark is done based on these methods, allowing each RDD * to implement its own way of computing itself. Indeed, users can implement custom RDDs (e.g. for * reading data from a new storage system) by overriding these functions. Please refer to the - * [[http://people.csail.mit.edu/matei/papers/2012/nsdi_spark.pdf Spark paper]] for more details - * on RDD internals. + * Spark paper + * for more details on RDD internals. 
*/ abstract class RDD[T: ClassTag]( @transient private var _sc: SparkContext, @@ -195,10 +195,14 @@ abstract class RDD[T: ClassTag]( } } - /** Persist this RDD with the default storage level (`MEMORY_ONLY`). */ + /** + * Persist this RDD with the default storage level (`MEMORY_ONLY`). + */ def persist(): this.type = persist(StorageLevel.MEMORY_ONLY) - /** Persist this RDD with the default storage level (`MEMORY_ONLY`). */ + /** + * Persist this RDD with the default storage level (`MEMORY_ONLY`). + */ def cache(): this.type = persist() /** @@ -419,7 +423,8 @@ abstract class RDD[T: ClassTag]( * * This results in a narrow dependency, e.g. if you go from 1000 partitions * to 100 partitions, there will not be a shuffle, instead each of the 100 - * new partitions will claim 10 of the current partitions. + * new partitions will claim 10 of the current partitions. If a larger number + * of partitions is requested, it will stay at the current number of partitions. * * However, if you're doing a drastic coalesce, e.g. to numPartitions = 1, * this may result in your computation taking place on fewer nodes than @@ -428,7 +433,7 @@ abstract class RDD[T: ClassTag]( * current upstream partitions will be executed in parallel (per whatever * the current partitioning is). * - * Note: With shuffle = true, you can actually coalesce to a larger number + * @note With shuffle = true, you can actually coalesce to a larger number * of partitions. This is useful if you have a small number of partitions, * say 100, potentially with a few partitions being abnormally large. Calling * coalesce(1000, shuffle = true) will result in 1000 partitions with the @@ -469,8 +474,12 @@ abstract class RDD[T: ClassTag]( * @param withReplacement can elements be sampled multiple times (replaced when sampled out) * @param fraction expected size of the sample as a fraction of this RDD's size * without replacement: probability that each element is chosen; fraction must be [0, 1] - * with replacement: expected number of times each element is chosen; fraction must be >= 0 + * with replacement: expected number of times each element is chosen; fraction must be greater + * than or equal to 0 * @param seed seed for the random number generator + * + * @note This is NOT guaranteed to provide exactly the fraction of the count + * of the given [[RDD]]. */ def sample( withReplacement: Boolean, @@ -534,13 +543,13 @@ abstract class RDD[T: ClassTag]( /** * Return a fixed-size sampled subset of this RDD in an array * - * @note this method should only be used if the resulting array is expected to be small, as - * all the data is loaded into the driver's memory. - * * @param withReplacement whether sampling is done with replacement * @param num size of the returned sample * @param seed seed for the random number generator * @return sample of specified size in an array + * + * @note this method should only be used if the resulting array is expected to be small, as + * all the data is loaded into the driver's memory. */ def takeSample( withReplacement: Boolean, @@ -615,7 +624,7 @@ abstract class RDD[T: ClassTag]( * Return the intersection of this RDD and another one. The output will not contain any duplicate * elements, even if the input RDDs did. * - * Note that this method performs a shuffle internally. + * @note This method performs a shuffle internally. 
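The sentence added to the `coalesce` doc above pins down the no-shuffle behavior; a quick illustration with made-up partition counts:

```scala
// Illustrative: coalesce only narrows without a shuffle; growing the count
// requires shuffle = true, as the note above explains.
val rdd = sc.parallelize(1 to 1000, numSlices = 1000)
rdd.coalesce(100).getNumPartitions                    // 100, no shuffle
rdd.coalesce(2000).getNumPartitions                   // stays at 1000
rdd.coalesce(2000, shuffle = true).getNumPartitions   // 2000
```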
*/ def intersection(other: RDD[T]): RDD[T] = withScope { this.map(v => (v, null)).cogroup(other.map(v => (v, null))) @@ -627,7 +636,7 @@ abstract class RDD[T: ClassTag]( * Return the intersection of this RDD and another one. The output will not contain any duplicate * elements, even if the input RDDs did. * - * Note that this method performs a shuffle internally. + * @note This method performs a shuffle internally. * * @param partitioner Partitioner to use for the resulting RDD */ @@ -643,7 +652,7 @@ abstract class RDD[T: ClassTag]( * Return the intersection of this RDD and another one. The output will not contain any duplicate * elements, even if the input RDDs did. Performs a hash partition across the cluster * - * Note that this method performs a shuffle internally. + * @note This method performs a shuffle internally. * * @param numPartitions How many partitions to use in the resulting RDD */ @@ -671,9 +680,9 @@ abstract class RDD[T: ClassTag]( * mapping to that key. The ordering of elements within each group is not guaranteed, and * may even differ each time the resulting RDD is evaluated. * - * Note: This operation may be very expensive. If you are grouping in order to perform an - * aggregation (such as a sum or average) over each key, using [[PairRDDFunctions.aggregateByKey]] - * or [[PairRDDFunctions.reduceByKey]] will provide much better performance. + * @note This operation may be very expensive. If you are grouping in order to perform an + * aggregation (such as a sum or average) over each key, using `PairRDDFunctions.aggregateByKey` + * or `PairRDDFunctions.reduceByKey` will provide much better performance. */ def groupBy[K](f: T => K)(implicit kt: ClassTag[K]): RDD[(K, Iterable[T])] = withScope { groupBy[K](f, defaultPartitioner(this)) @@ -684,9 +693,9 @@ abstract class RDD[T: ClassTag]( * mapping to that key. The ordering of elements within each group is not guaranteed, and * may even differ each time the resulting RDD is evaluated. * - * Note: This operation may be very expensive. If you are grouping in order to perform an - * aggregation (such as a sum or average) over each key, using [[PairRDDFunctions.aggregateByKey]] - * or [[PairRDDFunctions.reduceByKey]] will provide much better performance. + * @note This operation may be very expensive. If you are grouping in order to perform an + * aggregation (such as a sum or average) over each key, using `PairRDDFunctions.aggregateByKey` + * or `PairRDDFunctions.reduceByKey` will provide much better performance. */ def groupBy[K]( f: T => K, @@ -699,9 +708,9 @@ abstract class RDD[T: ClassTag]( * mapping to that key. The ordering of elements within each group is not guaranteed, and * may even differ each time the resulting RDD is evaluated. * - * Note: This operation may be very expensive. If you are grouping in order to perform an - * aggregation (such as a sum or average) over each key, using [[PairRDDFunctions.aggregateByKey]] - * or [[PairRDDFunctions.reduceByKey]] will provide much better performance. + * @note This operation may be very expensive. If you are grouping in order to perform an + * aggregation (such as a sum or average) over each key, using `PairRDDFunctions.aggregateByKey` + * or `PairRDDFunctions.reduceByKey` will provide much better performance. */ def groupBy[K](f: T => K, p: Partitioner)(implicit kt: ClassTag[K], ord: Ordering[K] = null) : RDD[(K, Iterable[T])] = withScope { @@ -747,8 +756,10 @@ abstract class RDD[T: ClassTag]( * print line function (like out.println()) as the 2nd parameter. 
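The recommendation above to prefer `PairRDDFunctions.reduceByKey` over `groupBy` for aggregations, made concrete (a sketch; assumes a `SparkContext` named `sc`):

{{{
val pairs = sc.parallelize(Seq(("a", 1), ("b", 2), ("a", 3)))

// Expensive: ships every value for a key across the network before aggregating.
val viaGroupBy = pairs.groupBy(_._1).mapValues(_.map(_._2).sum)

// Much cheaper: partial sums are combined map-side before the shuffle.
val viaReduceByKey = pairs.reduceByKey(_ + _)
}}}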
* An example of pipe the RDD data of groupBy() in a streaming way, * instead of constructing a huge String to concat all the elements: - * def printRDDElement(record:(String, Seq[String]), f:String=>Unit) = - * for (e <- record._2) {f(e)} + * {{{ + * def printRDDElement(record:(String, Seq[String]), f:String=>Unit) = + * for (e <- record._2) {f(e)} + * }}} * @param separateWorkingDir Use separate working directories for each task. * @param bufferSize Buffer size for the stdin writer for the piped process. * @param encoding Char encoding used for interacting (via stdin, stdout and stderr) with @@ -788,14 +799,26 @@ abstract class RDD[T: ClassTag]( } /** - * [performance] Spark's internal mapPartitions method which skips closure cleaning. It is a - * performance API to be used carefully only if we are sure that the RDD elements are + * [performance] Spark's internal mapPartitionsWithIndex method that skips closure cleaning. + * It is a performance API to be used carefully only if we are sure that the RDD elements are * serializable and don't require closure cleaning. * * @param preservesPartitioning indicates whether the input function preserves the partitioner, * which should be `false` unless this is a pair RDD and the input function doesn't modify * the keys. */ + private[spark] def mapPartitionsWithIndexInternal[U: ClassTag]( + f: (Int, Iterator[T]) => Iterator[U], + preservesPartitioning: Boolean = false): RDD[U] = withScope { + new MapPartitionsRDD( + this, + (context: TaskContext, index: Int, iter: Iterator[T]) => f(index, iter), + preservesPartitioning) + } + + /** + * [performance] Spark's internal mapPartitions method that skips closure cleaning. + */ private[spark] def mapPartitionsInternal[U: ClassTag]( f: Iterator[T] => Iterator[U], preservesPartitioning: Boolean = false): RDD[U] = withScope { @@ -906,7 +929,7 @@ abstract class RDD[T: ClassTag]( /** * Return an array that contains all of the elements in this RDD. * - * @note this method should only be used if the resulting array is expected to be small, as + * @note This method should only be used if the resulting array is expected to be small, as * all the data is loaded into the driver's memory. */ def collect(): Array[T] = withScope { @@ -919,7 +942,7 @@ abstract class RDD[T: ClassTag]( * * The iterator will consume as much memory as the largest partition in this RDD. * - * Note: this results in multiple Spark jobs, and if the input RDD is the result + * @note This results in multiple Spark jobs, and if the input RDD is the result * of a wide transformation (e.g. join with different partitioners), to avoid * recomputing the input RDD should be cached first. */ @@ -1167,10 +1190,15 @@ abstract class RDD[T: ClassTag]( /** * Return the count of each unique value in this RDD as a local map of (value, count) pairs. * - * Note that this method should only be used if the resulting map is expected to be small, as + * @note This method should only be used if the resulting map is expected to be small, as * the whole thing is loaded into the driver's memory. - * To handle very large results, consider using rdd.map(x => (x, 1L)).reduceByKey(_ + _), which - * returns an RDD[T, Long] instead of a map. + * To handle very large results, consider using + * + * {{{ + * rdd.map(x => (x, 1L)).reduceByKey(_ + _) + * }}} + * + * which returns an RDD[(T, Long)] instead of a map.
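For example (a sketch; assumes a `SparkContext` named `sc`), the two approaches from the note above side by side:

{{{
val words = sc.parallelize(Seq("a", "b", "a", "c", "a"))

// Fine when the number of distinct values is small: a local Map on the driver.
val local: scala.collection.Map[String, Long] = words.countByValue()

// Scales to many distinct values: the counts stay distributed as an RDD.
val distributed = words.map(x => (x, 1L)).reduceByKey(_ + _)
}}}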
*/ def countByValue()(implicit ord: Ordering[T] = null): Map[T, Long] = withScope { map(value => (value, null)).countByKey() } @@ -1208,9 +1236,9 @@ abstract class RDD[T: ClassTag]( * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available * here. * - * The relative accuracy is approximately `1.054 / sqrt(2^p)`. Setting a nonzero `sp > p` - * would trigger sparse representation of registers, which may reduce the memory consumption - * and increase accuracy when the cardinality is small. + * The relative accuracy is approximately `1.054 / sqrt(2^p)`. Setting a nonzero `sp` greater + * than `p` would trigger sparse representation of registers, which may reduce the memory + * consumption and increase accuracy when the cardinality is small. * * @param p The precision value for the normal set. * `p` must be a value between 4 and `sp` if `sp` is not zero (32 max). @@ -1257,7 +1285,7 @@ abstract class RDD[T: ClassTag]( * This is similar to Scala's zipWithIndex but it uses Long instead of Int as the index type. * This method needs to trigger a spark job when this RDD contains more than one partitions. * - * Note that some RDDs, such as those returned by groupBy(), do not guarantee order of + * @note Some RDDs, such as those returned by groupBy(), do not guarantee order of * elements in a partition. The index assigned to each element is therefore not guaranteed, * and may even change if the RDD is reevaluated. If a fixed ordering is required to guarantee * the same index assignments, you should sort the RDD with sortByKey() or save it to a file. @@ -1271,7 +1299,7 @@ abstract class RDD[T: ClassTag]( * 2*n+k, ..., where n is the number of partitions. So there may exist gaps, but this method * won't trigger a spark job, which is different from [[org.apache.spark.rdd.RDD#zipWithIndex]]. * - * Note that some RDDs, such as those returned by groupBy(), do not guarantee order of + * @note Some RDDs, such as those returned by groupBy(), do not guarantee order of * elements in a partition. The unique ID assigned to each element is therefore not guaranteed, * and may even change if the RDD is reevaluated. If a fixed ordering is required to guarantee * the same index assignments, you should sort the RDD with sortByKey() or save it to a file. @@ -1290,10 +1318,10 @@ abstract class RDD[T: ClassTag]( * results from that partition to estimate the number of additional partitions needed to satisfy * the limit. * - * @note this method should only be used if the resulting array is expected to be small, as + * @note This method should only be used if the resulting array is expected to be small, as * all the data is loaded into the driver's memory. * - * @note due to complications in the internal implementation, this method will raise + * @note Due to complications in the internal implementation, this method will raise * an exception if called on an RDD of `Nothing` or `Null`. */ def take(num: Int): Array[T] = withScope { @@ -1355,7 +1383,7 @@ abstract class RDD[T: ClassTag]( * // returns Array(6, 5) * }}} * - * @note this method should only be used if the resulting array is expected to be small, as + * @note This method should only be used if the resulting array is expected to be small, as * all the data is loaded into the driver's memory.
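The difference between the two indexing methods noted above, sketched with a two-partition RDD (assumes a `SparkContext` named `sc`; the unique ids follow the k, n+k, 2*n+k rule):

{{{
val r = sc.parallelize(Seq("x", "y", "z"), 2)  // partitions: ["x"] and ["y", "z"]

r.zipWithIndex().collect()
// Array((x,0), (y,1), (z,2)) -- contiguous indices, but triggers an extra job

r.zipWithUniqueId().collect()
// Array((x,0), (y,1), (z,3)) -- no job is triggered, and gaps are possible
}}}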
* * @param num k, the number of top elements to return @@ -1378,7 +1406,7 @@ abstract class RDD[T: ClassTag]( * // returns Array(2, 3) * }}} * - * @note this method should only be used if the resulting array is expected to be small, as + * @note This method should only be used if the resulting array is expected to be small, as * all the data is loaded into the driver's memory. * * @param num k, the number of elements to return @@ -1423,7 +1451,7 @@ abstract class RDD[T: ClassTag]( } /** - * @note due to complications in the internal implementation, this method will raise an + * @note Due to complications in the internal implementation, this method will raise an * exception if called on an RDD of `Nothing` or `Null`. This may be come up in practice * because, for example, the type of `parallelize(Seq())` is `RDD[Nothing]`. * (`parallelize(Seq())` should be avoided anyway in favor of `parallelize(Seq[T]())`.) @@ -1583,14 +1611,15 @@ abstract class RDD[T: ClassTag]( /** * Return whether this RDD is checkpointed and materialized, either reliably or locally. */ - def isCheckpointed: Boolean = checkpointData.exists(_.isCheckpointed) + def isCheckpointed: Boolean = isCheckpointedAndMaterialized /** * Return whether this RDD is checkpointed and materialized, either reliably or locally. * This is introduced as an alias for `isCheckpointed` to clarify the semantics of the * return value. Exposed for testing. */ - private[spark] def isCheckpointedAndMaterialized: Boolean = isCheckpointed + private[spark] def isCheckpointedAndMaterialized: Boolean = + checkpointData.exists(_.isCheckpointed) /** * Return whether this RDD is marked for local checkpointing. @@ -1719,7 +1748,7 @@ abstract class RDD[T: ClassTag]( /** * Clears the dependencies of this RDD. This method must ensure that all references - * to the original parent RDDs is removed to enable the parent RDDs to be garbage + * to the original parent RDDs are removed to enable the parent RDDs to be garbage * collected. Subclasses of RDD may override this method for implementing their own cleaning * logic. See [[org.apache.spark.rdd.UnionRDD]] for an example. */ diff --git a/core/src/main/scala/org/apache/spark/rdd/RDDCheckpointData.scala b/core/src/main/scala/org/apache/spark/rdd/RDDCheckpointData.scala index 429514b4f6be..6c552d4d1251 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDDCheckpointData.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDDCheckpointData.scala @@ -23,7 +23,8 @@ import org.apache.spark.Partition /** * Enumeration to manage state transitions of an RDD through checkpointing - * [ Initialized --> checkpointing in progress --> checkpointed ]. + * + * [ Initialized --{@literal >} checkpointing in progress --{@literal >} checkpointed ] */ private[spark] object CheckpointState extends Enumeration { type CheckpointState = Value @@ -32,7 +33,7 @@ private[spark] object CheckpointState extends Enumeration { /** * This class contains all the information related to RDD checkpointing. Each instance of this - * class is associated with a RDD. It manages process of checkpointing of the associated RDD, + * class is associated with an RDD. It manages the process of checkpointing of the associated RDD, * as well as, manages the post-checkpoint state by providing the updated partitions, * iterator and preferred locations of the checkpointed RDD.
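From the user-facing side, the state transitions above look like this (a sketch; assumes a `SparkContext` named `sc` and a writable, hypothetical checkpoint directory):

{{{
sc.setCheckpointDir("/tmp/spark-checkpoints")  // hypothetical path

val data = sc.parallelize(1 to 100).map(_ * 2)
data.checkpoint()  // Initialized: only marks the RDD for checkpointing
data.count()       // an action drives it through "in progress" to "checkpointed"
assert(data.isCheckpointed)
}}}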
*/ diff --git a/core/src/main/scala/org/apache/spark/rdd/ReliableCheckpointRDD.scala b/core/src/main/scala/org/apache/spark/rdd/ReliableCheckpointRDD.scala index eac901d10067..7f399ecf81a0 100644 --- a/core/src/main/scala/org/apache/spark/rdd/ReliableCheckpointRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/ReliableCheckpointRDD.scala @@ -151,7 +151,7 @@ private[spark] object ReliableCheckpointRDD extends Logging { } /** - * Write a RDD partition's data to a checkpoint file. + * Write an RDD partition's data to a checkpoint file. */ def writePartitionToCheckpointFile[T: ClassTag]( path: String, diff --git a/core/src/main/scala/org/apache/spark/rdd/SequenceFileRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/SequenceFileRDDFunctions.scala index 1311b481c7c7..86a332790fb0 100644 --- a/core/src/main/scala/org/apache/spark/rdd/SequenceFileRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/SequenceFileRDDFunctions.scala @@ -27,9 +27,10 @@ import org.apache.spark.internal.Logging /** * Extra functions available on RDDs of (key, value) pairs to create a Hadoop SequenceFile, - * through an implicit conversion. Note that this can't be part of PairRDDFunctions because - * we need more implicit parameters to convert our keys and values to Writable. + * through an implicit conversion. * + * @note This can't be part of PairRDDFunctions because we need more implicit parameters to + * convert our keys and values to Writable. */ class SequenceFileRDDFunctions[K <% Writable: ClassTag, V <% Writable : ClassTag]( self: RDD[(K, V)], diff --git a/core/src/main/scala/org/apache/spark/rdd/UnionRDD.scala b/core/src/main/scala/org/apache/spark/rdd/UnionRDD.scala index ad1fddbde7b0..60e383afadf1 100644 --- a/core/src/main/scala/org/apache/spark/rdd/UnionRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/UnionRDD.scala @@ -20,7 +20,7 @@ package org.apache.spark.rdd import java.io.{IOException, ObjectOutputStream} import scala.collection.mutable.ArrayBuffer -import scala.collection.parallel.{ForkJoinTaskSupport, ThreadPoolTaskSupport} +import scala.collection.parallel.ForkJoinTaskSupport import scala.concurrent.forkjoin.ForkJoinPool import scala.reflect.ClassTag diff --git a/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala b/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala index b0e5ba0865c6..8425b211d6ec 100644 --- a/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala @@ -29,7 +29,7 @@ class ZippedWithIndexRDDPartition(val prev: Partition, val startIndex: Long) } /** - * Represents a RDD zipped with its element indices. The ordering is first based on the partition + * Represents an RDD zipped with its element indices. The ordering is first based on the partition * index and then the ordering of items within each partition. So the first item in the first * partition gets index 0, and the last item in the last partition receives the largest index. 
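The implicit conversion to `SequenceFileRDDFunctions` mentioned above kicks in whenever both the key and value types are convertible to `Writable`. A sketch (assumes a `SparkContext` named `sc`; the output path is hypothetical):

{{{
// Int and String convert implicitly to IntWritable and Text, so
// saveAsSequenceFile becomes available on this pair RDD.
val kv = sc.parallelize(Seq((1, "one"), (2, "two")))
kv.saveAsSequenceFile("/tmp/seqfile-demo")
}}}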
* diff --git a/core/src/main/scala/org/apache/spark/rdd/coalesce-public.scala b/core/src/main/scala/org/apache/spark/rdd/coalesce-public.scala index d8a80aa5aeb1..e00bc22aba44 100644 --- a/core/src/main/scala/org/apache/spark/rdd/coalesce-public.scala +++ b/core/src/main/scala/org/apache/spark/rdd/coalesce-public.scala @@ -35,14 +35,14 @@ trait PartitionCoalescer { * @param maxPartitions the maximum number of partitions to have after coalescing * @param parent the parent RDD whose partitions to coalesce * @return an array of [[PartitionGroup]]s, where each element is itself an array of - * [[Partition]]s and represents a partition after coalescing is performed. + * `Partition`s and represents a partition after coalescing is performed. */ def coalesce(maxPartitions: Int, parent: RDD[_]): Array[PartitionGroup] } /** * ::DeveloperApi:: - * A group of [[Partition]]s + * A group of `Partition`s * @param prefLoc preferred location for the partition group */ @DeveloperApi diff --git a/core/src/main/scala/org/apache/spark/rpc/RpcCallContext.scala b/core/src/main/scala/org/apache/spark/rpc/RpcCallContext.scala index f527ec86ab7b..117f51c5b8f2 100644 --- a/core/src/main/scala/org/apache/spark/rpc/RpcCallContext.scala +++ b/core/src/main/scala/org/apache/spark/rpc/RpcCallContext.scala @@ -18,7 +18,7 @@ package org.apache.spark.rpc /** - * A callback that [[RpcEndpoint]] can use it to send back a message or failure. It's thread-safe + * A callback that [[RpcEndpoint]] can use to send back a message or failure. It's thread-safe * and can be called in any thread. */ private[spark] trait RpcCallContext { diff --git a/core/src/main/scala/org/apache/spark/rpc/RpcEnv.scala b/core/src/main/scala/org/apache/spark/rpc/RpcEnv.scala index 579122868afc..530743c03640 100644 --- a/core/src/main/scala/org/apache/spark/rpc/RpcEnv.scala +++ b/core/src/main/scala/org/apache/spark/rpc/RpcEnv.scala @@ -146,7 +146,6 @@ private[spark] abstract class RpcEnv(conf: SparkConf) { * @param uri URI with location of the file. */ def openChannel(uri: String): ReadableByteChannel - } /** diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala b/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala index e51649a1ecce..e56943da1303 100644 --- a/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala +++ b/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala @@ -407,11 +407,9 @@ private[netty] class NettyRpcEnv( } } - } private[netty] object NettyRpcEnv extends Logging { - /** * When deserializing the [[NettyRpcEndpointRef]], it needs a reference to [[NettyRpcEnv]]. * Use `currentEnv` to wrap the deserialization codes. 
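One way to read the `PartitionCoalescer` developer API above is through a toy implementation. This is a hypothetical strategy, not part of this patch: pack parent partitions round-robin into at most `maxPartitions` groups (marked `Serializable` since the coalescer travels with the RDD):

{{{
import org.apache.spark.rdd.{PartitionCoalescer, PartitionGroup, RDD}

class RoundRobinCoalescer extends PartitionCoalescer with Serializable {
  override def coalesce(maxPartitions: Int, parent: RDD[_]): Array[PartitionGroup] = {
    val n = math.min(maxPartitions, parent.partitions.length)
    val groups = Array.fill(n)(new PartitionGroup())
    parent.partitions.zipWithIndex.foreach { case (p, i) =>
      groups(i % n).partitions += p  // PartitionGroup.partitions is a mutable buffer
    }
    groups
  }
}
}}}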
E.g., diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/Outbox.scala b/core/src/main/scala/org/apache/spark/rpc/netty/Outbox.scala index 6c090ada5ae9..a7b7f58376f6 100644 --- a/core/src/main/scala/org/apache/spark/rpc/netty/Outbox.scala +++ b/core/src/main/scala/org/apache/spark/rpc/netty/Outbox.scala @@ -56,7 +56,7 @@ private[netty] case class RpcOutboxMessage( content: ByteBuffer, _onFailure: (Throwable) => Unit, _onSuccess: (TransportClient, ByteBuffer) => Unit) - extends OutboxMessage with RpcResponseCallback { + extends OutboxMessage with RpcResponseCallback with Logging { private var client: TransportClient = _ private var requestId: Long = _ @@ -67,8 +67,11 @@ private[netty] case class RpcOutboxMessage( } def onTimeout(): Unit = { - require(client != null, "TransportClient has not yet been set.") - client.removeRpcRequest(requestId) + if (client != null) { + client.removeRpcRequest(requestId) + } else { + logError("Ask timeout before connecting successfully") + } } override def onFailure(e: Throwable): Unit = { diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/RpcEndpointVerifier.scala b/core/src/main/scala/org/apache/spark/rpc/netty/RpcEndpointVerifier.scala index 99f20da2d66a..430dcc50ba71 100644 --- a/core/src/main/scala/org/apache/spark/rpc/netty/RpcEndpointVerifier.scala +++ b/core/src/main/scala/org/apache/spark/rpc/netty/RpcEndpointVerifier.scala @@ -20,7 +20,7 @@ package org.apache.spark.rpc.netty import org.apache.spark.rpc.{RpcCallContext, RpcEndpoint, RpcEnv} /** - * An [[RpcEndpoint]] for remote [[RpcEnv]]s to query if an [[RpcEndpoint]] exists. + * An [[RpcEndpoint]] for remote [[RpcEnv]]s to query if an `RpcEndpoint` exists. * * This is used when setting up a remote endpoint reference. */ @@ -35,6 +35,6 @@ private[netty] class RpcEndpointVerifier(override val rpcEnv: RpcEnv, dispatcher private[netty] object RpcEndpointVerifier { val NAME = "endpoint-verifier" - /** A message used to ask the remote [[RpcEndpointVerifier]] if an [[RpcEndpoint]] exists. */ + /** A message used to ask the remote [[RpcEndpointVerifier]] if an `RpcEndpoint` exists. */ case class CheckExistence(name: String) } diff --git a/core/src/main/scala/org/apache/spark/scheduler/AccumulableInfo.scala b/core/src/main/scala/org/apache/spark/scheduler/AccumulableInfo.scala index cedacad44afe..0a5fe5a1d3ee 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/AccumulableInfo.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/AccumulableInfo.scala @@ -24,11 +24,6 @@ import org.apache.spark.annotation.DeveloperApi * :: DeveloperApi :: * Information about an [[org.apache.spark.Accumulable]] modified during a task or stage. * - * Note: once this is JSON serialized the types of `update` and `value` will be lost and be - * cast to strings. This is because the user can define an accumulator of any type and it will - * be difficult to preserve the type in consumers of the event log. This does not apply to - * internal accumulators that represent task level metrics. 
- * * @param id accumulator ID * @param name accumulator name * @param update partial value from a task, may be None if used on driver to describe a stage @@ -36,6 +31,11 @@ import org.apache.spark.annotation.DeveloperApi * @param internal whether this accumulator was internal * @param countFailedValues whether to count this accumulator's partial value if the task failed * @param metadata internal metadata associated with this accumulator, if any + * + * @note Once this is JSON serialized the types of `update` and `value` will be lost and be + * cast to strings. This is because the user can define an accumulator of any type and it will + * be difficult to preserve the type in consumers of the event log. This does not apply to + * internal accumulators that represent task level metrics. */ @DeveloperApi case class AccumulableInfo private[spark] ( diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index f2517401cb76..01a95c06fc69 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -1660,7 +1660,7 @@ private[scheduler] class DAGSchedulerEventProcessLoop(dagScheduler: DAGScheduler } catch { case t: Throwable => logError("DAGScheduler failed to cancel all jobs.", t) } - dagScheduler.sc.stop() + dagScheduler.sc.stopInNewThread() } override def onStop(): Unit = { diff --git a/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala b/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala index a6b032cc0084..66ab9a52b778 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala @@ -153,7 +153,7 @@ object InputFormatInfo { a) For each host, count number of splits hosted on that host. b) Decrement the currently allocated containers on that host. - c) Compute rack info for each host and update rack -> count map based on (b). + c) Compute rack info for each host and update rack to count map based on (b). d) Allocate nodes based on (c) e) On the allocation result, ensure that we don't allocate "too many" jobs on a single node (even if data locality on that is very high) : this is to prevent fragility of job if a diff --git a/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala b/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala index 0bd5a6bc59a9..08e05ae0c095 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala @@ -22,6 +22,7 @@ import java.io.{InputStream, IOException} import scala.io.Source import com.fasterxml.jackson.core.JsonParseException +import com.fasterxml.jackson.databind.exc.UnrecognizedPropertyException import org.json4s.jackson.JsonMethods._ import org.apache.spark.internal.Logging @@ -87,6 +88,12 @@ private[spark] class ReplayListenerBus extends SparkListenerBus with Logging { // Ignore events generated by Structured Streaming in Spark 2.0.0 and 2.0.1. // It's safe since no place uses them. 
logWarning(s"Dropped incompatible Structured Streaming log: $currentLine") + case e: UnrecognizedPropertyException if e.getMessage != null && e.getMessage.startsWith( + "Unrecognized field \"queryStatus\" " + + "(class org.apache.spark.sql.streaming.StreamingQueryListener$") => + // Ignore events generated by Structured Streaming in Spark 2.0.2 + // It's safe since no place uses them. + logWarning(s"Dropped incompatible Structured Streaming log: $currentLine") case jpe: JsonParseException => // We can only ignore exception from last line of the file that might be truncated // the last entry may not be the very last line in the event log, but we treat it diff --git a/core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala b/core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala index 1e7c63af2e79..d19353f2a993 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala @@ -42,7 +42,7 @@ import org.apache.spark.rdd.RDD * @param outputId index of the task in this job (a job can launch tasks on only a subset of the * input RDD's partitions). * @param localProperties copy of thread-local properties set by the user on the driver side. - * @param metrics a [[TaskMetrics]] that is created at driver side and sent to executor side. + * @param metrics a `TaskMetrics` that is created at driver side and sent to executor side. * * The parameters below are optional: * @param jobId id of the job this task belongs to diff --git a/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala b/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala index 66d6790e168f..31011de85bf7 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala @@ -42,7 +42,7 @@ import org.apache.spark.shuffle.ShuffleWriter * the type should be (RDD[_], ShuffleDependency[_, _, _]). * @param partition partition of the RDD this task is associated with * @param locs preferred task execution locations for locality scheduling - * @param metrics a [[TaskMetrics]] that is created at driver side and sent to executor side. + * @param metrics a `TaskMetrics` that is created at driver side and sent to executor side. * @param localProperties copy of thread-local properties set by the user on the driver side. * * The parameters below are optional: diff --git a/core/src/main/scala/org/apache/spark/scheduler/Task.scala b/core/src/main/scala/org/apache/spark/scheduler/Task.scala index 9385e3c31e1e..112b08f2c03a 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/Task.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/Task.scala @@ -45,7 +45,7 @@ import org.apache.spark.util._ * @param stageId id of the stage this task belongs to * @param stageAttemptId attempt id of the stage this task belongs to * @param partitionId index of the number in the RDD - * @param metrics a [[TaskMetrics]] that is created at driver side and sent to executor side. + * @param metrics a `TaskMetrics` that is created at driver side and sent to executor side. * @param localProperties copy of thread-local properties set by the user on the driver side. 
* * The parameters below are optional: diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala index 1c7c81c488c3..45c742cbff5e 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala @@ -23,7 +23,7 @@ import org.apache.spark.util.SerializableBuffer /** * Description of a task that gets passed onto executors to be executed, usually created by - * [[TaskSetManager.resourceOffer]]. + * `TaskSetManager.resourceOffer`. */ private[spark] class TaskDescription( val taskId: Long, diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala index 3e3f1ad031e6..b03cfe4f0dc4 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala @@ -93,10 +93,12 @@ private[spark] class TaskSchedulerImpl( // Incrementing task IDs val nextTaskId = new AtomicLong(0) - // Number of tasks running on each executor - private val executorIdToTaskCount = new HashMap[String, Int] + // IDs of the tasks running on each executor + private val executorIdToRunningTaskIds = new HashMap[String, HashSet[Long]] - def runningTasksByExecutors(): Map[String, Int] = executorIdToTaskCount.toMap + def runningTasksByExecutors: Map[String, Int] = synchronized { + executorIdToRunningTaskIds.toMap.mapValues(_.size) + } // The set of executors we have on each host; this is used to compute hostsAlive, which // in turn is used to decide when we can attain data locality on a given host @@ -264,7 +266,7 @@ private[spark] class TaskSchedulerImpl( val tid = task.taskId taskIdToTaskSetManager(tid) = taskSet taskIdToExecutorId(tid) = execId - executorIdToTaskCount(execId) += 1 + executorIdToRunningTaskIds(execId).add(tid) availableCpus(i) -= CPUS_PER_TASK assert(availableCpus(i) >= 0) launchedTask = true @@ -294,11 +296,11 @@ private[spark] class TaskSchedulerImpl( if (!hostToExecutors.contains(o.host)) { hostToExecutors(o.host) = new HashSet[String]() } - if (!executorIdToTaskCount.contains(o.executorId)) { + if (!executorIdToRunningTaskIds.contains(o.executorId)) { hostToExecutors(o.host) += o.executorId executorAdded(o.executorId, o.host) executorIdToHost(o.executorId) = o.host - executorIdToTaskCount(o.executorId) = 0 + executorIdToRunningTaskIds(o.executorId) = HashSet[Long]() newExecAvail = true } for (rack <- getRackForHost(o.host)) { @@ -349,38 +351,34 @@ private[spark] class TaskSchedulerImpl( var reason: Option[ExecutorLossReason] = None synchronized { try { - if (state == TaskState.LOST && taskIdToExecutorId.contains(tid)) { - // We lost this entire executor, so remember that it's gone - val execId = taskIdToExecutorId(tid) - - if (executorIdToTaskCount.contains(execId)) { - reason = Some( - SlaveLost(s"Task $tid was lost, so marking the executor as lost as well.")) - removeExecutor(execId, reason.get) - failedExecutor = Some(execId) - } - } taskIdToTaskSetManager.get(tid) match { case Some(taskSet) => - if (TaskState.isFinished(state)) { - taskIdToTaskSetManager.remove(tid) - taskIdToExecutorId.remove(tid).foreach { execId => - if (executorIdToTaskCount.contains(execId)) { - executorIdToTaskCount(execId) -= 1 - } + if (state == TaskState.LOST) { + // TaskState.LOST is only used by the deprecated Mesos fine-grained scheduling mode, + // where each 
executor corresponds to a single task, so mark the executor as failed. + val execId = taskIdToExecutorId.getOrElse(tid, throw new IllegalStateException( + "taskIdToTaskSetManager.contains(tid) <=> taskIdToExecutorId.contains(tid)")) + if (executorIdToRunningTaskIds.contains(execId)) { + reason = Some( + SlaveLost(s"Task $tid was lost, so marking the executor as lost as well.")) + removeExecutor(execId, reason.get) + failedExecutor = Some(execId) } } - if (state == TaskState.FINISHED) { - taskSet.removeRunningTask(tid) - taskResultGetter.enqueueSuccessfulTask(taskSet, tid, serializedData) - } else if (Set(TaskState.FAILED, TaskState.KILLED, TaskState.LOST).contains(state)) { + if (TaskState.isFinished(state)) { + cleanupTaskState(tid) taskSet.removeRunningTask(tid) - taskResultGetter.enqueueFailedTask(taskSet, tid, state, serializedData) + if (state == TaskState.FINISHED) { + taskResultGetter.enqueueSuccessfulTask(taskSet, tid, serializedData) + } else if (Set(TaskState.FAILED, TaskState.KILLED, TaskState.LOST).contains(state)) { + taskResultGetter.enqueueFailedTask(taskSet, tid, state, serializedData) + } } case None => logError( ("Ignoring update with state %s for TID %s because its task set is gone (this is " + - "likely the result of receiving duplicate task finished status updates)") + "likely the result of receiving duplicate task finished status updates) or its " + + "executor has been marked as failed.") .format(state, tid)) } } catch { @@ -491,7 +489,7 @@ private[spark] class TaskSchedulerImpl( var failedExecutor: Option[String] = None synchronized { - if (executorIdToTaskCount.contains(executorId)) { + if (executorIdToRunningTaskIds.contains(executorId)) { val hostPort = executorIdToHost(executorId) logExecutorLoss(executorId, hostPort, reason) removeExecutor(executorId, reason) @@ -533,13 +531,31 @@ private[spark] class TaskSchedulerImpl( logError(s"Lost executor $executorId on $hostPort: $reason") } + /** + * Cleans up the TaskScheduler's state for tracking the given task. + */ + private def cleanupTaskState(tid: Long): Unit = { + taskIdToTaskSetManager.remove(tid) + taskIdToExecutorId.remove(tid).foreach { executorId => + executorIdToRunningTaskIds.get(executorId).foreach { _.remove(tid) } + } + } + /** * Remove an executor from all our data structures and mark it as lost. If the executor's loss * reason is not yet known, do not yet remove its association with its host nor update the status * of any running tasks, since the loss reason defines whether we'll fail those tasks. */ private def removeExecutor(executorId: String, reason: ExecutorLossReason) { - executorIdToTaskCount -= executorId + // The tasks on the lost executor may not send any more status updates (because the executor + // has been lost), so they should be cleaned up here. + executorIdToRunningTaskIds.remove(executorId).foreach { taskIds => + logDebug("Cleaning up TaskScheduler state for tasks " + + s"${taskIds.mkString("[", ",", "]")} on failed executor $executorId") + // We do not notify the TaskSetManager of the task failures because that will + // happen below in the rootPool.executorLost() call. 
+ taskIds.foreach(cleanupTaskState) + } val host = executorIdToHost(executorId) val execs = hostToExecutors.getOrElse(host, new HashSet) @@ -577,11 +593,11 @@ private[spark] class TaskSchedulerImpl( } def isExecutorAlive(execId: String): Boolean = synchronized { - executorIdToTaskCount.contains(execId) + executorIdToRunningTaskIds.contains(execId) } def isExecutorBusy(execId: String): Boolean = synchronized { - executorIdToTaskCount.getOrElse(execId, -1) > 0 + executorIdToRunningTaskIds.get(execId).exists(_.nonEmpty) } // By default, rack is unknown diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala index edc8aac5d151..0a4f19d76073 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala @@ -28,7 +28,12 @@ private[spark] sealed trait CoarseGrainedClusterMessage extends Serializable private[spark] object CoarseGrainedClusterMessages { - case object RetrieveSparkProps extends CoarseGrainedClusterMessage + case object RetrieveSparkAppConfig extends CoarseGrainedClusterMessage + + case class SparkAppConfig( + sparkProperties: Seq[(String, String)], + ioEncryptionKey: Option[Array[Byte]]) + extends CoarseGrainedClusterMessage case object RetrieveLastAllocatedExecutorId extends CoarseGrainedClusterMessage diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala index 10d55c87fb8d..6fa239e4fee6 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala @@ -69,6 +69,10 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp // `CoarseGrainedSchedulerBackend.this`. 
private val executorDataMap = new HashMap[String, ExecutorData] + // Number of executors requested by the cluster manager, [[ExecutorAllocationManager]] + @GuardedBy("CoarseGrainedSchedulerBackend.this") + private var requestedTotalExecutors = 0 + // Number of executors requested from the cluster manager that have not registered yet @GuardedBy("CoarseGrainedSchedulerBackend.this") private var numPendingExecutors = 0 @@ -206,8 +210,10 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp removeExecutor(executorId, reason) context.reply(true) - case RetrieveSparkProps => - context.reply(sparkProperties) + case RetrieveSparkAppConfig => + val reply = SparkAppConfig(sparkProperties, + SparkEnv.get.securityManager.getIOEncryptionKey()) + context.reply(reply) } // Make fake resource offers on all executors @@ -388,6 +394,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp * */ protected def reset(): Unit = { val executors = synchronized { + requestedTotalExecutors = 0 numPendingExecutors = 0 executorsPendingToRemove.clear() Set() ++ executorDataMap.keys @@ -461,12 +468,21 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp logInfo(s"Requesting $numAdditionalExecutors additional executor(s) from the cluster manager") val response = synchronized { + requestedTotalExecutors += numAdditionalExecutors numPendingExecutors += numAdditionalExecutors logDebug(s"Number of pending executors is now $numPendingExecutors") + if (requestedTotalExecutors != + (numExistingExecutors + numPendingExecutors - executorsPendingToRemove.size)) { + logDebug( + s"""requestExecutors($numAdditionalExecutors): Executor request doesn't match: + |requestedTotalExecutors = $requestedTotalExecutors + |numExistingExecutors = $numExistingExecutors + |numPendingExecutors = $numPendingExecutors + |executorsPendingToRemove = ${executorsPendingToRemove.size}""".stripMargin) + } // Account for executors pending to be added or removed - doRequestTotalExecutors( - numExistingExecutors + numPendingExecutors - executorsPendingToRemove.size) + doRequestTotalExecutors(requestedTotalExecutors) } defaultAskTimeout.awaitResult(response) @@ -498,6 +514,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp } val response = synchronized { + this.requestedTotalExecutors = numExecutors this.localityAwareTasks = localityAwareTasks this.hostToLocalTaskCount = hostToLocalTaskCount @@ -573,8 +590,17 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp // take into account executors that are pending to be added or removed. 
val adjustTotalExecutors = if (!replace) { - doRequestTotalExecutors( - numExistingExecutors + numPendingExecutors - executorsPendingToRemove.size) + requestedTotalExecutors = math.max(requestedTotalExecutors - executorsToKill.size, 0) + if (requestedTotalExecutors != + (numExistingExecutors + numPendingExecutors - executorsPendingToRemove.size)) { + logDebug( + s"""killExecutors($executorIds, $replace, $force): Executor counts do not match: + |requestedTotalExecutors = $requestedTotalExecutors + |numExistingExecutors = $numExistingExecutors + |numPendingExecutors = $numPendingExecutors + |executorsPendingToRemove = ${executorsPendingToRemove.size}""".stripMargin) + } + doRequestTotalExecutors(requestedTotalExecutors) } else { numPendingExecutors += knownExecutors.size Future.successful(true) diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala index 04d40e2907cf..6f75a4791e94 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala @@ -18,6 +18,7 @@ package org.apache.spark.scheduler.cluster import java.util.concurrent.Semaphore +import java.util.concurrent.atomic.AtomicBoolean import scala.concurrent.Future @@ -42,7 +43,7 @@ private[spark] class StandaloneSchedulerBackend( with Logging { private var client: StandaloneAppClient = null - private var stopping = false + private val stopping = new AtomicBoolean(false) private val launcherBackend = new LauncherBackend() { override protected def onStopRequest(): Unit = stop(SparkAppHandle.State.KILLED) } @@ -112,7 +113,7 @@ private[spark] class StandaloneSchedulerBackend( launcherBackend.setState(SparkAppHandle.State.RUNNING) } - override def stop(): Unit = synchronized { + override def stop(): Unit = { stop(SparkAppHandle.State.FINISHED) } @@ -125,21 +126,21 @@ private[spark] class StandaloneSchedulerBackend( override def disconnected() { notifyContext() - if (!stopping) { + if (!stopping.get) { logWarning("Disconnected from Spark cluster! Waiting for reconnection...") } } override def dead(reason: String) { notifyContext() - if (!stopping) { + if (!stopping.get) { launcherBackend.setState(SparkAppHandle.State.KILLED) logError("Application has been killed. Reason: " + reason) try { scheduler.error(reason) } finally { // Ensure the application terminates, as we can no longer run jobs. 
- sc.stop() + sc.stopInNewThread() } } } @@ -206,20 +207,20 @@ private[spark] class StandaloneSchedulerBackend( registrationBarrier.release() } - private def stop(finalState: SparkAppHandle.State): Unit = synchronized { - try { - stopping = true - - super.stop() - client.stop() + private def stop(finalState: SparkAppHandle.State): Unit = { + if (stopping.compareAndSet(false, true)) { + try { + super.stop() + client.stop() - val callback = shutdownCallback - if (callback != null) { - callback(this) + val callback = shutdownCallback + if (callback != null) { + callback(this) + } + } finally { + launcherBackend.setState(finalState) + launcherBackend.close() } - } finally { - launcherBackend.setState(finalState) - launcherBackend.close() } } diff --git a/core/src/main/scala/org/apache/spark/security/CryptoStreamUtils.scala b/core/src/main/scala/org/apache/spark/security/CryptoStreamUtils.scala index 8f15f50bee81..8e3436f13480 100644 --- a/core/src/main/scala/org/apache/spark/security/CryptoStreamUtils.scala +++ b/core/src/main/scala/org/apache/spark/security/CryptoStreamUtils.scala @@ -18,14 +18,13 @@ package org.apache.spark.security import java.io.{InputStream, OutputStream} import java.util.Properties +import javax.crypto.KeyGenerator import javax.crypto.spec.{IvParameterSpec, SecretKeySpec} import org.apache.commons.crypto.random._ import org.apache.commons.crypto.stream._ -import org.apache.hadoop.io.Text import org.apache.spark.SparkConf -import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ @@ -33,10 +32,6 @@ import org.apache.spark.internal.config._ * A util class for manipulating IO encryption and decryption streams. */ private[spark] object CryptoStreamUtils extends Logging { - /** - * Constants and variables for spark IO encryption - */ - val SPARK_IO_TOKEN = new Text("SPARK_IO_TOKEN") // The initialization vector length in bytes. val IV_LENGTH_IN_BYTES = 16 @@ -46,32 +41,30 @@ private[spark] object CryptoStreamUtils extends Logging { val COMMONS_CRYPTO_CONF_PREFIX = "commons.crypto." /** - * Helper method to wrap [[OutputStream]] with [[CryptoOutputStream]] for encryption. + * Helper method to wrap `OutputStream` with `CryptoOutputStream` for encryption. */ def createCryptoOutputStream( os: OutputStream, - sparkConf: SparkConf): OutputStream = { + sparkConf: SparkConf, + key: Array[Byte]): OutputStream = { val properties = toCryptoConf(sparkConf) val iv = createInitializationVector(properties) os.write(iv) - val credentials = SparkHadoopUtil.get.getCurrentUserCredentials() - val key = credentials.getSecretKey(SPARK_IO_TOKEN) val transformationStr = sparkConf.get(IO_CRYPTO_CIPHER_TRANSFORMATION) new CryptoOutputStream(transformationStr, properties, os, new SecretKeySpec(key, "AES"), new IvParameterSpec(iv)) } /** - * Helper method to wrap [[InputStream]] with [[CryptoInputStream]] for decryption. + * Helper method to wrap `InputStream` with `CryptoInputStream` for decryption. 
*/ def createCryptoInputStream( is: InputStream, - sparkConf: SparkConf): InputStream = { + sparkConf: SparkConf, + key: Array[Byte]): InputStream = { val properties = toCryptoConf(sparkConf) val iv = new Array[Byte](IV_LENGTH_IN_BYTES) is.read(iv, 0, iv.length) - val credentials = SparkHadoopUtil.get.getCurrentUserCredentials() - val key = credentials.getSecretKey(SPARK_IO_TOKEN) val transformationStr = sparkConf.get(IO_CRYPTO_CIPHER_TRANSFORMATION) new CryptoInputStream(transformationStr, properties, is, new SecretKeySpec(key, "AES"), new IvParameterSpec(iv)) @@ -91,6 +84,17 @@ private[spark] object CryptoStreamUtils extends Logging { props } + /** + * Creates a new encryption key. + */ + def createKey(conf: SparkConf): Array[Byte] = { + val keyLen = conf.get(IO_ENCRYPTION_KEY_SIZE_BITS) + val ioKeyGenAlgorithm = conf.get(IO_ENCRYPTION_KEYGEN_ALGORITHM) + val keyGen = KeyGenerator.getInstance(ioKeyGenAlgorithm) + keyGen.init(keyLen) + keyGen.generateKey().getEncoded() + } + /** * This method to generate an IV (Initialization Vector) using secure random. */ diff --git a/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala index 8b72da2ee01b..f60dcfddfdc2 100644 --- a/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala +++ b/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala @@ -131,7 +131,7 @@ private[spark] class JavaSerializerInstance( * :: DeveloperApi :: * A Spark serializer that uses Java's built-in serialization. * - * Note that this serializer is not guaranteed to be wire-compatible across different versions of + * @note This serializer is not guaranteed to be wire-compatible across different versions of * Spark. It is intended to be used to serialize/de-serialize data within a single * Spark application. */ diff --git a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala index 0d26281fe107..7eb2da1c2748 100644 --- a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala +++ b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala @@ -43,9 +43,10 @@ import org.apache.spark.util.{BoundedPriorityQueue, SerializableConfiguration, S import org.apache.spark.util.collection.CompactBuffer /** - * A Spark serializer that uses the [[https://code.google.com/p/kryo/ Kryo serialization library]]. + * A Spark serializer that uses the + * Kryo serialization library. * - * Note that this serializer is not guaranteed to be wire-compatible across different versions of + * @note This serializer is not guaranteed to be wire-compatible across different versions of * Spark. It is intended to be used to serialize/de-serialize data within a single * Spark application. */ diff --git a/core/src/main/scala/org/apache/spark/serializer/Serializer.scala b/core/src/main/scala/org/apache/spark/serializer/Serializer.scala index cb95246d5b0c..afe6cd86059f 100644 --- a/core/src/main/scala/org/apache/spark/serializer/Serializer.scala +++ b/core/src/main/scala/org/apache/spark/serializer/Serializer.scala @@ -40,7 +40,7 @@ import org.apache.spark.util.NextIterator * * 2. Java serialization interface. * - * Note that serializers are not required to be wire-compatible across different versions of Spark. + * @note Serializers are not required to be wire-compatible across different versions of Spark. 
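The reworked `CryptoStreamUtils` methods above take the key from the caller instead of reading it from Hadoop credentials. A round-trip sketch (this is a `private[spark]` API, so the snippet is for illustration only):

{{{
import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
import org.apache.spark.SparkConf
import org.apache.spark.security.CryptoStreamUtils

val conf = new SparkConf()
val key = CryptoStreamUtils.createKey(conf)  // the helper added in this patch

val sink = new ByteArrayOutputStream()
val enc = CryptoStreamUtils.createCryptoOutputStream(sink, conf, key)
enc.write("secret".getBytes("UTF-8"))
enc.close()

val dec = CryptoStreamUtils.createCryptoInputStream(
  new ByteArrayInputStream(sink.toByteArray), conf, key)
}}}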
* They are intended to be used to serialize/de-serialize data within a single Spark application. */ @DeveloperApi diff --git a/core/src/main/scala/org/apache/spark/serializer/SerializerManager.scala b/core/src/main/scala/org/apache/spark/serializer/SerializerManager.scala index 2156d576f187..dd98ea265ce4 100644 --- a/core/src/main/scala/org/apache/spark/serializer/SerializerManager.scala +++ b/core/src/main/scala/org/apache/spark/serializer/SerializerManager.scala @@ -33,7 +33,12 @@ import org.apache.spark.util.io.{ChunkedByteBuffer, ChunkedByteBufferOutputStrea * Component which configures serialization, compression and encryption for various Spark * components, including automatic selection of which [[Serializer]] to use for shuffles. */ -private[spark] class SerializerManager(defaultSerializer: Serializer, conf: SparkConf) { +private[spark] class SerializerManager( + defaultSerializer: Serializer, + conf: SparkConf, + encryptionKey: Option[Array[Byte]]) { + + def this(defaultSerializer: Serializer, conf: SparkConf) = this(defaultSerializer, conf, None) private[this] val kryoSerializer = new KryoSerializer(conf) @@ -63,9 +68,6 @@ private[spark] class SerializerManager(defaultSerializer: Serializer, conf: Spar // Whether to compress shuffle output temporarily spilled to disk private[this] val compressShuffleSpill = conf.getBoolean("spark.shuffle.spill.compress", true) - // Whether to enable IO encryption - private[this] val enableIOEncryption = conf.get(IO_ENCRYPTION_ENABLED) - /* The compression codec to use. Note that the "lazy" val is necessary because we want to delay * the initialization of the compression codec until it is first used. The reason is that a Spark * program could be using a user-defined codec in a third party jar, which is loaded in @@ -73,12 +75,17 @@ private[spark] class SerializerManager(defaultSerializer: Serializer, conf: Spar * loaded yet. */ private lazy val compressionCodec: CompressionCodec = CompressionCodec.createCodec(conf) + def encryptionEnabled: Boolean = encryptionKey.isDefined + def canUseKryo(ct: ClassTag[_]): Boolean = { primitiveAndPrimitiveArrayClassTags.contains(ct) || ct == stringClassTag } - def getSerializer(ct: ClassTag[_]): Serializer = { - if (canUseKryo(ct)) { + // SPARK-18617: the feature in SPARK-13990 can not be applied to Spark Streaming yet; at worst, + // a streaming job based on `Receiver` mode can not run properly on Spark 2.x. As a first step, + // it is reasonable to disable the `kryo auto pick` feature for streaming.
+ def getSerializer(ct: ClassTag[_], autoPick: Boolean): Serializer = { + if (autoPick && canUseKryo(ct)) { kryoSerializer } else { defaultSerializer @@ -124,15 +131,19 @@ private[spark] class SerializerManager(defaultSerializer: Serializer, conf: Spar /** * Wrap an input stream for encryption if shuffle encryption is enabled */ - private[this] def wrapForEncryption(s: InputStream): InputStream = { - if (enableIOEncryption) CryptoStreamUtils.createCryptoInputStream(s, conf) else s + def wrapForEncryption(s: InputStream): InputStream = { + encryptionKey + .map { key => CryptoStreamUtils.createCryptoInputStream(s, conf, key) } + .getOrElse(s) } /** * Wrap an output stream for encryption if shuffle encryption is enabled */ - private[this] def wrapForEncryption(s: OutputStream): OutputStream = { - if (enableIOEncryption) CryptoStreamUtils.createCryptoOutputStream(s, conf) else s + def wrapForEncryption(s: OutputStream): OutputStream = { + encryptionKey + .map { key => CryptoStreamUtils.createCryptoOutputStream(s, conf, key) } + .getOrElse(s) } /** @@ -155,24 +166,32 @@ private[spark] class SerializerManager(defaultSerializer: Serializer, conf: Spar outputStream: OutputStream, values: Iterator[T]): Unit = { val byteStream = new BufferedOutputStream(outputStream) - val ser = getSerializer(implicitly[ClassTag[T]]).newInstance() + val autoPick = !blockId.isInstanceOf[StreamBlockId] + val ser = getSerializer(implicitly[ClassTag[T]], autoPick).newInstance() ser.serializeStream(wrapStream(blockId, byteStream)).writeAll(values).close() } /** Serializes into a chunked byte buffer. */ - def dataSerialize[T: ClassTag](blockId: BlockId, values: Iterator[T]): ChunkedByteBuffer = { - dataSerializeWithExplicitClassTag(blockId, values, implicitly[ClassTag[T]]) + def dataSerialize[T: ClassTag]( + blockId: BlockId, + values: Iterator[T], + allowEncryption: Boolean = true): ChunkedByteBuffer = { + dataSerializeWithExplicitClassTag(blockId, values, implicitly[ClassTag[T]], + allowEncryption = allowEncryption) } /** Serializes into a chunked byte buffer. 
*/ def dataSerializeWithExplicitClassTag( blockId: BlockId, values: Iterator[_], - classTag: ClassTag[_]): ChunkedByteBuffer = { + classTag: ClassTag[_], + allowEncryption: Boolean = true): ChunkedByteBuffer = { val bbos = new ChunkedByteBufferOutputStream(1024 * 1024 * 4, ByteBuffer.allocate) val byteStream = new BufferedOutputStream(bbos) - val ser = getSerializer(classTag).newInstance() - ser.serializeStream(wrapStream(blockId, byteStream)).writeAll(values).close() + val autoPick = !blockId.isInstanceOf[StreamBlockId] + val ser = getSerializer(classTag, autoPick).newInstance() + val encrypted = if (allowEncryption) wrapForEncryption(byteStream) else byteStream + ser.serializeStream(wrapForCompression(blockId, encrypted)).writeAll(values).close() bbos.toChunkedByteBuffer } @@ -182,12 +201,15 @@ private[spark] class SerializerManager(defaultSerializer: Serializer, conf: Spar */ def dataDeserializeStream[T]( blockId: BlockId, - inputStream: InputStream) + inputStream: InputStream, + maybeEncrypted: Boolean = true) (classTag: ClassTag[T]): Iterator[T] = { val stream = new BufferedInputStream(inputStream) - getSerializer(classTag) + val autoPick = !blockId.isInstanceOf[StreamBlockId] + val decrypted = if (maybeEncrypted) wrapForEncryption(inputStream) else inputStream + getSerializer(classTag, autoPick) .newInstance() - .deserializeStream(wrapStream(blockId, stream)) + .deserializeStream(wrapForCompression(blockId, decrypted)) .asIterator.asInstanceOf[Iterator[T]] } } diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/ApiRootResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/ApiRootResource.scala index 17bc04303fa8..d2df77f1c5dc 100644 --- a/core/src/main/scala/org/apache/spark/status/api/v1/ApiRootResource.scala +++ b/core/src/main/scala/org/apache/spark/status/api/v1/ApiRootResource.scala @@ -18,6 +18,7 @@ package org.apache.spark.status.api.v1 import java.util.zip.ZipOutputStream import javax.servlet.ServletContext +import javax.servlet.http.HttpServletRequest import javax.ws.rs._ import javax.ws.rs.core.{Context, Response} @@ -40,7 +41,7 @@ import org.apache.spark.ui.SparkUI * HistoryServerSuite. 
*/ @Path("/v1") -private[v1] class ApiRootResource extends UIRootFromServletContext { +private[v1] class ApiRootResource extends ApiRequestContext { @Path("applications") def getApplicationList(): ApplicationListResource = { @@ -56,21 +57,21 @@ private[v1] class ApiRootResource extends UIRootFromServletContext { def getJobs( @PathParam("appId") appId: String, @PathParam("attemptId") attemptId: String): AllJobsResource = { - uiRoot.withSparkUI(appId, Some(attemptId)) { ui => + withSparkUI(appId, Some(attemptId)) { ui => new AllJobsResource(ui) } } @Path("applications/{appId}/jobs") def getJobs(@PathParam("appId") appId: String): AllJobsResource = { - uiRoot.withSparkUI(appId, None) { ui => + withSparkUI(appId, None) { ui => new AllJobsResource(ui) } } @Path("applications/{appId}/jobs/{jobId: \\d+}") def getJob(@PathParam("appId") appId: String): OneJobResource = { - uiRoot.withSparkUI(appId, None) { ui => + withSparkUI(appId, None) { ui => new OneJobResource(ui) } } @@ -79,21 +80,21 @@ private[v1] class ApiRootResource extends UIRootFromServletContext { def getJob( @PathParam("appId") appId: String, @PathParam("attemptId") attemptId: String): OneJobResource = { - uiRoot.withSparkUI(appId, Some(attemptId)) { ui => + withSparkUI(appId, Some(attemptId)) { ui => new OneJobResource(ui) } } @Path("applications/{appId}/executors") def getExecutors(@PathParam("appId") appId: String): ExecutorListResource = { - uiRoot.withSparkUI(appId, None) { ui => + withSparkUI(appId, None) { ui => new ExecutorListResource(ui) } } @Path("applications/{appId}/allexecutors") def getAllExecutors(@PathParam("appId") appId: String): AllExecutorListResource = { - uiRoot.withSparkUI(appId, None) { ui => + withSparkUI(appId, None) { ui => new AllExecutorListResource(ui) } } @@ -102,7 +103,7 @@ private[v1] class ApiRootResource extends UIRootFromServletContext { def getExecutors( @PathParam("appId") appId: String, @PathParam("attemptId") attemptId: String): ExecutorListResource = { - uiRoot.withSparkUI(appId, Some(attemptId)) { ui => + withSparkUI(appId, Some(attemptId)) { ui => new ExecutorListResource(ui) } } @@ -111,15 +112,14 @@ private[v1] class ApiRootResource extends UIRootFromServletContext { def getAllExecutors( @PathParam("appId") appId: String, @PathParam("attemptId") attemptId: String): AllExecutorListResource = { - uiRoot.withSparkUI(appId, Some(attemptId)) { ui => + withSparkUI(appId, Some(attemptId)) { ui => new AllExecutorListResource(ui) } } - @Path("applications/{appId}/stages") def getStages(@PathParam("appId") appId: String): AllStagesResource = { - uiRoot.withSparkUI(appId, None) { ui => + withSparkUI(appId, None) { ui => new AllStagesResource(ui) } } @@ -128,14 +128,14 @@ private[v1] class ApiRootResource extends UIRootFromServletContext { def getStages( @PathParam("appId") appId: String, @PathParam("attemptId") attemptId: String): AllStagesResource = { - uiRoot.withSparkUI(appId, Some(attemptId)) { ui => + withSparkUI(appId, Some(attemptId)) { ui => new AllStagesResource(ui) } } @Path("applications/{appId}/stages/{stageId: \\d+}") def getStage(@PathParam("appId") appId: String): OneStageResource = { - uiRoot.withSparkUI(appId, None) { ui => + withSparkUI(appId, None) { ui => new OneStageResource(ui) } } @@ -144,14 +144,14 @@ private[v1] class ApiRootResource extends UIRootFromServletContext { def getStage( @PathParam("appId") appId: String, @PathParam("attemptId") attemptId: String): OneStageResource = { - uiRoot.withSparkUI(appId, Some(attemptId)) { ui => + withSparkUI(appId, Some(attemptId)) { ui 
=> new OneStageResource(ui) } } @Path("applications/{appId}/storage/rdd") def getRdds(@PathParam("appId") appId: String): AllRDDResource = { - uiRoot.withSparkUI(appId, None) { ui => + withSparkUI(appId, None) { ui => new AllRDDResource(ui) } } @@ -160,14 +160,14 @@ private[v1] class ApiRootResource extends UIRootFromServletContext { def getRdds( @PathParam("appId") appId: String, @PathParam("attemptId") attemptId: String): AllRDDResource = { - uiRoot.withSparkUI(appId, Some(attemptId)) { ui => + withSparkUI(appId, Some(attemptId)) { ui => new AllRDDResource(ui) } } @Path("applications/{appId}/storage/rdd/{rddId: \\d+}") def getRdd(@PathParam("appId") appId: String): OneRDDResource = { - uiRoot.withSparkUI(appId, None) { ui => + withSparkUI(appId, None) { ui => new OneRDDResource(ui) } } @@ -176,7 +176,7 @@ private[v1] class ApiRootResource extends UIRootFromServletContext { def getRdd( @PathParam("appId") appId: String, @PathParam("attemptId") attemptId: String): OneRDDResource = { - uiRoot.withSparkUI(appId, Some(attemptId)) { ui => + withSparkUI(appId, Some(attemptId)) { ui => new OneRDDResource(ui) } } @@ -184,14 +184,27 @@ private[v1] class ApiRootResource extends UIRootFromServletContext { @Path("applications/{appId}/logs") def getEventLogs( @PathParam("appId") appId: String): EventLogDownloadResource = { - new EventLogDownloadResource(uiRoot, appId, None) + try { + // If this application has attempt IDs, its UI is keyed by appId/attemptId, so + // withSparkUI(appId, None) throws NotFoundException; retry with attempt id "1". + withSparkUI(appId, None) { _ => + new EventLogDownloadResource(uiRoot, appId, None) + } + } catch { + case _: NotFoundException => + withSparkUI(appId, Some("1")) { _ => + new EventLogDownloadResource(uiRoot, appId, None) + } + } } @Path("applications/{appId}/{attemptId}/logs") def getEventLogs( @PathParam("appId") appId: String, @PathParam("attemptId") attemptId: String): EventLogDownloadResource = { - new EventLogDownloadResource(uiRoot, appId, Some(attemptId)) + withSparkUI(appId, Some(attemptId)) { _ => + new EventLogDownloadResource(uiRoot, appId, Some(attemptId)) + } } @Path("version") @@ -234,19 +247,6 @@ private[spark] trait UIRoot { .status(Response.Status.SERVICE_UNAVAILABLE) .build() } - - /** - * Get the spark UI with the given appID, and apply a function - * to it. If there is no such app, throw an appropriate exception - */ - def withSparkUI[T](appId: String, attemptId: Option[String])(f: SparkUI => T): T = { - val appKey = attemptId.map(appId + "/" + _).getOrElse(appId) - getSparkUI(appKey) match { - case Some(ui) => - f(ui) - case None => throw new NotFoundException("no such app: " + appId) - } - } def securityManager: SecurityManager } @@ -263,13 +263,37 @@ private[v1] object UIRootFromServletContext { } } -private[v1] trait UIRootFromServletContext { +private[v1] trait ApiRequestContext { @Context - var servletContext: ServletContext = _ + protected var servletContext: ServletContext = _ + + @Context + protected var httpRequest: HttpServletRequest = _ def uiRoot: UIRoot = UIRootFromServletContext.getUiRoot(servletContext) + + + /** + * Get the Spark UI with the given appId, and apply a function + * to it.
If there is no such app, throw an appropriate exception + */ + def withSparkUI[T](appId: String, attemptId: Option[String])(f: SparkUI => T): T = { + val appKey = attemptId.map(appId + "/" + _).getOrElse(appId) + uiRoot.getSparkUI(appKey) match { + case Some(ui) => + val user = httpRequest.getRemoteUser() + if (!ui.securityManager.checkUIViewPermissions(user)) { + throw new ForbiddenException(raw"""user "$user" is not authorized""") + } + f(ui) + case None => throw new NotFoundException("no such app: " + appId) + } + } } +private[v1] class ForbiddenException(msg: String) extends WebApplicationException( + Response.status(Response.Status.FORBIDDEN).entity(msg).build()) + private[v1] class NotFoundException(msg: String) extends WebApplicationException( new NoSuchElementException(msg), Response diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/JacksonMessageWriter.scala b/core/src/main/scala/org/apache/spark/status/api/v1/JacksonMessageWriter.scala index f6a9f9c5573d..76af33c1a18d 100644 --- a/core/src/main/scala/org/apache/spark/status/api/v1/JacksonMessageWriter.scala +++ b/core/src/main/scala/org/apache/spark/status/api/v1/JacksonMessageWriter.scala @@ -21,7 +21,7 @@ import java.lang.annotation.Annotation import java.lang.reflect.Type import java.nio.charset.StandardCharsets import java.text.SimpleDateFormat -import java.util.{Calendar, SimpleTimeZone} +import java.util.{Calendar, Locale, SimpleTimeZone} import javax.ws.rs.Produces import javax.ws.rs.core.{MediaType, MultivaluedMap} import javax.ws.rs.ext.{MessageBodyWriter, Provider} @@ -86,7 +86,7 @@ private[v1] class JacksonMessageWriter extends MessageBodyWriter[Object]{ private[spark] object JacksonMessageWriter { def makeISODateFormat: SimpleDateFormat = { - val iso8601 = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'GMT'") + val iso8601 = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'GMT'", Locale.US) val cal = Calendar.getInstance(new SimpleTimeZone(0, "GMT")) iso8601.setCalendar(cal) iso8601 diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/SecurityFilter.scala b/core/src/main/scala/org/apache/spark/status/api/v1/SecurityFilter.scala index b4a991eda35f..1cd37185d660 100644 --- a/core/src/main/scala/org/apache/spark/status/api/v1/SecurityFilter.scala +++ b/core/src/main/scala/org/apache/spark/status/api/v1/SecurityFilter.scala @@ -21,14 +21,14 @@ import javax.ws.rs.core.Response import javax.ws.rs.ext.Provider @Provider -private[v1] class SecurityFilter extends ContainerRequestFilter with UIRootFromServletContext { +private[v1] class SecurityFilter extends ContainerRequestFilter with ApiRequestContext { override def filter(req: ContainerRequestContext): Unit = { - val user = Option(req.getSecurityContext.getUserPrincipal).map { _.getName }.orNull + val user = httpRequest.getRemoteUser() if (!uiRoot.securityManager.checkUIViewPermissions(user)) { req.abortWith( Response .status(Response.Status.FORBIDDEN) - .entity(raw"""user "$user"is not authorized""") + .entity(raw"""user "$user" is not authorized""") .build() ) } diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/SimpleDateParam.scala b/core/src/main/scala/org/apache/spark/status/api/v1/SimpleDateParam.scala index 0c71cd238222..d8d5e8958b23 100644 --- a/core/src/main/scala/org/apache/spark/status/api/v1/SimpleDateParam.scala +++ b/core/src/main/scala/org/apache/spark/status/api/v1/SimpleDateParam.scala @@ -17,7 +17,7 @@ package org.apache.spark.status.api.v1 import java.text.{ParseException, SimpleDateFormat} -import java.util.TimeZone 
+import java.util.{Locale, TimeZone} import javax.ws.rs.WebApplicationException import javax.ws.rs.core.Response import javax.ws.rs.core.Response.Status @@ -25,12 +25,12 @@ import javax.ws.rs.core.Response.Status private[v1] class SimpleDateParam(val originalValue: String) { val timestamp: Long = { - val format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSz") + val format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSz", Locale.US) try { format.parse(originalValue).getTime() } catch { case _: ParseException => - val gmtDay = new SimpleDateFormat("yyyy-MM-dd") + val gmtDay = new SimpleDateFormat("yyyy-MM-dd", Locale.US) gmtDay.setTimeZone(TimeZone.getTimeZone("GMT")) try { gmtDay.parse(originalValue).getTime() diff --git a/core/src/main/scala/org/apache/spark/storage/BlockInfoManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockInfoManager.scala index dd8f5bacb9f6..c0e18e54f410 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockInfoManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockInfoManager.scala @@ -281,22 +281,27 @@ private[storage] class BlockInfoManager extends Logging { /** * Release a lock on the given block. + * If the TaskContext is not propagated properly to all child threads of the task, we cannot + * read the TID from TaskContext, so the caller has to pass the TID value explicitly to + * release the lock. + * + * See SPARK-18406 for more discussion of this issue. */ - def unlock(blockId: BlockId): Unit = synchronized { - logTrace(s"Task $currentTaskAttemptId releasing lock for $blockId") + def unlock(blockId: BlockId, taskAttemptId: Option[TaskAttemptId] = None): Unit = synchronized { + val taskId = taskAttemptId.getOrElse(currentTaskAttemptId) + logTrace(s"Task $taskId releasing lock for $blockId") val info = get(blockId).getOrElse { throw new IllegalStateException(s"Block $blockId not found") } if (info.writerTask != BlockInfo.NO_WRITER) { info.writerTask = BlockInfo.NO_WRITER - writeLocksByTask.removeBinding(currentTaskAttemptId, blockId) + writeLocksByTask.removeBinding(taskId, blockId) } else { assert(info.readerCount > 0, s"Block $blockId is not locked for reading") info.readerCount -= 1 - val countsForTask = readLocksByTask(currentTaskAttemptId) + val countsForTask = readLocksByTask(taskId) val newPinCountForTask: Int = countsForTask.remove(blockId, 1) - 1 assert(newPinCountForTask >= 0, - s"Task $currentTaskAttemptId release lock on block $blockId more times than it acquired it") + s"Task $taskId released lock on block $blockId more times than it acquired it") } notifyAll() } diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala index 982b83324e0f..6ff61bdd5b7b 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala @@ -28,6 +28,8 @@ import scala.reflect.ClassTag import scala.util.Random import scala.util.control.NonFatal +import com.google.common.io.ByteStreams + import org.apache.spark._ import org.apache.spark.executor.{DataReadMethod, ShuffleWriteMetrics} import org.apache.spark.internal.Logging @@ -38,6 +40,7 @@ import org.apache.spark.network.netty.SparkTransportConf import org.apache.spark.network.shuffle.ExternalShuffleClient import org.apache.spark.network.shuffle.protocol.ExecutorShuffleInfo import org.apache.spark.rpc.RpcEnv +import org.apache.spark.security.CryptoStreamUtils import org.apache.spark.serializer.{SerializerInstance,
SerializerManager} import org.apache.spark.shuffle.ShuffleManager import org.apache.spark.storage.memory._ @@ -62,7 +65,7 @@ private[spark] class BlockManager( executorId: String, rpcEnv: RpcEnv, val master: BlockManagerMaster, - serializerManager: SerializerManager, + val serializerManager: SerializerManager, val conf: SparkConf, memoryManager: MemoryManager, mapOutputTracker: MapOutputTracker, @@ -315,6 +318,9 @@ private[spark] class BlockManager( /** * Put the block locally, using the given storage level. + * + * '''Important!''' Callers must not mutate or release the data buffer underlying `bytes`. Doing + * so may corrupt or change the data stored by the `BlockManager`. */ override def putBlockData( blockId: BlockId, @@ -448,6 +454,7 @@ private[spark] class BlockManager( case Some(info) => val level = info.level logDebug(s"Level for block $blockId is $level") + val taskAttemptId = Option(TaskContext.get()).map(_.taskAttemptId()) if (level.useMemory && memoryStore.contains(blockId)) { val iter: Iterator[Any] = if (level.deserialized) { memoryStore.getValues(blockId).get @@ -455,7 +462,12 @@ private[spark] class BlockManager( serializerManager.dataDeserializeStream( blockId, memoryStore.getBytes(blockId).get.toInputStream())(info.classTag) } - val ci = CompletionIterator[Any, Iterator[Any]](iter, releaseLock(blockId)) + // We need to capture the current taskId in case the iterator completion is triggered + // from a different thread which does not have TaskContext set; see SPARK-18406 for + // discussion. + val ci = CompletionIterator[Any, Iterator[Any]](iter, { + releaseLock(blockId, taskAttemptId) + }) Some(new BlockResult(ci, DataReadMethod.Memory, info.size)) } else if (level.useDisk && diskStore.contains(blockId)) { val iterToReturn: Iterator[Any] = { @@ -472,7 +484,9 @@ private[spark] class BlockManager( serializerManager.dataDeserializeStream(blockId, stream)(info.classTag) } } - val ci = CompletionIterator[Any, Iterator[Any]](iterToReturn, releaseLock(blockId)) + val ci = CompletionIterator[Any, Iterator[Any]](iterToReturn, { + releaseLock(blockId, taskAttemptId) + }) Some(new BlockResult(ci, DataReadMethod.Disk, info.size)) } else { handleLocalReadFailure(blockId) @@ -648,10 +662,13 @@ private[spark] class BlockManager( } /** - * Release a lock on the given block. + * Release a lock on the given block, optionally with an explicit TID. + * The param `taskAttemptId` should be passed when the correct TID cannot be read from + * TaskContext, for example when the input iterator of a cached RDD is consumed to the end in a + * child thread. */ - def releaseLock(blockId: BlockId): Unit = { - blockInfoManager.unlock(blockId) + def releaseLock(blockId: BlockId, taskAttemptId: Option[Long] = None): Unit = { + blockInfoManager.unlock(blockId, taskAttemptId) } /** @@ -745,24 +762,54 @@ private[spark] class BlockManager( serializerInstance: SerializerInstance, bufferSize: Int, writeMetrics: ShuffleWriteMetrics): DiskBlockObjectWriter = { - val wrapStream: OutputStream => OutputStream = serializerManager.wrapStream(blockId, _) val syncWrites = conf.getBoolean("spark.shuffle.sync", false) - new DiskBlockObjectWriter(file, serializerInstance, bufferSize, wrapStream, + new DiskBlockObjectWriter(file, serializerManager, serializerInstance, bufferSize, syncWrites, writeMetrics, blockId) } /** * Put a new block of serialized bytes to the block manager. * + * '''Important!''' Callers must not mutate or release the data buffer underlying `bytes`. Doing + * so may corrupt or change the data stored by the `BlockManager`.
+ * + * @param encrypt If true, asks the block manager to encrypt the data block before storing, + * when I/O encryption is enabled. This is required for blocks that have been + * read from unencrypted sources, since all the BlockManager read APIs + * automatically do decryption. * @return true if the block was stored or false if an error occurred. */ def putBytes[T: ClassTag]( blockId: BlockId, bytes: ChunkedByteBuffer, level: StorageLevel, - tellMaster: Boolean = true): Boolean = { + tellMaster: Boolean = true, + encrypt: Boolean = false): Boolean = { require(bytes != null, "Bytes is null") - doPutBytes(blockId, bytes, level, implicitly[ClassTag[T]], tellMaster) + + val bytesToStore = + if (encrypt && securityManager.ioEncryptionKey.isDefined) { + try { + val data = bytes.toByteBuffer + val in = new ByteBufferInputStream(data) + val byteBufOut = new ByteBufferOutputStream(data.remaining()) + val out = CryptoStreamUtils.createCryptoOutputStream(byteBufOut, conf, + securityManager.ioEncryptionKey.get) + try { + ByteStreams.copy(in, out) + } finally { + in.close() + out.close() + } + new ChunkedByteBuffer(byteBufOut.toByteBuffer) + } finally { + bytes.dispose() + } + } else { + bytes + } + + doPutBytes(blockId, bytesToStore, level, implicitly[ClassTag[T]], tellMaster) } /** @@ -771,6 +818,9 @@ private[spark] class BlockManager( * * If the block already exists, this method will not overwrite it. * + * '''Important!''' Callers must not mutate or release the data buffer underlying `bytes`. Doing + * so may corrupt or change the data stored by the `BlockManager`. + * * @param keepReadLock if true, this method will hold the read lock when it returns (even if the * block already exists). If false, this method will hold no locks when it * returns. @@ -814,7 +864,15 @@ private[spark] class BlockManager( false } } else { - memoryStore.putBytes(blockId, size, level.memoryMode, () => bytes) + val memoryMode = level.memoryMode + memoryStore.putBytes(blockId, size, memoryMode, () => { + if (memoryMode == MemoryMode.OFF_HEAP && + bytes.chunks.exists(buffer => !buffer.isDirect)) { + bytes.copy(Platform.allocateDirectBuffer) + } else { + bytes + } + }) } if (!putSucceeded && level.useDisk) { logWarning(s"Persisting block $blockId to disk instead.") @@ -1019,7 +1077,7 @@ private[spark] class BlockManager( try { replicate(blockId, bytesToReplicate, level, remoteClassTag) } finally { - bytesToReplicate.dispose() + bytesToReplicate.unmap() } logDebug("Put block %s remotely took %s" .format(blockId, Utils.getUsedTimeMs(remoteStartTime))) diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMessages.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMessages.scala index 6bded9270050..d71acbb4cf77 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMessages.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMessages.scala @@ -43,7 +43,7 @@ private[spark] object BlockManagerMessages { extends ToBlockManagerSlave /** - * Driver -> Executor message to trigger a thread dump. + * Driver to Executor message to trigger a thread dump. 
*/ case object TriggerThreadDump extends ToBlockManagerSlave diff --git a/core/src/main/scala/org/apache/spark/storage/BlockReplicationPolicy.scala b/core/src/main/scala/org/apache/spark/storage/BlockReplicationPolicy.scala index bf087af16a5b..bb8a684b4c7a 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockReplicationPolicy.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockReplicationPolicy.scala @@ -89,17 +89,18 @@ class RandomBlockReplicationPolicy prioritizedPeers } + // scalastyle:off line.size.limit /** * Uses sampling algorithm by Robert Floyd. Finds a random sample in O(n) while - * minimizing space usage - * [[http://math.stackexchange.com/questions/178690/ - * whats-the-proof-of-correctness-for-robert-floyds-algorithm-for-selecting-a-sin]] + * minimizing space usage. Please see + * <a href="http://math.stackexchange.com/questions/178690/whats-the-proof-of-correctness-for-robert-floyds-algorithm-for-selecting-a-sin">here</a>. * * @param n total number of indices * @param m number of samples needed * @param r random number generator * @return list of m random unique indices */ + // scalastyle:on line.size.limit private def getSampleIds(n: Int, m: Int, r: Random): List[Int] = { val indices = (n - m + 1 to n).foldLeft(Set.empty[Int]) {case (set, i) => val t = r.nextInt(i) + 1 diff --git a/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala b/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala index a499827ae159..eb3ff926372a 100644 --- a/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala +++ b/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala @@ -22,7 +22,7 @@ import java.nio.channels.FileChannel import org.apache.spark.executor.ShuffleWriteMetrics import org.apache.spark.internal.Logging -import org.apache.spark.serializer.{SerializationStream, SerializerInstance} +import org.apache.spark.serializer.{SerializationStream, SerializerInstance, SerializerManager} import org.apache.spark.util.Utils /** @@ -37,9 +37,9 @@ import org.apache.spark.util.Utils */ private[spark] class DiskBlockObjectWriter( val file: File, + serializerManager: SerializerManager, serializerInstance: SerializerInstance, bufferSize: Int, - wrapStream: OutputStream => OutputStream, syncWrites: Boolean, // These write metrics concurrently shared with other active DiskBlockObjectWriters who // are themselves performing writes. All updates must be relative. @@ -116,7 +116,7 @@ private[spark] class DiskBlockObjectWriter( initialized = true } - bs = wrapStream(mcs) + bs = serializerManager.wrapStream(blockId, mcs) objOut = serializerInstance.serializeStream(bs) streamOpen = true this @@ -128,16 +128,19 @@ private[spark] class DiskBlockObjectWriter( */ private def closeResources(): Unit = { if (initialized) { - mcs.manualClose() - channel = null - mcs = null - bs = null - fos = null - ts = null - objOut = null - initialized = false - streamOpen = false - hasBeenClosed = true + Utils.tryWithSafeFinally { + mcs.manualClose() + } { + channel = null + mcs = null + bs = null + fos = null + ts = null + objOut = null + initialized = false + streamOpen = false + hasBeenClosed = true + } } } @@ -199,26 +202,29 @@ private[spark] class DiskBlockObjectWriter( def revertPartialWritesAndClose(): File = { // Discard current writes. We do this by flushing the outstanding writes and then // truncating the file to its initial position.
- try { + Utils.tryWithSafeFinally { if (initialized) { writeMetrics.decBytesWritten(reportedPosition - committedPosition) writeMetrics.decRecordsWritten(numRecordsWritten) streamOpen = false closeResources() } - - val truncateStream = new FileOutputStream(file, true) + } { + var truncateStream: FileOutputStream = null try { + truncateStream = new FileOutputStream(file, true) truncateStream.getChannel.truncate(committedPosition) - file + } catch { + case e: Exception => + logError("Uncaught exception while reverting partial writes to file " + file, e) } finally { - truncateStream.close() + if (truncateStream != null) { + truncateStream.close() + truncateStream = null + } } - } catch { - case e: Exception => - logError("Uncaught exception while reverting partial writes to file " + file, e) - file } + file } /** diff --git a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala index 4dc2f362329a..7eda6e97a81e 100644 --- a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala +++ b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala @@ -247,7 +247,7 @@ final class ShuffleBlockFetcherIterator( /** * Fetch the local blocks while we are fetching remote blocks. This is ok because - * [[ManagedBuffer]]'s memory is allocated lazily when we create the input stream, so all we + * `ManagedBuffer`'s memory is allocated lazily when we create the input stream, so all we * track in-memory are the ManagedBuffer references themselves. */ private[this] def fetchLocalBlocks() { @@ -304,6 +304,10 @@ final class ShuffleBlockFetcherIterator( * Throws a FetchFailedException if the next block could not be fetched. */ override def next(): (BlockId, InputStream) = { + if (!hasNext) { + throw new NoSuchElementException + } + numBlocksProcessed += 1 val startFetchWait = System.currentTimeMillis() currentResult = results.take() @@ -423,7 +427,7 @@ object ShuffleBlockFetcherIterator { * @param address BlockManager that the block was fetched from. * @param size estimated size of the block, used to calculate bytesInFlight. * Note that this is NOT the exact bytes. - * @param buf [[ManagedBuffer]] for the content. + * @param buf `ManagedBuffer` for the content. * @param isNetworkReqDone Is this the last network request for this host in this fetch request. */ private[storage] case class SuccessFetchResult( diff --git a/core/src/main/scala/org/apache/spark/storage/StorageUtils.scala b/core/src/main/scala/org/apache/spark/storage/StorageUtils.scala index fb9941bbd9e0..5efdd23f79a2 100644 --- a/core/src/main/scala/org/apache/spark/storage/StorageUtils.scala +++ b/core/src/main/scala/org/apache/spark/storage/StorageUtils.scala @@ -71,7 +71,7 @@ class StorageStatus(val blockManagerId: BlockManagerId, val maxMem: Long) { /** * Return the blocks stored in this block manager. * - * Note that this is somewhat expensive, as it involves cloning the underlying maps and then + * @note This is somewhat expensive, as it involves cloning the underlying maps and then * concatenating them together. Much faster alternatives exist for common operations such as * contains, get, and size. */ @@ -80,7 +80,7 @@ class StorageStatus(val blockManagerId: BlockManagerId, val maxMem: Long) { /** * Return the RDD blocks stored in this block manager. 
* - * Note that this is somewhat expensive, as it involves cloning the underlying maps and then + * @note This is somewhat expensive, as it involves cloning the underlying maps and then * concatenating them together. Much faster alternatives exist for common operations such as * getting the memory, disk, and off-heap memory sizes occupied by this RDD. */ @@ -128,7 +128,8 @@ class StorageStatus(val blockManagerId: BlockManagerId, val maxMem: Long) { /** * Return whether the given block is stored in this block manager in O(1) time. - * Note that this is much faster than `this.blocks.contains`, which is O(blocks) time. + * + * @note This is much faster than `this.blocks.contains`, which is O(blocks) time. */ def containsBlock(blockId: BlockId): Boolean = { blockId match { @@ -141,7 +142,8 @@ class StorageStatus(val blockManagerId: BlockManagerId, val maxMem: Long) { /** * Return the given block stored in this block manager in O(1) time. - * Note that this is much faster than `this.blocks.get`, which is O(blocks) time. + * + * @note This is much faster than `this.blocks.get`, which is O(blocks) time. */ def getBlock(blockId: BlockId): Option[BlockStatus] = { blockId match { @@ -154,19 +156,22 @@ class StorageStatus(val blockManagerId: BlockManagerId, val maxMem: Long) { /** * Return the number of blocks stored in this block manager in O(RDDs) time. - * Note that this is much faster than `this.blocks.size`, which is O(blocks) time. + * + * @note This is much faster than `this.blocks.size`, which is O(blocks) time. */ def numBlocks: Int = _nonRddBlocks.size + numRddBlocks /** * Return the number of RDD blocks stored in this block manager in O(RDDs) time. - * Note that this is much faster than `this.rddBlocks.size`, which is O(RDD blocks) time. + * + * @note This is much faster than `this.rddBlocks.size`, which is O(RDD blocks) time. */ def numRddBlocks: Int = _rddBlocks.values.map(_.size).sum /** * Return the number of blocks that belong to the given RDD in O(1) time. - * Note that this is much faster than `this.rddBlocksById(rddId).size`, which is + * + * @note This is much faster than `this.rddBlocksById(rddId).size`, which is * O(blocks in this RDD) time. */ def numRddBlocksById(rddId: Int): Int = _rddBlocks.get(rddId).map(_.size).getOrElse(0) @@ -231,22 +236,60 @@ class StorageStatus(val blockManagerId: BlockManagerId, val maxMem: Long) { /** Helper methods for storage-related objects. */ private[spark] object StorageUtils extends Logging { + // Ewwww... Reflection!!! See the unmap method for justification + private val memoryMappedBufferFileDescriptorField = { + val mappedBufferClass = classOf[java.nio.MappedByteBuffer] + val fdField = mappedBufferClass.getDeclaredField("fd") + fdField.setAccessible(true) + fdField + } /** - * Attempt to clean up a ByteBuffer if it is memory-mapped. This uses an *unsafe* Sun API that - * might cause errors if one attempts to read from the unmapped buffer, but it's better than - * waiting for the GC to find it because that could lead to huge numbers of open files. There's - * unfortunately no standard API to do this. + * Attempt to clean up a ByteBuffer if it is direct or memory-mapped. This uses an *unsafe* Sun + * API that will cause errors if one attempts to read from the disposed buffer. However, neither + * the bytes allocated to direct buffers nor file descriptors opened for memory-mapped buffers put + * pressure on the garbage collector. Waiting for garbage collection may lead to the depletion of + * off-heap memory or huge numbers of open files. 
There's unfortunately no standard API to + * manually dispose of these kinds of buffers. + * + * See also [[unmap]] */ def dispose(buffer: ByteBuffer): Unit = { if (buffer != null && buffer.isInstanceOf[MappedByteBuffer]) { - logTrace(s"Unmapping $buffer") - if (buffer.asInstanceOf[DirectBuffer].cleaner() != null) { - buffer.asInstanceOf[DirectBuffer].cleaner().clean() + logTrace(s"Disposing of $buffer") + cleanDirectBuffer(buffer.asInstanceOf[DirectBuffer]) + } + } + + /** + * Attempt to unmap a ByteBuffer if it is memory-mapped. This uses an *unsafe* Sun API that will + * cause errors if one attempts to read from the unmapped buffer. However, the file descriptors of + * memory-mapped buffers do not put pressure on the garbage collector. Waiting for garbage + * collection may lead to huge numbers of open files. There's unfortunately no standard API to + * manually unmap memory-mapped buffers. + * + * See also [[dispose]] + */ + def unmap(buffer: ByteBuffer): Unit = { + if (buffer != null && buffer.isInstanceOf[MappedByteBuffer]) { + // Note that direct buffers are instances of MappedByteBuffer. As things stand in Java 8, the + // JDK does not provide a public API to distinguish between direct buffers and memory-mapped + // buffers. As an alternative, we peek beneath the curtains and look for a non-null file + // descriptor in mappedByteBuffer + if (memoryMappedBufferFileDescriptorField.get(buffer) != null) { + logTrace(s"Unmapping $buffer") + cleanDirectBuffer(buffer.asInstanceOf[DirectBuffer]) } } } + private def cleanDirectBuffer(buffer: DirectBuffer) = { + val cleaner = buffer.cleaner() + if (cleaner != null) { + cleaner.clean() + } + } + /** * Update the given list of RDDInfo with the given list of storage statuses. * This method overwrites the old values stored in the RDDInfo's. 
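The dispose/unmap pair above leans on two JDK 8 internals: `sun.nio.ch.DirectBuffer.cleaner()` frees a buffer's off-heap memory eagerly, and the private `MappedByteBuffer.fd` field tells file-backed mappings apart from plain direct buffers (which, as the comment notes, are also instances of `MappedByteBuffer`, with no public API to distinguish the two). A minimal standalone sketch of the same trick, assuming Java 8 and access to the `sun.nio.ch` package; the object and method names here are illustrative, not Spark's:

```scala
import java.nio.{ByteBuffer, MappedByteBuffer}

import sun.nio.ch.DirectBuffer

object BufferCleanerSketch {
  // MappedByteBuffer.fd is non-null only for buffers backed by a memory-mapped
  // file; ByteBuffer.allocateDirect leaves it null even though the buffer it
  // returns is a MappedByteBuffer at runtime.
  private val fdField = {
    val f = classOf[MappedByteBuffer].getDeclaredField("fd")
    f.setAccessible(true)
    f
  }

  def isMemoryMapped(buffer: ByteBuffer): Boolean =
    buffer.isInstanceOf[MappedByteBuffer] && fdField.get(buffer) != null

  // Eagerly release the memory (and any file mapping) instead of waiting for
  // the GC; reading from the buffer afterwards is undefined behavior.
  def clean(buffer: ByteBuffer): Unit = buffer match {
    case db: DirectBuffer if db.cleaner() != null => db.cleaner().clean()
    case _ => // heap buffer: nothing to release eagerly
  }
}
```

This split is presumably why `replicate()` above now calls `unmap()` rather than `dispose()`: a memory-mapped copy read from disk can be unmapped immediately, while a direct buffer that may still back an in-memory block is left for the GC.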
diff --git a/core/src/main/scala/org/apache/spark/storage/memory/MemoryStore.scala b/core/src/main/scala/org/apache/spark/storage/memory/MemoryStore.scala index 095d32407f34..929a0808bd23 100644 --- a/core/src/main/scala/org/apache/spark/storage/memory/MemoryStore.scala +++ b/core/src/main/scala/org/apache/spark/storage/memory/MemoryStore.scala @@ -31,7 +31,7 @@ import org.apache.spark.{SparkConf, TaskContext} import org.apache.spark.internal.Logging import org.apache.spark.memory.{MemoryManager, MemoryMode} import org.apache.spark.serializer.{SerializationStream, SerializerManager} -import org.apache.spark.storage.{BlockId, BlockInfoManager, StorageLevel} +import org.apache.spark.storage.{BlockId, BlockInfoManager, StorageLevel, StreamBlockId} import org.apache.spark.unsafe.Platform import org.apache.spark.util.{SizeEstimator, Utils} import org.apache.spark.util.collection.SizeTrackingVector @@ -334,7 +334,8 @@ private[spark] class MemoryStore( val bbos = new ChunkedByteBufferOutputStream(initialMemoryThreshold.toInt, allocator) redirectableStream.setOutputStream(bbos) val serializationStream: SerializationStream = { - val ser = serializerManager.getSerializer(classTag).newInstance() + val autoPick = !blockId.isInstanceOf[StreamBlockId] + val ser = serializerManager.getSerializer(classTag, autoPick).newInstance() ser.serializeStream(serializerManager.wrapStream(blockId, redirectableStream)) } @@ -693,7 +694,7 @@ private[storage] class PartiallyUnrolledIterator[T]( } override def next(): T = { - if (unrolled == null) { + if (unrolled == null || !unrolled.hasNext) { rest.next() } else { unrolled.next() diff --git a/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala b/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala index 35c3c8d00f99..639b8577617f 100644 --- a/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala @@ -45,6 +45,9 @@ import org.apache.spark.util.Utils */ private[spark] object JettyUtils extends Logging { + val SPARK_CONNECTOR_NAME = "Spark" + val REDIRECT_CONNECTOR_NAME = "HttpsRedirect" + // Base type for a function that returns something based on an HTTP request. Allows for // implicit conversion from many types of functions to jetty Handlers. 
type Responder[T] = HttpServletRequest => T @@ -87,9 +90,9 @@ private[spark] object JettyUtils extends Logging { response.setHeader("X-Frame-Options", xFrameOptionsValue) response.getWriter.print(servletParams.extractFn(result)) } else { - response.setStatus(HttpServletResponse.SC_UNAUTHORIZED) + response.setStatus(HttpServletResponse.SC_FORBIDDEN) response.setHeader("Cache-Control", "no-cache, no-store, must-revalidate") - response.sendError(HttpServletResponse.SC_UNAUTHORIZED, + response.sendError(HttpServletResponse.SC_FORBIDDEN, "User is not authorized to access this page.") } } catch { @@ -274,17 +277,18 @@ private[spark] object JettyUtils extends Logging { conf: SparkConf, serverName: String = ""): ServerInfo = { - val collection = new ContextHandlerCollection addFilters(handlers, conf) val gzipHandlers = handlers.map { h => + h.setVirtualHosts(Array("@" + SPARK_CONNECTOR_NAME)) + val gzipHandler = new GzipHandler gzipHandler.setHandler(h) gzipHandler } // Bind to the given port, or throw a java.net.BindException if the port is occupied - def connect(currentPort: Int): (Server, Int) = { + def connect(currentPort: Int): ((Server, Option[Int]), Int) = { val pool = new QueuedThreadPool if (serverName.nonEmpty) { pool.setName(serverName) @@ -292,7 +296,9 @@ private[spark] object JettyUtils extends Logging { pool.setDaemon(true) val server = new Server(pool) - val connectors = new ArrayBuffer[ServerConnector] + val connectors = new ArrayBuffer[ServerConnector]() + val collection = new ContextHandlerCollection + // Create a connector on port currentPort to listen for HTTP requests val httpConnector = new ServerConnector( server, @@ -306,26 +312,33 @@ private[spark] object JettyUtils extends Logging { httpConnector.setPort(currentPort) connectors += httpConnector - sslOptions.createJettySslContextFactory().foreach { factory => - // If the new port wraps around, do not try a privileged port. - val securePort = - if (currentPort != 0) { - (currentPort + 400 - 1024) % (65536 - 1024) + 1024 - } else { - 0 - } - val scheme = "https" - // Create a connector on port securePort to listen for HTTPS requests - val connector = new ServerConnector(server, factory) - connector.setPort(securePort) - - connectors += connector - - // redirect the HTTP requests to HTTPS port - collection.addHandler(createRedirectHttpsHandler(securePort, scheme)) + val httpsConnector = sslOptions.createJettySslContextFactory() match { + case Some(factory) => + // If the new port wraps around, do not try a privileged port. + val securePort = + if (currentPort != 0) { + (currentPort + 400 - 1024) % (65536 - 1024) + 1024 + } else { + 0 + } + val scheme = "https" + // Create a connector on port securePort to listen for HTTPS requests + val connector = new ServerConnector(server, factory) + connector.setPort(securePort) + connector.setName(SPARK_CONNECTOR_NAME) + connectors += connector + + // redirect the HTTP requests to HTTPS port + httpConnector.setName(REDIRECT_CONNECTOR_NAME) + collection.addHandler(createRedirectHttpsHandler(connector, scheme)) + Some(connector) + + case None => + // No SSL, so the HTTP connector becomes the official one where all contexts bind. + httpConnector.setName(SPARK_CONNECTOR_NAME) + None } - gzipHandlers.foreach(collection.addHandler) // As each acceptor and each selector will use one thread, the number of threads should at // least be the number of acceptors and selectors plus 1. 
(See SPARK-13776) var minThreads = 1 @@ -337,17 +350,20 @@ // The number of selectors always equals the number of acceptors minThreads += connector.getAcceptors * 2 } - server.setConnectors(connectors.toArray) pool.setMaxThreads(math.max(pool.getMaxThreads, minThreads)) val errorHandler = new ErrorHandler() errorHandler.setShowStacks(true) errorHandler.setServer(server) server.addBean(errorHandler) + + gzipHandlers.foreach(collection.addHandler) server.setHandler(collection) + + server.setConnectors(connectors.toArray) try { server.start() - (server, httpConnector.getLocalPort) + ((server, httpsConnector.map(_.getLocalPort())), httpConnector.getLocalPort) } catch { case e: Exception => server.stop() @@ -356,13 +372,18 @@ } } - val (server, boundPort) = Utils.startServiceOnPort[Server](port, connect, conf, serverName) - ServerInfo(server, boundPort, collection) + val ((server, securePort), boundPort) = Utils.startServiceOnPort(port, connect, conf, + serverName) + ServerInfo(server, boundPort, securePort, + server.getHandler().asInstanceOf[ContextHandlerCollection]) } - private def createRedirectHttpsHandler(securePort: Int, scheme: String): ContextHandler = { + private def createRedirectHttpsHandler( + httpsConnector: ServerConnector, + scheme: String): ContextHandler = { val redirectHandler: ContextHandler = new ContextHandler redirectHandler.setContextPath("/") + redirectHandler.setVirtualHosts(Array("@" + REDIRECT_CONNECTOR_NAME)) redirectHandler.setHandler(new AbstractHandler { override def handle( target: String, @@ -372,8 +393,8 @@ if (baseRequest.isSecure) { return } - val httpsURI = createRedirectURI(scheme, baseRequest.getServerName, securePort, - baseRequest.getRequestURI, baseRequest.getQueryString) + val httpsURI = createRedirectURI(scheme, baseRequest.getServerName, + httpsConnector.getLocalPort, baseRequest.getRequestURI, baseRequest.getQueryString) response.setContentLength(0) response.encodeRedirectURL(httpsURI) response.sendRedirect(httpsURI) @@ -442,7 +463,23 @@ private[spark] case class ServerInfo( server: Server, boundPort: Int, - rootHandler: ContextHandlerCollection) { + securePort: Option[Int], + private val rootHandler: ContextHandlerCollection) { + + def addHandler(handler: ContextHandler): Unit = { + handler.setVirtualHosts(Array("@" + JettyUtils.SPARK_CONNECTOR_NAME)) + rootHandler.addHandler(handler) + if (!handler.isStarted()) { + handler.start() + } + } + + def removeHandler(handler: ContextHandler): Unit = { + rootHandler.removeHandler(handler) + if (handler.isStarted) { + handler.stop() + } + } def stop(): Unit = { server.stop() diff --git a/core/src/main/scala/org/apache/spark/ui/PagedTable.scala b/core/src/main/scala/org/apache/spark/ui/PagedTable.scala index 2a7c16b04bf7..79974df2603f 100644 --- a/core/src/main/scala/org/apache/spark/ui/PagedTable.scala +++ b/core/src/main/scala/org/apache/spark/ui/PagedTable.scala @@ -175,13 +175,14 @@ private[ui] trait PagedTable[T] { val hiddenFormFields = { if (goButtonFormPath.contains('?')) { - val querystring = goButtonFormPath.split("\\?", 2)(1) + val queryString = goButtonFormPath.split("\\?", 2)(1) + val search = queryString.split("#")(0) Splitter .on('&') .trimResults() .omitEmptyStrings() .withKeyValueSeparator("=") - .split(querystring) + .split(search) .asScala .filterKeys(_ != pageSizeFormField)
.filterKeys(_ != prevPageSizeFormField) diff --git a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala index c0d1a2220f62..40aa7ff8a786 100644 --- a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala @@ -25,6 +25,8 @@ import scala.util.control.NonFatal import scala.xml._ import scala.xml.transform.{RewriteRule, RuleTransformer} +import org.apache.commons.lang3.StringEscapeUtils + import org.apache.spark.internal.Logging import org.apache.spark.ui.scope.RDDOperationGraph @@ -34,9 +36,12 @@ private[spark] object UIUtils extends Logging { val TABLE_CLASS_STRIPED = TABLE_CLASS_NOT_STRIPED + " table-striped" val TABLE_CLASS_STRIPED_SORTABLE = TABLE_CLASS_STRIPED + " sortable" + private val NEWLINE_AND_SINGLE_QUOTE_REGEX = raw"(?i)(\r\n|\n|\r|%0D%0A|%0A|%0D|'|%27)".r + // SimpleDateFormat is not thread-safe. Don't expose it to avoid improper use. private val dateFormat = new ThreadLocal[SimpleDateFormat]() { - override def initialValue(): SimpleDateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss") + override def initialValue(): SimpleDateFormat = + new SimpleDateFormat("yyyy/MM/dd HH:mm:ss", Locale.US) } def formatDate(date: Date): String = dateFormat.get.format(date) @@ -170,6 +175,7 @@ private[spark] object UIUtils extends Logging { + } def vizHeaderNodes: Seq[Node] = { @@ -420,8 +426,8 @@ private[spark] object UIUtils extends Logging { * the whole string will be rendered as simple escaped text. * * Note: In terms of security, only anchor tags with root relative links are supported. So any - * attempts to embed links outside Spark UI, or other tags like
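A recurring change throughout this patch is pinning `SimpleDateFormat` to `Locale.US` (JacksonMessageWriter, SimpleDateParam, and the UIUtils `dateFormat` above). The single-argument constructor picks up the JVM's default locale, so the same pattern can format or parse differently from machine to machine. A small illustration of the failure mode this avoids (a standalone sketch, not part of the patch):

```scala
import java.text.SimpleDateFormat
import java.util.{Date, Locale}

object LocaleDateDemo extends App {
  val epoch = new Date(0L)

  // With the Thai "TH" locale variant, SimpleDateFormat uses the Buddhist
  // calendar, so 1970 CE is rendered as the year 2513.
  val thai = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss", new Locale("th", "TH", "TH"))
  // Pinning Locale.US keeps the output independent of the default locale.
  val pinned = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss", Locale.US)

  println(thai.format(epoch))   // e.g. 2513/01/01 ... (also shifted by the default time zone)
  println(pinned.format(epoch)) // 1970/01/01 ... in the Gregorian calendar
}
```

If a timestamp produced by such a formatter is later parsed on a machine with a different default locale, the round trip silently yields the wrong date; fixing the locale on both sides rules that out.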