diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 565a2b1a68b5f..cab39d68c3f52 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -267,9 +267,11 @@ export("as.DataFrame",
        "createExternalTable",
        "dropTempTable",
        "jsonFile",
+       "read.json",
        "loadDF",
        "parquetFile",
        "read.df",
+       "read.parquet",
        "sql",
        "table",
        "tableNames",
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 81b4e6b91d8a2..975b058c0aaf1 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -24,14 +24,14 @@ setOldClass("jobj")
 
 #' @title S4 class that represents a DataFrame
 #' @description DataFrames can be created using functions like \link{createDataFrame},
-#'              \link{jsonFile}, \link{table} etc.
+#'              \link{read.json}, \link{table} etc.
 #' @family DataFrame functions
 #' @rdname DataFrame
 #' @docType class
 #'
 #' @slot env An R environment that stores bookkeeping states of the DataFrame
 #' @slot sdf A Java object reference to the backing Scala DataFrame
-#' @seealso \link{createDataFrame}, \link{jsonFile}, \link{table}
+#' @seealso \link{createDataFrame}, \link{read.json}, \link{table}
 #' @seealso \url{https://spark.apache.org/docs/latest/sparkr.html#sparkr-dataframes}
 #' @export
 #' @examples
@@ -77,7 +77,7 @@ dataFrame <- function(sdf, isCached = FALSE) {
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' printSchema(df)
 #'}
 setMethod("printSchema",
@@ -102,7 +102,7 @@ setMethod("printSchema",
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' dfSchema <- schema(df)
 #'}
 setMethod("schema",
@@ -126,7 +126,7 @@ setMethod("schema",
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' explain(df, TRUE)
 #'}
 setMethod("explain",
@@ -157,7 +157,7 @@ setMethod("explain",
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' isLocal(df)
 #'}
 setMethod("isLocal",
@@ -182,7 +182,7 @@ setMethod("isLocal",
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' showDF(df)
 #'}
 setMethod("showDF",
@@ -207,7 +207,7 @@ setMethod("showDF",
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' df
 #'}
 setMethod("show", "DataFrame",
@@ -234,7 +234,7 @@ setMethod("show", "DataFrame",
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' dtypes(df)
 #'}
 setMethod("dtypes",
@@ -261,7 +261,7 @@ setMethod("dtypes",
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' columns(df)
 #' colnames(df)
 #'}
@@ -376,7 +376,7 @@ setMethod("coltypes",
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' coltypes(df) <- c("character", "integer")
 #' coltypes(df) <- c(NA, "numeric")
 #'}
@@ -423,7 +423,7 @@ setMethod("coltypes<-",
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' registerTempTable(df, "json_df")
 #' new_df <- sql(sqlContext, "SELECT * FROM json_df")
 #'}
@@ -476,7 +476,7 @@ setMethod("insertInto",
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' cache(df)
 #'}
 setMethod("cache",
@@ -504,7 +504,7 @@ setMethod("cache",
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' persist(df, "MEMORY_AND_DISK")
 #'}
 setMethod("persist",
@@ -532,7 +532,7 @@ setMethod("persist",
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' persist(df, "MEMORY_AND_DISK")
 #' unpersist(df)
 #'}
@@ -560,7 +560,7 @@ setMethod("unpersist",
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' newDF <- repartition(df, 2L)
 #'}
 setMethod("repartition",
@@ -585,7 +585,7 @@ setMethod("repartition",
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' newRDD <- toJSON(df)
 #'}
 setMethod("toJSON",
@@ -613,7 +613,7 @@ setMethod("toJSON",
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' saveAsParquetFile(df, "/tmp/sparkr-tmp/")
 #'}
 setMethod("saveAsParquetFile",
@@ -637,7 +637,7 @@ setMethod("saveAsParquetFile",
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' distinctDF <- distinct(df)
 #'}
 setMethod("distinct",
@@ -672,7 +672,7 @@ setMethod("unique",
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' collect(sample(df, FALSE, 0.5))
 #' collect(sample(df, TRUE, 0.5))
 #'}
@@ -711,7 +711,7 @@ setMethod("sample_frac",
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' count(df)
 #' }
 setMethod("count",
@@ -741,7 +741,7 @@ setMethod("nrow",
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' ncol(df)
 #' }
 setMethod("ncol",
@@ -762,7 +762,7 @@ setMethod("ncol",
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' dim(df)
 #' }
 setMethod("dim",
@@ -786,7 +786,7 @@ setMethod("dim",
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' collected <- collect(df)
 #' firstName <- collected[[1]]$name
 #' }
@@ -858,7 +858,7 @@ setMethod("collect",
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' limitedDF <- limit(df, 10)
 #' }
 setMethod("limit",
@@ -879,7 +879,7 @@ setMethod("limit",
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' take(df, 2)
 #' }
 setMethod("take",
@@ -908,7 +908,7 @@ setMethod("take",
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' head(df)
 #' }
 setMethod("head",
@@ -931,7 +931,7 @@ setMethod("head",
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' first(df)
 #' }
 setMethod("first",
@@ -952,7 +952,7 @@ setMethod("first",
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' rdd <- toRDD(df)
 #'}
 setMethod("toRDD",
@@ -1185,7 +1185,7 @@ setMethod("[", signature(x = "DataFrame", i = "Column"),
 #'
 #' Return subsets of DataFrame according to given conditions
 #' @param x A DataFrame
-#' @param subset A logical expression to filter on rows
+#' @param subset (Optional) A logical expression to filter on rows
 #' @param select expression for the single Column or a list of columns to select from the DataFrame
 #' @return A new DataFrame containing only the rows that meet the condition with selected columns
 #' @export
@@ -1206,10 +1206,15 @@ setMethod("[", signature(x = "DataFrame", i = "Column"),
 #'   df[df$age %in% c(19, 30), 1:2]
 #'   subset(df, df$age %in% c(19, 30), 1:2)
 #'   subset(df, df$age %in% c(19), select = c(1,2))
+#'   subset(df, select = c(1,2))
 #' }
 setMethod("subset", signature(x = "DataFrame"),
           function(x, subset, select, ...) {
-            x[subset, select, ...]
+            if (missing(subset)) {
+              x[, select, ...]
+            } else {
+              x[subset, select, ...]
+            }
           })
 
 #' Select
@@ -1293,7 +1298,7 @@ setMethod("select",
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' selectExpr(df, "col1", "(col2 * 5) as newCol")
 #' }
 setMethod("selectExpr",
@@ -1322,7 +1327,7 @@ setMethod("selectExpr",
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' newDF <- withColumn(df, "newCol", df$col1 * 5)
 #' }
 setMethod("withColumn",
@@ -1347,7 +1352,7 @@ setMethod("withColumn",
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' newDF <- mutate(df, newCol = df$col1 * 5, newCol2 = df$col1 * 2)
 #' names(newDF) # Will contain newCol, newCol2
 #' newDF2 <- transform(df, newCol = df$col1 / 5, newCol2 = df$col1 * 2)
@@ -1397,7 +1402,7 @@ setMethod("transform",
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' newDF <- withColumnRenamed(df, "col1", "newCol1")
 #' }
 setMethod("withColumnRenamed",
@@ -1422,7 +1427,7 @@ setMethod("withColumnRenamed",
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' newDF <- rename(df, col1 = df$newCol1)
 #' }
 setMethod("rename",
@@ -1466,7 +1471,7 @@ setClassUnion("characterOrColumn", c("character", "Column"))
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' arrange(df, df$col1)
 #' arrange(df, asc(df$col1), desc(abs(df$col2)))
 #' arrange(df, "col1", decreasing = TRUE)
@@ -1542,7 +1547,7 @@ setMethod("orderBy",
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' filter(df, "col1 > 0")
 #' filter(df, df$col2 != "abcdefg")
 #' }
@@ -1586,8 +1591,8 @@ setMethod("where",
 #'\dontrun{
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
-#' df1 <- jsonFile(sqlContext, path)
-#' df2 <- jsonFile(sqlContext, path2)
+#' df1 <- read.json(sqlContext, path)
+#' df2 <- read.json(sqlContext, path2)
 #' join(df1, df2) # Performs a Cartesian
 #' join(df1, df2, df1$col1 == df2$col2) # Performs an inner join based on expression
 #' join(df1, df2, df1$col1 == df2$col2, "right_outer")
@@ -1643,8 +1648,8 @@ setMethod("join",
 #'\dontrun{
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
-#' df1 <- jsonFile(sqlContext, path)
-#' df2 <- jsonFile(sqlContext, path2)
+#' df1 <- read.json(sqlContext, path)
+#' df2 <- read.json(sqlContext, path2)
 #' merge(df1, df2) # Performs a Cartesian
 #' merge(df1, df2, by = "col1") # Performs an inner join based on expression
 #' merge(df1, df2, by.x = "col1", by.y = "col2", all.y = TRUE)
@@ -1776,8 +1781,8 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) {
 #'\dontrun{
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
-#' df1 <- jsonFile(sqlContext, path)
-#' df2 <- jsonFile(sqlContext, path2)
+#' df1 <- read.json(sqlContext, path)
+#' df2 <- read.json(sqlContext, path2)
 #' unioned <- unionAll(df, df2)
 #' }
 setMethod("unionAll",
@@ -1819,8 +1824,8 @@ setMethod("rbind",
 #'\dontrun{
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
-#' df1 <- jsonFile(sqlContext, path)
-#' df2 <- jsonFile(sqlContext, path2)
+#' df1 <- read.json(sqlContext, path)
+#' df2 <- read.json(sqlContext, path2)
 #' intersectDF <- intersect(df, df2)
 #' }
 setMethod("intersect",
@@ -1846,8 +1851,8 @@ setMethod("intersect",
 #'\dontrun{
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
-#' df1 <- jsonFile(sqlContext, path)
-#' df2 <- jsonFile(sqlContext, path2)
+#' df1 <- read.json(sqlContext, path)
+#' df2 <- read.json(sqlContext, path2)
 #' exceptDF <- except(df, df2)
 #' }
 #' @rdname except
@@ -1887,7 +1892,7 @@ setMethod("except",
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' write.df(df, "myfile", "parquet", "overwrite")
 #' saveDF(df, parquetPath2, "parquet", mode = saveMode, mergeSchema = mergeSchema)
 #' }
@@ -1952,7 +1957,7 @@ setMethod("saveDF",
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' saveAsTable(df, "myfile")
 #' }
 setMethod("saveAsTable",
@@ -1993,7 +1998,7 @@ setMethod("saveAsTable",
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' describe(df)
 #' describe(df, "col1")
 #' describe(df, "col1", "col2")
@@ -2049,7 +2054,7 @@ setMethod("summary",
 #' sc <- sparkR.init()
 #' sqlCtx <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- read.json(sqlCtx, path)
 #' dropna(df)
 #' }
 setMethod("dropna",
@@ -2103,7 +2108,7 @@ setMethod("na.omit",
 #' sc <- sparkR.init()
 #' sqlCtx <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
+#' df <- read.json(sqlCtx, path)
 #' fillna(df, 1)
 #' fillna(df, list("age" = 20, "name" = "unknown"))
 #' }
diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R
index 85541c8e22447..9243d70e66f75 100644
--- a/R/pkg/R/SQLContext.R
+++ b/R/pkg/R/SQLContext.R
@@ -208,24 +208,33 @@ setMethod("toDF", signature(x = "RDD"),
 #' @param sqlContext SQLContext to use
 #' @param path Path of file to read. A vector of multiple paths is allowed.
 #' @return DataFrame
+#' @rdname read.json
+#' @name read.json
 #' @export
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
+#' df <- read.json(sqlContext, path)
 #' df <- jsonFile(sqlContext, path)
 #' }
-
-jsonFile <- function(sqlContext, path) {
+read.json <- function(sqlContext, path) {
   # Allow the user to have a more flexible definiton of the text file path
-  path <- suppressWarnings(normalizePath(path))
-  # Convert a string vector of paths to a string containing comma separated paths
-  path <- paste(path, collapse = ",")
-  sdf <- callJMethod(sqlContext, "jsonFile", path)
+  paths <- as.list(suppressWarnings(normalizePath(path)))
+  read <- callJMethod(sqlContext, "read")
+  sdf <- callJMethod(read, "json", paths)
   dataFrame(sdf)
 }
 
+#' @rdname read.json
+#' @name jsonFile
+#' @export
+jsonFile <- function(sqlContext, path) {
+  .Deprecated("read.json")
+  read.json(sqlContext, path)
+}
+
 
 #' JSON RDD
 #'
@@ -256,18 +265,30 @@ jsonRDD <- function(sqlContext, rdd, schema = NULL, samplingRatio = 1.0) {
   }
 }
 
-
 #' Create a DataFrame from a Parquet file.
 #'
 #' Loads a Parquet file, returning the result as a DataFrame.
 #'
 #' @param sqlContext SQLContext to use
-#' @param ... Path(s) of parquet file(s) to read.
+#' @param path Path of the Parquet file to read. A vector of multiple paths is allowed.
 #' @return DataFrame
+#' @rdname read.parquet
+#' @name read.parquet
 #' @export
+read.parquet <- function(sqlContext, path) {
+  # Normalize the Parquet path(s); warnings for paths that cannot be resolved are suppressed
+  paths <- as.list(suppressWarnings(normalizePath(path)))
+  read <- callJMethod(sqlContext, "read")
+  sdf <- callJMethod(read, "parquet", paths)
+  dataFrame(sdf)
+}
 
+#' @rdname read.parquet
+#' @name parquetFile
+#' @export
 # TODO: Implement saveasParquetFile and write examples for both
 parquetFile <- function(sqlContext, ...) {
+  .Deprecated("read.parquet")
   # Allow the user to have a more flexible definiton of the text file path
   paths <- lapply(list(...), function(x) suppressWarnings(normalizePath(x)))
   sdf <- callJMethod(sqlContext, "parquetFile", paths)
@@ -287,7 +308,7 @@ parquetFile <- function(sqlContext, ...) {
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' registerTempTable(df, "table")
 #' new_df <- sql(sqlContext, "SELECT * FROM table")
 #' }
@@ -311,7 +332,7 @@ sql <- function(sqlContext, sqlQuery) {
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' registerTempTable(df, "table")
 #' new_df <- table(sqlContext, "table")
 #' }
@@ -384,7 +405,7 @@ tableNames <- function(sqlContext, databaseName = NULL) {
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' registerTempTable(df, "table")
 #' cacheTable(sqlContext, "table")
 #' }
@@ -406,7 +427,7 @@ cacheTable <- function(sqlContext, tableName) {
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
+#' df <- read.json(sqlContext, path)
 #' registerTempTable(df, "table")
 #' uncacheTable(sqlContext, "table")
 #' }
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index 39fc94aea5fb1..ed9b2c9d4d16c 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -330,7 +330,7 @@ writeLines(mockLinesMapType, mapTypeJsonPath)
 
 test_that("Collect DataFrame with complex types", {
   # ArrayType
-  df <- jsonFile(sqlContext, complexTypeJsonPath)
+  df <- read.json(sqlContext, complexTypeJsonPath)
 
   ldf <- collect(df)
   expect_equal(nrow(ldf), 3)
@@ -357,7 +357,7 @@ test_that("Collect DataFrame with complex types", {
   expect_equal(bob$height, 176.5)
 
   # StructType
-  df <- jsonFile(sqlContext, mapTypeJsonPath)
+  df <- read.json(sqlContext, mapTypeJsonPath)
   expect_equal(dtypes(df), list(c("info", "struct<age:bigint,height:double>"),
                                 c("name", "string")))
   ldf <- collect(df)
@@ -371,10 +371,22 @@ test_that("Collect DataFrame with complex types", {
   expect_equal(bob$height, 176.5)
 })
 
-test_that("jsonFile() on a local file returns a DataFrame", {
-  df <- jsonFile(sqlContext, jsonPath)
+test_that("read.json()/jsonFile() on a local file returns a DataFrame", {
+  df <- read.json(sqlContext, jsonPath)
   expect_is(df, "DataFrame")
   expect_equal(count(df), 3)
+  # read.json()/jsonFile() work with multiple input paths
+  jsonPath2 <- tempfile(pattern="jsonPath2", fileext=".json")
+  write.df(df, jsonPath2, "json", mode="overwrite")
+  jsonDF1 <- read.json(sqlContext, c(jsonPath, jsonPath2))
+  expect_is(jsonDF1, "DataFrame")
+  expect_equal(count(jsonDF1), 6)
+  # Suppress warnings because jsonFile is deprecated
+  jsonDF2 <- suppressWarnings(jsonFile(sqlContext, c(jsonPath, jsonPath2)))
+  expect_is(jsonDF2, "DataFrame")
+  expect_equal(count(jsonDF2), 6)
+
+  unlink(jsonPath2, recursive = TRUE)  # write.df() output is a directory; needs recursive
 })
 
 test_that("jsonRDD() on a RDD with json string", {
@@ -391,7 +403,7 @@ test_that("jsonRDD() on a RDD with json string", {
 })
 
 test_that("test cache, uncache and clearCache", {
-  df <- jsonFile(sqlContext, jsonPath)
+  df <- read.json(sqlContext, jsonPath)
   registerTempTable(df, "table1")
   cacheTable(sqlContext, "table1")
   uncacheTable(sqlContext, "table1")
@@ -400,7 +412,7 @@ test_that("test cache, uncache and clearCache", {
 })
 
 test_that("test tableNames and tables", {
-  df <- jsonFile(sqlContext, jsonPath)
+  df <- read.json(sqlContext, jsonPath)
   registerTempTable(df, "table1")
   expect_equal(length(tableNames(sqlContext)), 1)
   df <- tables(sqlContext)
@@ -409,7 +421,7 @@ test_that("test tableNames and tables", {
 })
 
 test_that("registerTempTable() results in a queryable table and sql() results in a new DataFrame", {
-  df <- jsonFile(sqlContext, jsonPath)
+  df <- read.json(sqlContext, jsonPath)
   registerTempTable(df, "table1")
   newdf <- sql(sqlContext, "SELECT * FROM table1 where name = 'Michael'")
   expect_is(newdf, "DataFrame")
@@ -445,7 +457,7 @@ test_that("insertInto() on a registered table", {
 })
 
 test_that("table() returns a new DataFrame", {
-  df <- jsonFile(sqlContext, jsonPath)
+  df <- read.json(sqlContext, jsonPath)
   registerTempTable(df, "table1")
   tabledf <- table(sqlContext, "table1")
   expect_is(tabledf, "DataFrame")
@@ -458,14 +470,14 @@ test_that("table() returns a new DataFrame", {
 })
 
 test_that("toRDD() returns an RRDD", {
-  df <- jsonFile(sqlContext, jsonPath)
+  df <- read.json(sqlContext, jsonPath)
   testRDD <- toRDD(df)
   expect_is(testRDD, "RDD")
   expect_equal(count(testRDD), 3)
 })
 
 test_that("union on two RDDs created from DataFrames returns an RRDD", {
-  df <- jsonFile(sqlContext, jsonPath)
+  df <- read.json(sqlContext, jsonPath)
   RDD1 <- toRDD(df)
   RDD2 <- toRDD(df)
   unioned <- unionRDD(RDD1, RDD2)
@@ -487,7 +499,7 @@ test_that("union on mixed serialization types correctly returns a byte RRDD", {
   writeLines(textLines, textPath)
   textRDD <- textFile(sc, textPath)
 
-  df <- jsonFile(sqlContext, jsonPath)
+  df <- read.json(sqlContext, jsonPath)
   dfRDD <- toRDD(df)
 
   unionByte <- unionRDD(rdd, dfRDD)
@@ -505,7 +517,7 @@ test_that("union on mixed serialization types correctly returns a byte RRDD", {
 
 test_that("objectFile() works with row serialization", {
   objectPath <- tempfile(pattern="spark-test", fileext=".tmp")
-  df <- jsonFile(sqlContext, jsonPath)
+  df <- read.json(sqlContext, jsonPath)
   dfRDD <- toRDD(df)
   saveAsObjectFile(coalesce(dfRDD, 1L), objectPath)
   objectIn <- objectFile(sc, objectPath)
@@ -516,7 +528,7 @@ test_that("objectFile() works with row serialization", {
 })
 
 test_that("lapply() on a DataFrame returns an RDD with the correct columns", {
-  df <- jsonFile(sqlContext, jsonPath)
+  df <- read.json(sqlContext, jsonPath)
   testRDD <- lapply(df, function(row) {
     row$newCol <- row$age + 5
     row
@@ -528,7 +540,7 @@ test_that("lapply() on a DataFrame returns an RDD with the correct columns", {
 })
 
 test_that("collect() returns a data.frame", {
-  df <- jsonFile(sqlContext, jsonPath)
+  df <- read.json(sqlContext, jsonPath)
   rdf <- collect(df)
   expect_true(is.data.frame(rdf))
   expect_equal(names(rdf)[1], "age")
@@ -550,14 +562,14 @@ test_that("collect() returns a data.frame", {
 })
 
 test_that("limit() returns DataFrame with the correct number of rows", {
-  df <- jsonFile(sqlContext, jsonPath)
+  df <- read.json(sqlContext, jsonPath)
   dfLimited <- limit(df, 2)
   expect_is(dfLimited, "DataFrame")
   expect_equal(count(dfLimited), 2)
 })
 
 test_that("collect() and take() on a DataFrame return the same number of rows and columns", {
-  df <- jsonFile(sqlContext, jsonPath)
+  df <- read.json(sqlContext, jsonPath)
   expect_equal(nrow(collect(df)), nrow(take(df, 10)))
   expect_equal(ncol(collect(df)), ncol(take(df, 10)))
 })
@@ -584,7 +596,7 @@ test_that("collect() support Unicode characters", {
 })
 
 test_that("multiple pipeline transformations result in an RDD with the correct values", {
-  df <- jsonFile(sqlContext, jsonPath)
+  df <- read.json(sqlContext, jsonPath)
   first <- lapply(df, function(row) {
     row$age <- row$age + 5
     row
@@ -601,7 +613,7 @@ test_that("multiple pipeline transformations result in an RDD with the correct v
 })
 
 test_that("cache(), persist(), and unpersist() on a DataFrame", {
-  df <- jsonFile(sqlContext, jsonPath)
+  df <- read.json(sqlContext, jsonPath)
   expect_false(df@env$isCached)
   cache(df)
   expect_true(df@env$isCached)
@@ -620,7 +632,7 @@ test_that("cache(), persist(), and unpersist() on a DataFrame", {
 })
 
 test_that("schema(), dtypes(), columns(), names() return the correct values/format", {
-  df <- jsonFile(sqlContext, jsonPath)
+  df <- read.json(sqlContext, jsonPath)
   testSchema <- schema(df)
   expect_equal(length(testSchema$fields()), 2)
   expect_equal(testSchema$fields()[[1]]$dataType.toString(), "LongType")
@@ -641,7 +653,7 @@ test_that("schema(), dtypes(), columns(), names() return the correct values/form
 })
 
 test_that("names() colnames() set the column names", {
-  df <- jsonFile(sqlContext, jsonPath)
+  df <- read.json(sqlContext, jsonPath)
   names(df) <- c("col1", "col2")
   expect_equal(colnames(df)[2], "col2")
 
@@ -661,7 +673,7 @@ test_that("names() colnames() set the column names", {
 })
 
 test_that("head() and first() return the correct data", {
-  df <- jsonFile(sqlContext, jsonPath)
+  df <- read.json(sqlContext, jsonPath)
   testHead <- head(df)
   expect_equal(nrow(testHead), 3)
   expect_equal(ncol(testHead), 2)
@@ -694,7 +706,7 @@ test_that("distinct() and unique on DataFrames", {
   jsonPathWithDup <- tempfile(pattern="sparkr-test", fileext=".tmp")
   writeLines(lines, jsonPathWithDup)
 
-  df <- jsonFile(sqlContext, jsonPathWithDup)
+  df <- read.json(sqlContext, jsonPathWithDup)
   uniques <- distinct(df)
   expect_is(uniques, "DataFrame")
   expect_equal(count(uniques), 3)
@@ -705,7 +717,7 @@ test_that("distinct() and unique on DataFrames", {
 })
 
 test_that("sample on a DataFrame", {
-  df <- jsonFile(sqlContext, jsonPath)
+  df <- read.json(sqlContext, jsonPath)
   sampled <- sample(df, FALSE, 1.0)
   expect_equal(nrow(collect(sampled)), count(df))
   expect_is(sampled, "DataFrame")
@@ -721,7 +733,7 @@ test_that("sample on a DataFrame", {
 })
 
 test_that("select operators", {
-  df <- select(jsonFile(sqlContext, jsonPath), "name", "age")
+  df <- select(read.json(sqlContext, jsonPath), "name", "age")
   expect_is(df$name, "Column")
   expect_is(df[[2]], "Column")
   expect_is(df[["age"]], "Column")
@@ -747,7 +759,7 @@ test_that("select operators", {
 })
 
 test_that("select with column", {
-  df <- jsonFile(sqlContext, jsonPath)
+  df <- read.json(sqlContext, jsonPath)
   df1 <- select(df, "name")
   expect_equal(columns(df1), c("name"))
   expect_equal(count(df1), 3)
@@ -770,8 +782,8 @@ test_that("select with column", {
 })
 
 test_that("subsetting", {
-  # jsonFile returns columns in random order
-  df <- select(jsonFile(sqlContext, jsonPath), "name", "age")
+  # read.json returns columns in random order
+  df <- select(read.json(sqlContext, jsonPath), "name", "age")
   filtered <- df[df$age > 20,]
   expect_equal(count(filtered), 1)
   expect_equal(columns(filtered), c("name", "age"))
@@ -799,12 +811,16 @@ test_that("subsetting", {
   expect_equal(count(df6), 1)
   expect_equal(columns(df6), c("name", "age"))
 
+  df7 <- subset(df, select = "name")
+  expect_equal(count(df7), 3)
+  expect_equal(columns(df7), c("name"))
+
   # Test base::subset is working
   expect_equal(nrow(subset(airquality, Temp > 80, select = c(Ozone, Temp))), 68)
 })
 
 test_that("selectExpr() on a DataFrame", {
-  df <- jsonFile(sqlContext, jsonPath)
+  df <- read.json(sqlContext, jsonPath)
   selected <- selectExpr(df, "age * 2")
   expect_equal(names(selected), "(age * 2)")
   expect_equal(collect(selected), collect(select(df, df$age * 2L)))
@@ -815,12 +831,12 @@ test_that("selectExpr() on a DataFrame", {
 })
 
 test_that("expr() on a DataFrame", {
-  df <- jsonFile(sqlContext, jsonPath)
+  df <- read.json(sqlContext, jsonPath)
   expect_equal(collect(select(df, expr("abs(-123)")))[1, 1], 123)
 })
 
 test_that("column calculation", {
-  df <- jsonFile(sqlContext, jsonPath)
+  df <- read.json(sqlContext, jsonPath)
   d <- collect(select(df, alias(df$age + 1, "age2")))
   expect_equal(names(d), c("age2"))
   df2 <- select(df, lower(df$name), abs(df$age))
@@ -911,7 +927,7 @@ test_that("column functions", {
   expect_equal(class(rank())[[1]], "Column")
   expect_equal(rank(1:3), as.numeric(c(1:3)))
 
-  df <- jsonFile(sqlContext, jsonPath)
+  df <- read.json(sqlContext, jsonPath)
   df2 <- select(df, between(df$age, c(20, 30)), between(df$age, c(10, 20)))
   expect_equal(collect(df2)[[2, 1]], TRUE)
   expect_equal(collect(df2)[[2, 2]], FALSE)
@@ -979,7 +995,7 @@ test_that("column binary mathfunctions", {
              "{\"a\":4, \"b\":8}")
   jsonPathWithDup <- tempfile(pattern="sparkr-test", fileext=".tmp")
   writeLines(lines, jsonPathWithDup)
-  df <- jsonFile(sqlContext, jsonPathWithDup)
+  df <- read.json(sqlContext, jsonPathWithDup)
   expect_equal(collect(select(df, atan2(df$a, df$b)))[1, "ATAN2(a, b)"], atan2(1, 5))
   expect_equal(collect(select(df, atan2(df$a, df$b)))[2, "ATAN2(a, b)"], atan2(2, 6))
   expect_equal(collect(select(df, atan2(df$a, df$b)))[3, "ATAN2(a, b)"], atan2(3, 7))
@@ -1000,7 +1016,7 @@ test_that("column binary mathfunctions", {
 })
 
 test_that("string operators", {
-  df <- jsonFile(sqlContext, jsonPath)
+  df <- read.json(sqlContext, jsonPath)
   expect_equal(count(where(df, like(df$name, "A%"))), 1)
   expect_equal(count(where(df, startsWith(df$name, "A"))), 1)
   expect_equal(first(select(df, substr(df$name, 1, 2)))[[1]], "Mi")
@@ -1096,7 +1112,7 @@ test_that("when(), otherwise() and ifelse() on a DataFrame", {
 })
 
 test_that("group by, agg functions", {
-  df <- jsonFile(sqlContext, jsonPath)
+  df <- read.json(sqlContext, jsonPath)
   df1 <- agg(df, name = "max", age = "sum")
   expect_equal(1, count(df1))
   df1 <- agg(df, age2 = max(df$age))
@@ -1141,7 +1157,7 @@ test_that("group by, agg functions", {
                   "{\"name\":\"ID2\", \"value\": \"-3\"}")
   jsonPath2 <- tempfile(pattern="sparkr-test", fileext=".tmp")
   writeLines(mockLines2, jsonPath2)
-  gd2 <- groupBy(jsonFile(sqlContext, jsonPath2), "name")
+  gd2 <- groupBy(read.json(sqlContext, jsonPath2), "name")
   df6 <- agg(gd2, value = "sum")
   df6_local <- collect(df6)
   expect_equal(42, df6_local[df6_local$name == "ID1",][1, 2])
@@ -1158,7 +1174,7 @@ test_that("group by, agg functions", {
                   "{\"name\":\"Justin\", \"age\":1}")
   jsonPath3 <- tempfile(pattern="sparkr-test", fileext=".tmp")
   writeLines(mockLines3, jsonPath3)
-  df8 <- jsonFile(sqlContext, jsonPath3)
+  df8 <- read.json(sqlContext, jsonPath3)
   gd3 <- groupBy(df8, "name")
   gd3_local <- collect(sum(gd3))
   expect_equal(60, gd3_local[gd3_local$name == "Andy",][1, 2])
@@ -1177,7 +1193,7 @@ test_that("group by, agg functions", {
 })
 
 test_that("arrange() and orderBy() on a DataFrame", {
-  df <- jsonFile(sqlContext, jsonPath)
+  df <- read.json(sqlContext, jsonPath)
   sorted <- arrange(df, df$age)
   expect_equal(collect(sorted)[1,2], "Michael")
 
@@ -1203,7 +1219,7 @@ test_that("arrange() and orderBy() on a DataFrame", {
 })
 
 test_that("filter() on a DataFrame", {
-  df <- jsonFile(sqlContext, jsonPath)
+  df <- read.json(sqlContext, jsonPath)
   filtered <- filter(df, "age > 20")
   expect_equal(count(filtered), 1)
   expect_equal(collect(filtered)$name, "Andy")
@@ -1226,7 +1242,7 @@ test_that("filter() on a DataFrame", {
 })
 
 test_that("join() and merge() on a DataFrame", {
-  df <- jsonFile(sqlContext, jsonPath)
+  df <- read.json(sqlContext, jsonPath)
 
   mockLines2 <- c("{\"name\":\"Michael\", \"test\": \"yes\"}",
                   "{\"name\":\"Andy\",  \"test\": \"no\"}",
@@ -1234,7 +1250,7 @@ test_that("join() and merge() on a DataFrame", {
                   "{\"name\":\"Bob\", \"test\": \"yes\"}")
   jsonPath2 <- tempfile(pattern="sparkr-test", fileext=".tmp")
   writeLines(mockLines2, jsonPath2)
-  df2 <- jsonFile(sqlContext, jsonPath2)
+  df2 <- read.json(sqlContext, jsonPath2)
 
   joined <- join(df, df2)
   expect_equal(names(joined), c("age", "name", "name", "test"))
@@ -1309,14 +1325,14 @@ test_that("join() and merge() on a DataFrame", {
                   "{\"name\":\"Bob\", \"name_y\":\"Bob\", \"test\": \"yes\"}")
   jsonPath3 <- tempfile(pattern="sparkr-test", fileext=".tmp")
   writeLines(mockLines3, jsonPath3)
-  df3 <- jsonFile(sqlContext, jsonPath3)
+  df3 <- read.json(sqlContext, jsonPath3)
   expect_error(merge(df, df3),
                paste("The following column name: name_y occurs more than once in the 'DataFrame'.",
                      "Please use different suffixes for the intersected columns.", sep = ""))
 })
 
 test_that("toJSON() returns an RDD of the correct values", {
-  df <- jsonFile(sqlContext, jsonPath)
+  df <- read.json(sqlContext, jsonPath)
   testRDD <- toJSON(df)
   expect_is(testRDD, "RDD")
   expect_equal(getSerializedMode(testRDD), "string")
@@ -1324,7 +1340,7 @@ test_that("toJSON() returns an RDD of the correct values", {
 })
 
 test_that("showDF()", {
-  df <- jsonFile(sqlContext, jsonPath)
+  df <- read.json(sqlContext, jsonPath)
   s <- capture.output(showDF(df))
   expected <- paste("+----+-------+\n",
                     "| age|   name|\n",
@@ -1337,12 +1353,12 @@ test_that("showDF()", {
 })
 
 test_that("isLocal()", {
-  df <- jsonFile(sqlContext, jsonPath)
+  df <- read.json(sqlContext, jsonPath)
   expect_false(isLocal(df))
 })
 
 test_that("unionAll(), rbind(), except(), and intersect() on a DataFrame", {
-  df <- jsonFile(sqlContext, jsonPath)
+  df <- read.json(sqlContext, jsonPath)
 
   lines <- c("{\"name\":\"Bob\", \"age\":24}",
              "{\"name\":\"Andy\", \"age\":30}",
@@ -1379,7 +1395,7 @@ test_that("unionAll(), rbind(), except(), and intersect() on a DataFrame", {
 })
 
 test_that("withColumn() and withColumnRenamed()", {
-  df <- jsonFile(sqlContext, jsonPath)
+  df <- read.json(sqlContext, jsonPath)
   newDF <- withColumn(df, "newAge", df$age + 2)
   expect_equal(length(columns(newDF)), 3)
   expect_equal(columns(newDF)[3], "newAge")
@@ -1391,7 +1407,7 @@ test_that("withColumn() and withColumnRenamed()", {
 })
 
 test_that("mutate(), transform(), rename() and names()", {
-  df <- jsonFile(sqlContext, jsonPath)
+  df <- read.json(sqlContext, jsonPath)
   newDF <- mutate(df, newAge = df$age + 2)
   expect_equal(length(columns(newDF)), 3)
   expect_equal(columns(newDF)[3], "newAge")
@@ -1420,22 +1436,25 @@ test_that("mutate(), transform(), rename() and names()", {
   detach(airquality)
 })
 
-test_that("write.df() on DataFrame and works with parquetFile", {
-  df <- jsonFile(sqlContext, jsonPath)
+test_that("write.df() on DataFrame and works with read.parquet", {
+  df <- read.json(sqlContext, jsonPath)
   write.df(df, parquetPath, "parquet", mode="overwrite")
-  parquetDF <- parquetFile(sqlContext, parquetPath)
+  parquetDF <- read.parquet(sqlContext, parquetPath)
   expect_is(parquetDF, "DataFrame")
   expect_equal(count(df), count(parquetDF))
 })
 
-test_that("parquetFile works with multiple input paths", {
-  df <- jsonFile(sqlContext, jsonPath)
+test_that("read.parquet()/parquetFile() works with multiple input paths", {
+  df <- read.json(sqlContext, jsonPath)
   write.df(df, parquetPath, "parquet", mode="overwrite")
   parquetPath2 <- tempfile(pattern = "parquetPath2", fileext = ".parquet")
   write.df(df, parquetPath2, "parquet", mode="overwrite")
-  parquetDF <- parquetFile(sqlContext, parquetPath, parquetPath2)
+  parquetDF <- read.parquet(sqlContext, c(parquetPath, parquetPath2))
   expect_is(parquetDF, "DataFrame")
   expect_equal(count(parquetDF), count(df) * 2)
+  parquetDF2 <- suppressWarnings(parquetFile(sqlContext, parquetPath, parquetPath2))
+  expect_is(parquetDF2, "DataFrame")
+  expect_equal(count(parquetDF2), count(df) * 2)
 
   # Test if varargs works with variables
   saveMode <- "overwrite"
@@ -1445,7 +1464,7 @@ test_that("parquetFile works with multiple input paths", {
 })
 
 test_that("describe() and summarize() on a DataFrame", {
-  df <- jsonFile(sqlContext, jsonPath)
+  df <- read.json(sqlContext, jsonPath)
   stats <- describe(df, "age")
   expect_equal(collect(stats)[1, "summary"], "count")
   expect_equal(collect(stats)[2, "age"], "24.5")
@@ -1463,7 +1482,7 @@ test_that("describe() and summarize() on a DataFrame", {
 })
 
 test_that("dropna() and na.omit() on a DataFrame", {
-  df <- jsonFile(sqlContext, jsonPathNa)
+  df <- read.json(sqlContext, jsonPathNa)
   rows <- collect(df)
 
   # drop with columns
@@ -1549,7 +1568,7 @@ test_that("dropna() and na.omit() on a DataFrame", {
 })
 
 test_that("fillna() on a DataFrame", {
-  df <- jsonFile(sqlContext, jsonPathNa)
+  df <- read.json(sqlContext, jsonPathNa)
   rows <- collect(df)
 
   # fill with value
@@ -1658,7 +1677,7 @@ test_that("Method as.data.frame as a synonym for collect()", {
 })
 
 test_that("attach() on a DataFrame", {
-  df <- jsonFile(sqlContext, jsonPath)
+  df <- read.json(sqlContext, jsonPath)
   expect_error(age)
   attach(df)
   expect_is(age, "DataFrame")
@@ -1706,7 +1725,7 @@ test_that("Method coltypes() to get and set R's data types of a DataFrame", {
     list("a"="b", "c"="d", "e"="f")))))
   expect_equal(coltypes(x), "map<string,string>")
 
-  df <- selectExpr(jsonFile(sqlContext, jsonPath), "name", "(age * 1.21) as age")
+  df <- selectExpr(read.json(sqlContext, jsonPath), "name", "(age * 1.21) as age")
   expect_equal(dtypes(df), list(c("name", "string"), c("age", "decimal(24,2)")))
 
   df1 <- select(df, cast(df$age, "integer"))
diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala
index 19633a3ce6a02..d3384fb297732 100644
--- a/core/src/main/scala/org/apache/spark/SparkConf.scala
+++ b/core/src/main/scala/org/apache/spark/SparkConf.scala
@@ -597,7 +597,9 @@ private[spark] object SparkConf extends Logging {
     "spark.streaming.fileStream.minRememberDuration" -> Seq(
       AlternateConfig("spark.streaming.minRememberDuration", "1.5")),
     "spark.yarn.max.executor.failures" -> Seq(
-      AlternateConfig("spark.yarn.max.worker.failures", "1.5"))
+      AlternateConfig("spark.yarn.max.worker.failures", "1.5")),
+    "spark.memory.offHeap.enabled" -> Seq(
+      AlternateConfig("spark.unsafe.offHeap", "1.6"))
     )
 
   /**
diff --git a/core/src/main/scala/org/apache/spark/memory/ExecutionMemoryPool.scala b/core/src/main/scala/org/apache/spark/memory/ExecutionMemoryPool.scala
index 9023e1ac012b7..dbb0ad8d5c673 100644
--- a/core/src/main/scala/org/apache/spark/memory/ExecutionMemoryPool.scala
+++ b/core/src/main/scala/org/apache/spark/memory/ExecutionMemoryPool.scala
@@ -70,11 +70,28 @@ private[memory] class ExecutionMemoryPool(
    * active tasks) before it is forced to spill. This can happen if the number of tasks increase
    * but an older task had a lot of memory already.
    *
+   * @param numBytes number of bytes to acquire
+   * @param taskAttemptId the task attempt acquiring memory
+   * @param maybeGrowPool a callback that potentially grows the size of this pool. It takes in
+   *                      one parameter (Long) that represents the desired amount of memory by
+   *                      which this pool should be expanded.
+   * @param computeMaxPoolSize a callback that returns the maximum allowable size of this pool
+   *                           at this given moment. This is not a field because the max pool
+   *                           size is variable in certain cases. For instance, in unified
+   *                           memory management, the execution pool can be expanded by evicting
+   *                           cached blocks, thereby shrinking the storage pool.
+   *
    * @return the number of bytes granted to the task.
    */
-  def acquireMemory(numBytes: Long, taskAttemptId: Long): Long = lock.synchronized {
+  private[memory] def acquireMemory(
+      numBytes: Long,
+      taskAttemptId: Long,
+      maybeGrowPool: Long => Unit = (additionalSpaceNeeded: Long) => Unit,
+      computeMaxPoolSize: () => Long = () => poolSize): Long = lock.synchronized {
     assert(numBytes > 0, s"invalid number of bytes requested: $numBytes")
 
+    // TODO: clean up this clunky method signature
+
     // Add this task to the taskMemory map just so we can keep an accurate count of the number
     // of active tasks, to let other tasks ramp down their memory in calls to `acquireMemory`
     if (!memoryForTask.contains(taskAttemptId)) {
@@ -91,25 +108,31 @@ private[memory] class ExecutionMemoryPool(
       val numActiveTasks = memoryForTask.keys.size
       val curMem = memoryForTask(taskAttemptId)
 
-      // How much we can grant this task; don't let it grow to more than 1 / numActiveTasks;
-      // don't let it be negative
-      val maxToGrant =
-        math.min(numBytes, math.max(0, (poolSize / numActiveTasks) - curMem))
+      // In every iteration of this loop, we should first try to reclaim any borrowed execution
+      // space from storage. This is necessary because of the potential race condition where new
+      // storage blocks may steal the free execution memory that this task was waiting for.
+      maybeGrowPool(numBytes - memoryFree)
+
+      // Maximum size the pool would have after potentially growing the pool.
+      // This is used to compute the upper bound of how much memory each task can occupy. This
+      // must take into account potential free memory as well as the amount this pool currently
+      // occupies. Otherwise, we may run into SPARK-12155 where, in unified memory management,
+      // we did not take into account space that could have been freed by evicting cached blocks.
+      val maxPoolSize = computeMaxPoolSize()
+      val maxMemoryPerTask = maxPoolSize / numActiveTasks
+      val minMemoryPerTask = poolSize / (2 * numActiveTasks)
+
+      // How much we can grant this task; keep its share within 0 <= X <= 1 / numActiveTasks
+      val maxToGrant = math.min(numBytes, math.max(0, maxMemoryPerTask - curMem))
       // Only give it as much memory as is free, which might be none if it reached 1 / numTasks
       val toGrant = math.min(maxToGrant, memoryFree)
 
-      if (curMem < poolSize / (2 * numActiveTasks)) {
-        // We want to let each task get at least 1 / (2 * numActiveTasks) before blocking;
-        // if we can't give it this much now, wait for other tasks to free up memory
-        // (this happens if older tasks allocated lots of memory before N grew)
-        if (memoryFree >= math.min(maxToGrant, poolSize / (2 * numActiveTasks) - curMem)) {
-          memoryForTask(taskAttemptId) += toGrant
-          return toGrant
-        } else {
-          logInfo(
-            s"TID $taskAttemptId waiting for at least 1/2N of $poolName pool to be free")
-          lock.wait()
-        }
+      // We want to let each task get at least 1 / (2 * numActiveTasks) before blocking;
+      // if we can't give it this much now, wait for other tasks to free up memory
+      // (this happens if older tasks allocated lots of memory before N grew)
+      if (toGrant < numBytes && curMem + toGrant < minMemoryPerTask) {
+        logInfo(s"TID $taskAttemptId waiting for at least 1/2N of $poolName pool to be free")
+        lock.wait()
       } else {
         memoryForTask(taskAttemptId) += toGrant
         return toGrant
diff --git a/core/src/main/scala/org/apache/spark/memory/MemoryManager.scala b/core/src/main/scala/org/apache/spark/memory/MemoryManager.scala
index ceb8ea434e1be..e707e27d96b50 100644
--- a/core/src/main/scala/org/apache/spark/memory/MemoryManager.scala
+++ b/core/src/main/scala/org/apache/spark/memory/MemoryManager.scala
@@ -50,7 +50,7 @@ private[spark] abstract class MemoryManager(
 
   storageMemoryPool.incrementPoolSize(storageMemory)
   onHeapExecutionMemoryPool.incrementPoolSize(onHeapExecutionMemory)
-  offHeapExecutionMemoryPool.incrementPoolSize(conf.getSizeAsBytes("spark.memory.offHeapSize", 0))
+  offHeapExecutionMemoryPool.incrementPoolSize(conf.getSizeAsBytes("spark.memory.offHeap.size", 0))
 
   /**
    * Total available memory for storage, in bytes. This amount can vary over time, depending on
@@ -77,9 +77,7 @@ private[spark] abstract class MemoryManager(
   def acquireStorageMemory(
       blockId: BlockId,
       numBytes: Long,
-      evictedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Boolean = synchronized {
-    storageMemoryPool.acquireMemory(blockId, numBytes, evictedBlocks)
-  }
+      evictedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Boolean
 
   /**
    * Acquire N bytes of memory to unroll the given block, evicting existing ones if necessary.
@@ -109,12 +107,7 @@ private[spark] abstract class MemoryManager(
   def acquireExecutionMemory(
       numBytes: Long,
       taskAttemptId: Long,
-      memoryMode: MemoryMode): Long = synchronized {
-    memoryMode match {
-      case MemoryMode.ON_HEAP => onHeapExecutionMemoryPool.acquireMemory(numBytes, taskAttemptId)
-      case MemoryMode.OFF_HEAP => offHeapExecutionMemoryPool.acquireMemory(numBytes, taskAttemptId)
-    }
-  }
+      memoryMode: MemoryMode): Long
 
   /**
    * Release numBytes of execution memory belonging to the given task.
@@ -189,7 +182,13 @@ private[spark] abstract class MemoryManager(
    * sun.misc.Unsafe.
    */
   final val tungstenMemoryMode: MemoryMode = {
-    if (conf.getBoolean("spark.unsafe.offHeap", false)) MemoryMode.OFF_HEAP else MemoryMode.ON_HEAP
+    if (conf.getBoolean("spark.memory.offHeap.enabled", false)) {
+      require(conf.getSizeAsBytes("spark.memory.offHeap.size", 0) > 0,
+        "spark.memory.offHeap.size must be > 0 when spark.memory.offHeap.enabled == true")
+      MemoryMode.OFF_HEAP
+    } else {
+      MemoryMode.ON_HEAP
+    }
   }
 
   /**
diff --git a/core/src/main/scala/org/apache/spark/memory/StaticMemoryManager.scala b/core/src/main/scala/org/apache/spark/memory/StaticMemoryManager.scala
index 12a094306861f..3554b558f2123 100644
--- a/core/src/main/scala/org/apache/spark/memory/StaticMemoryManager.scala
+++ b/core/src/main/scala/org/apache/spark/memory/StaticMemoryManager.scala
@@ -49,19 +49,50 @@ private[spark] class StaticMemoryManager(
   }
 
   // Max number of bytes worth of blocks to evict when unrolling
-  private val maxMemoryToEvictForUnroll: Long = {
+  private val maxUnrollMemory: Long = {
     (maxStorageMemory * conf.getDouble("spark.storage.unrollFraction", 0.2)).toLong
   }
 
+  override def acquireStorageMemory(
+      blockId: BlockId,
+      numBytes: Long,
+      evictedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Boolean = synchronized {
+    if (numBytes > maxStorageMemory) {
+      // Fail fast if the block simply won't fit
+      logInfo(s"Will not store $blockId as the required space ($numBytes bytes) exceeds our " +
+        s"memory limit ($maxStorageMemory bytes)")
+      false
+    } else {
+      storageMemoryPool.acquireMemory(blockId, numBytes, evictedBlocks)
+    }
+  }
+
   override def acquireUnrollMemory(
       blockId: BlockId,
       numBytes: Long,
       evictedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Boolean = synchronized {
     val currentUnrollMemory = storageMemoryPool.memoryStore.currentUnrollMemory
-    val maxNumBytesToFree = math.max(0, maxMemoryToEvictForUnroll - currentUnrollMemory)
-    val numBytesToFree = math.min(numBytes, maxNumBytesToFree)
+    val freeMemory = storageMemoryPool.memoryFree
+    // When unrolling, we will use all of the existing free memory, and, if necessary,
+    // some extra space freed from evicting cached blocks. We must place a cap on the
+    // amount of memory to be evicted by unrolling, however, otherwise unrolling one
+    // big block can blow away the entire cache.
+    val maxNumBytesToFree = math.max(0, maxUnrollMemory - currentUnrollMemory - freeMemory)
+    // Keep it within the range 0 <= X <= maxNumBytesToFree
+    val numBytesToFree = math.max(0, math.min(maxNumBytesToFree, numBytes - freeMemory))
     storageMemoryPool.acquireMemory(blockId, numBytes, numBytesToFree, evictedBlocks)
   }
+
+  private[memory]
+  override def acquireExecutionMemory(
+      numBytes: Long,
+      taskAttemptId: Long,
+      memoryMode: MemoryMode): Long = synchronized {
+    memoryMode match {
+      case MemoryMode.ON_HEAP => onHeapExecutionMemoryPool.acquireMemory(numBytes, taskAttemptId)
+      case MemoryMode.OFF_HEAP => offHeapExecutionMemoryPool.acquireMemory(numBytes, taskAttemptId)
+    }
+  }
 }
 
 
diff --git a/core/src/main/scala/org/apache/spark/memory/StorageMemoryPool.scala b/core/src/main/scala/org/apache/spark/memory/StorageMemoryPool.scala
index fc4f0357e9f16..70af83b5ee092 100644
--- a/core/src/main/scala/org/apache/spark/memory/StorageMemoryPool.scala
+++ b/core/src/main/scala/org/apache/spark/memory/StorageMemoryPool.scala
@@ -65,7 +65,8 @@ private[memory] class StorageMemoryPool(lock: Object) extends MemoryPool(lock) w
       blockId: BlockId,
       numBytes: Long,
       evictedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Boolean = lock.synchronized {
-    acquireMemory(blockId, numBytes, numBytes, evictedBlocks)
+    val numBytesToFree = math.max(0, numBytes - memoryFree)
+    acquireMemory(blockId, numBytes, numBytesToFree, evictedBlocks)
   }
 
   /**
@@ -73,7 +74,7 @@ private[memory] class StorageMemoryPool(lock: Object) extends MemoryPool(lock) w
    *
    * @param blockId the ID of the block we are acquiring storage memory for
    * @param numBytesToAcquire the size of this block
-   * @param numBytesToFree the size of space to be freed through evicting blocks
+   * @param numBytesToFree the amount of space to be freed through evicting blocks
    * @return whether all N bytes were successfully granted.
    */
   def acquireMemory(
@@ -84,16 +85,18 @@ private[memory] class StorageMemoryPool(lock: Object) extends MemoryPool(lock) w
     assert(numBytesToAcquire >= 0)
     assert(numBytesToFree >= 0)
     assert(memoryUsed <= poolSize)
-    memoryStore.ensureFreeSpace(blockId, numBytesToFree, evictedBlocks)
-    // Register evicted blocks, if any, with the active task metrics
-    Option(TaskContext.get()).foreach { tc =>
-      val metrics = tc.taskMetrics()
-      val lastUpdatedBlocks = metrics.updatedBlocks.getOrElse(Seq[(BlockId, BlockStatus)]())
-      metrics.updatedBlocks = Some(lastUpdatedBlocks ++ evictedBlocks.toSeq)
+    if (numBytesToFree > 0) {
+      memoryStore.evictBlocksToFreeSpace(Some(blockId), numBytesToFree, evictedBlocks)
+      // Register evicted blocks, if any, with the active task metrics
+      Option(TaskContext.get()).foreach { tc =>
+        val metrics = tc.taskMetrics()
+        val lastUpdatedBlocks = metrics.updatedBlocks.getOrElse(Seq[(BlockId, BlockStatus)]())
+        metrics.updatedBlocks = Some(lastUpdatedBlocks ++ evictedBlocks.toSeq)
+      }
     }
     // NOTE: If the memory store evicts blocks, then those evictions will synchronously call
-    // back into this StorageMemoryPool in order to free. Therefore, these variables should have
-    // been updated.
+    // back into this StorageMemoryPool in order to free memory. Therefore, these variables
+    // should have been updated.
     val enoughMemory = numBytesToAcquire <= memoryFree
     if (enoughMemory) {
       _memoryUsed += numBytesToAcquire
@@ -121,18 +124,20 @@ private[memory] class StorageMemoryPool(lock: Object) extends MemoryPool(lock) w
    */
   def shrinkPoolToFreeSpace(spaceToFree: Long): Long = lock.synchronized {
     // First, shrink the pool by reclaiming free memory:
-    val spaceFreedByReleasingUnusedMemory = Math.min(spaceToFree, memoryFree)
+    val spaceFreedByReleasingUnusedMemory = math.min(spaceToFree, memoryFree)
     decrementPoolSize(spaceFreedByReleasingUnusedMemory)
-    if (spaceFreedByReleasingUnusedMemory == spaceToFree) {
-      spaceFreedByReleasingUnusedMemory
-    } else {
+    val remainingSpaceToFree = spaceToFree - spaceFreedByReleasingUnusedMemory
+    if (remainingSpaceToFree > 0) {
       // If reclaiming free memory did not adequately shrink the pool, begin evicting blocks:
       val evictedBlocks = new ArrayBuffer[(BlockId, BlockStatus)]
-      memoryStore.ensureFreeSpace(spaceToFree - spaceFreedByReleasingUnusedMemory, evictedBlocks)
+      memoryStore.evictBlocksToFreeSpace(None, remainingSpaceToFree, evictedBlocks)
       val spaceFreedByEviction = evictedBlocks.map(_._2.memSize).sum
-      _memoryUsed -= spaceFreedByEviction
+      // When a block is released, BlockManager.dropFromMemory() calls releaseMemory(), so we do
+      // not need to decrement _memoryUsed here. However, we do need to decrement the pool size.
       decrementPoolSize(spaceFreedByEviction)
       spaceFreedByReleasingUnusedMemory + spaceFreedByEviction
+    } else {
+      spaceFreedByReleasingUnusedMemory
     }
   }
 }
diff --git a/core/src/main/scala/org/apache/spark/memory/UnifiedMemoryManager.scala b/core/src/main/scala/org/apache/spark/memory/UnifiedMemoryManager.scala
index 0f1ea9ab39c07..829f054dba0e9 100644
--- a/core/src/main/scala/org/apache/spark/memory/UnifiedMemoryManager.scala
+++ b/core/src/main/scala/org/apache/spark/memory/UnifiedMemoryManager.scala
@@ -81,26 +81,55 @@ private[spark] class UnifiedMemoryManager private[memory] (
     assert(numBytes >= 0)
     memoryMode match {
       case MemoryMode.ON_HEAP =>
-        if (numBytes > onHeapExecutionMemoryPool.memoryFree) {
-          val extraMemoryNeeded = numBytes - onHeapExecutionMemoryPool.memoryFree
-          // There is not enough free memory in the execution pool, so try to reclaim memory from
-          // storage. We can reclaim any free memory from the storage pool. If the storage pool
-          // has grown to become larger than `storageRegionSize`, we can evict blocks and reclaim
-          // the memory that storage has borrowed from execution.
-          val memoryReclaimableFromStorage =
-            math.max(storageMemoryPool.memoryFree, storageMemoryPool.poolSize - storageRegionSize)
-          if (memoryReclaimableFromStorage > 0) {
-            // Only reclaim as much space as is necessary and available:
-            val spaceReclaimed = storageMemoryPool.shrinkPoolToFreeSpace(
-              math.min(extraMemoryNeeded, memoryReclaimableFromStorage))
-            onHeapExecutionMemoryPool.incrementPoolSize(spaceReclaimed)
+
+        /**
+         * Grow the execution pool by evicting cached blocks, thereby shrinking the storage pool.
+         *
+         * When acquiring memory for a task, the execution pool may need to make multiple
+         * attempts. Each attempt must be able to evict storage in case another task jumps in
+         * and caches a large block between the attempts. This is called once per attempt.
+         */
+        def maybeGrowExecutionPool(extraMemoryNeeded: Long): Unit = {
+          if (extraMemoryNeeded > 0) {
+            // There is not enough free memory in the execution pool, so try to reclaim memory from
+            // storage. We can reclaim any free memory from the storage pool. If the storage pool
+            // has grown to become larger than `storageRegionSize`, we can evict blocks and reclaim
+            // the memory that storage has borrowed from execution.
+            val memoryReclaimableFromStorage =
+              math.max(storageMemoryPool.memoryFree, storageMemoryPool.poolSize - storageRegionSize)
+            if (memoryReclaimableFromStorage > 0) {
+              // Only reclaim as much space as is necessary and available:
+              val spaceReclaimed = storageMemoryPool.shrinkPoolToFreeSpace(
+                math.min(extraMemoryNeeded, memoryReclaimableFromStorage))
+              onHeapExecutionMemoryPool.incrementPoolSize(spaceReclaimed)
+            }
           }
         }
-        onHeapExecutionMemoryPool.acquireMemory(numBytes, taskAttemptId)
+
+        /**
+         * The size the execution pool would have after evicting storage memory.
+         *
+         * The execution memory pool divides this quantity among the active tasks evenly to cap
+         * the execution memory allocation for each task. It is important to keep this greater
+         * than the execution pool size, which doesn't take into account potential memory that
+         * could be freed by evicting storage. Otherwise we may hit SPARK-12155.
+         *
+         * Additionally, this quantity should be kept below `maxMemory` to arbitrate fairness
+         * in execution memory allocation across tasks. Otherwise, a task may occupy more than
+         * its fair share of execution memory, mistakenly thinking that other tasks can acquire
+         * the portion of storage memory that cannot be evicted.
+         */
+        def computeMaxExecutionPoolSize(): Long = {
+          maxMemory - math.min(storageMemoryUsed, storageRegionSize)
+        }
+
+        onHeapExecutionMemoryPool.acquireMemory(
+          numBytes, taskAttemptId, maybeGrowExecutionPool, computeMaxExecutionPoolSize)
+
       case MemoryMode.OFF_HEAP =>
         // For now, we only support on-heap caching of data, so we do not need to interact with
         // the storage pool when allocating off-heap memory. This will change in the future, though.
-        super.acquireExecutionMemory(numBytes, taskAttemptId, memoryMode)
+        offHeapExecutionMemoryPool.acquireMemory(numBytes, taskAttemptId)
     }
   }
 
@@ -110,6 +139,12 @@ private[spark] class UnifiedMemoryManager private[memory] (
       evictedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Boolean = synchronized {
     assert(onHeapExecutionMemoryPool.poolSize + storageMemoryPool.poolSize == maxMemory)
     assert(numBytes >= 0)
+    if (numBytes > maxStorageMemory) {
+      // Fail fast if the block simply won't fit
+      logInfo(s"Will not store $blockId as the required space ($numBytes bytes) exceeds our " +
+        s"memory limit ($maxStorageMemory bytes)")
+      return false
+    }
     if (numBytes > storageMemoryPool.memoryFree) {
       // There is not enough free memory in the storage pool, so try to borrow free memory from
       // the execution pool.
diff --git a/core/src/main/scala/org/apache/spark/scheduler/Task.scala b/core/src/main/scala/org/apache/spark/scheduler/Task.scala
index 2fcd5aa57d11b..304f723e4924e 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/Task.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/Task.scala
@@ -93,6 +93,12 @@ private[spark] abstract class Task[T](
         Utils.tryLogNonFatalError {
           // Release memory used by this thread for unrolling blocks
           SparkEnv.get.blockManager.memoryStore.releaseUnrollMemoryForThisTask()
+          // Notify any tasks waiting for execution memory to be freed to wake up and try to
+          // acquire memory again. This rules out the scenario where a task sleeps forever
+          // because there are no other tasks left to notify it. Since this is safe to do but may
+          // not be strictly necessary, we should revisit whether we can remove this in the future.
+          val memoryManager = SparkEnv.get.memoryManager
+          memoryManager.synchronized { memoryManager.notifyAll() }
         }
       } finally {
         TaskContext.unset()
diff --git a/core/src/main/scala/org/apache/spark/storage/MemoryStore.scala b/core/src/main/scala/org/apache/spark/storage/MemoryStore.scala
index 4dbac388e098b..bdab8c2332fae 100644
--- a/core/src/main/scala/org/apache/spark/storage/MemoryStore.scala
+++ b/core/src/main/scala/org/apache/spark/storage/MemoryStore.scala
@@ -406,85 +406,41 @@ private[spark] class MemoryStore(blockManager: BlockManager, memoryManager: Memo
   }
 
   /**
-   * Try to free up a given amount of space by evicting existing blocks.
-   *
-   * @param space the amount of memory to free, in bytes
-   * @param droppedBlocks a holder for blocks evicted in the process
-   * @return whether the requested free space is freed.
-   */
-  private[spark] def ensureFreeSpace(
-      space: Long,
-      droppedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Boolean = {
-    ensureFreeSpace(None, space, droppedBlocks)
-  }
-
-  /**
-   * Try to free up a given amount of space to store a block by evicting existing ones.
-   *
-   * @param space the amount of memory to free, in bytes
-   * @param droppedBlocks a holder for blocks evicted in the process
-   * @return whether the requested free space is freed.
-   */
-  private[spark] def ensureFreeSpace(
-      blockId: BlockId,
-      space: Long,
-      droppedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Boolean = {
-    ensureFreeSpace(Some(blockId), space, droppedBlocks)
-  }
-
-  /**
-   * Try to free up a given amount of space to store a particular block, but can fail if
-   * either the block is bigger than our memory or it would require replacing another block
-   * from the same RDD (which leads to a wasteful cyclic replacement pattern for RDDs that
-   * don't fit into memory that we want to avoid).
-   *
-   * @param blockId the ID of the block we are freeing space for, if any
-   * @param space the size of this block
-   * @param droppedBlocks a holder for blocks evicted in the process
-   * @return whether the requested free space is freed.
-   */
-  private def ensureFreeSpace(
+   * Try to evict blocks to free up a given amount of space to store a particular block.
+   * Can fail if either the block is bigger than our memory or it would require replacing
+   * another block from the same RDD (which leads to a wasteful cyclic replacement pattern for
+   * RDDs that don't fit into memory that we want to avoid).
+   *
+   * @param blockId the ID of the block we are freeing space for, if any
+   * @param space the amount of memory to free by evicting blocks, in bytes
+   * @param droppedBlocks a holder for blocks evicted in the process
+   * @return whether the requested free space is freed.
+   */
+  private[spark] def evictBlocksToFreeSpace(
       blockId: Option[BlockId],
       space: Long,
       droppedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Boolean = {
+    assert(space > 0)
     memoryManager.synchronized {
-      val freeMemory = maxMemory - memoryUsed
+      var freedMemory = 0L
       val rddToAdd = blockId.flatMap(getRddId)
       val selectedBlocks = new ArrayBuffer[BlockId]
-      var selectedMemory = 0L
-
-      logInfo(s"Ensuring $space bytes of free space " +
-        blockId.map { id => s"for block $id" }.getOrElse("") +
-        s"(free: $freeMemory, max: $maxMemory)")
-
-      // Fail fast if the block simply won't fit
-      if (space > maxMemory) {
-        logInfo("Will not " + blockId.map { id => s"store $id" }.getOrElse("free memory") +
-          s" as the required space ($space bytes) exceeds our memory limit ($maxMemory bytes)")
-        return false
-      }
-
-      // No need to evict anything if there is already enough free space
-      if (freeMemory >= space) {
-        return true
-      }
-
       // This is synchronized to ensure that the set of entries is not changed
       // (because of getValue or getBytes) while traversing the iterator, as that
       // can lead to exceptions.
       entries.synchronized {
         val iterator = entries.entrySet().iterator()
-        while (freeMemory + selectedMemory < space && iterator.hasNext) {
+        while (freedMemory < space && iterator.hasNext) {
           val pair = iterator.next()
           val blockId = pair.getKey
           if (rddToAdd.isEmpty || rddToAdd != getRddId(blockId)) {
             selectedBlocks += blockId
-            selectedMemory += pair.getValue.size
+            freedMemory += pair.getValue.size
           }
         }
       }
 
-      if (freeMemory + selectedMemory >= space) {
+      if (freedMemory >= space) {
         logInfo(s"${selectedBlocks.size} blocks selected for dropping")
         for (blockId <- selectedBlocks) {
           val entry = entries.synchronized { entries.get(blockId) }
diff --git a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala
index 1e8194f57888e..81a6f07ec836a 100644
--- a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala
+++ b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala
@@ -448,7 +448,6 @@ private[spark] object UIUtils extends Logging {
       new RuleTransformer(rule).transform(xml)
     } catch {
       case NonFatal(e) =>
-        logWarning(s"Invalid job description: $desc ", e)
         <span class="description-input">{desc}</span>
     }
   }
diff --git a/core/src/test/java/org/apache/spark/memory/TaskMemoryManagerSuite.java b/core/src/test/java/org/apache/spark/memory/TaskMemoryManagerSuite.java
index 711eed0193bc0..776a2997cf91f 100644
--- a/core/src/test/java/org/apache/spark/memory/TaskMemoryManagerSuite.java
+++ b/core/src/test/java/org/apache/spark/memory/TaskMemoryManagerSuite.java
@@ -29,7 +29,7 @@ public class TaskMemoryManagerSuite {
   public void leakedPageMemoryIsDetected() {
     final TaskMemoryManager manager = new TaskMemoryManager(
       new StaticMemoryManager(
-        new SparkConf().set("spark.unsafe.offHeap", "false"),
+        new SparkConf().set("spark.memory.offHeap.enabled", "false"),
         Long.MAX_VALUE,
         Long.MAX_VALUE,
         1),
@@ -41,8 +41,10 @@ public void leakedPageMemoryIsDetected() {
 
   @Test
   public void encodePageNumberAndOffsetOffHeap() {
-    final TaskMemoryManager manager = new TaskMemoryManager(
-      new TestMemoryManager(new SparkConf().set("spark.unsafe.offHeap", "true")), 0);
+    final SparkConf conf = new SparkConf()
+      .set("spark.memory.offHeap.enabled", "true")
+      .set("spark.memory.offHeap.size", "1000");
+    final TaskMemoryManager manager = new TaskMemoryManager(new TestMemoryManager(conf), 0);
     final MemoryBlock dataPage = manager.allocatePage(256, null);
     // In off-heap mode, an offset is an absolute address that may require more than 51 bits to
     // encode. This test exercises that corner-case:
@@ -55,7 +57,7 @@ public void encodePageNumberAndOffsetOffHeap() {
   @Test
   public void encodePageNumberAndOffsetOnHeap() {
     final TaskMemoryManager manager = new TaskMemoryManager(
-      new TestMemoryManager(new SparkConf().set("spark.unsafe.offHeap", "false")), 0);
+      new TestMemoryManager(new SparkConf().set("spark.memory.offHeap.enabled", "false")), 0);
     final MemoryBlock dataPage = manager.allocatePage(256, null);
     final long encodedAddress = manager.encodePageNumberAndOffset(dataPage, 64);
     Assert.assertEquals(dataPage.getBaseObject(), manager.getPage(encodedAddress));
@@ -104,4 +106,15 @@ public void cooperativeSpilling() {
     assert(manager.cleanUpAllAllocatedMemory() == 0);
   }
 
+  @Test
+  public void offHeapConfigurationBackwardsCompatibility() {
+    // Tests backwards-compatibility with the old `spark.unsafe.offHeap` configuration, which
+    // was deprecated in Spark 1.6 and replaced by `spark.memory.offHeap.enabled` (see SPARK-12251).
+    final SparkConf conf = new SparkConf()
+      .set("spark.unsafe.offHeap", "true")
+      .set("spark.memory.offHeap.size", "1000");
+    final TaskMemoryManager manager = new TaskMemoryManager(new TestMemoryManager(conf), 0);
+    assert(manager.tungstenMemoryMode == MemoryMode.OFF_HEAP);
+  }
+
 }
diff --git a/core/src/test/java/org/apache/spark/shuffle/sort/PackedRecordPointerSuite.java b/core/src/test/java/org/apache/spark/shuffle/sort/PackedRecordPointerSuite.java
index 9a43f1f3a9235..fe5abc5c23049 100644
--- a/core/src/test/java/org/apache/spark/shuffle/sort/PackedRecordPointerSuite.java
+++ b/core/src/test/java/org/apache/spark/shuffle/sort/PackedRecordPointerSuite.java
@@ -35,7 +35,7 @@ public class PackedRecordPointerSuite {
 
   @Test
   public void heap() throws IOException {
-    final SparkConf conf = new SparkConf().set("spark.unsafe.offHeap", "false");
+    final SparkConf conf = new SparkConf().set("spark.memory.offHeap.enabled", "false");
     final TaskMemoryManager memoryManager =
       new TaskMemoryManager(new TestMemoryManager(conf), 0);
     final MemoryBlock page0 = memoryManager.allocatePage(128, null);
@@ -54,7 +54,9 @@ public void heap() throws IOException {
 
   @Test
   public void offHeap() throws IOException {
-    final SparkConf conf = new SparkConf().set("spark.unsafe.offHeap", "true");
+    final SparkConf conf = new SparkConf()
+      .set("spark.memory.offHeap.enabled", "true")
+      .set("spark.memory.offHeap.size", "10000");
     final TaskMemoryManager memoryManager =
       new TaskMemoryManager(new TestMemoryManager(conf), 0);
     final MemoryBlock page0 = memoryManager.allocatePage(128, null);
diff --git a/core/src/test/java/org/apache/spark/shuffle/sort/ShuffleInMemorySorterSuite.java b/core/src/test/java/org/apache/spark/shuffle/sort/ShuffleInMemorySorterSuite.java
index faa5a863ee630..0328e63e45439 100644
--- a/core/src/test/java/org/apache/spark/shuffle/sort/ShuffleInMemorySorterSuite.java
+++ b/core/src/test/java/org/apache/spark/shuffle/sort/ShuffleInMemorySorterSuite.java
@@ -34,7 +34,7 @@
 public class ShuffleInMemorySorterSuite {
 
   final TestMemoryManager memoryManager =
-    new TestMemoryManager(new SparkConf().set("spark.unsafe.offHeap", "false"));
+    new TestMemoryManager(new SparkConf().set("spark.memory.offHeap.enabled", "false"));
   final TaskMemoryManager taskMemoryManager = new TaskMemoryManager(memoryManager, 0);
   final TestMemoryConsumer consumer = new TestMemoryConsumer(taskMemoryManager);
 
@@ -64,7 +64,7 @@ public void testBasicSorting() throws Exception {
       "Lychee",
       "Mango"
     };
-    final SparkConf conf = new SparkConf().set("spark.unsafe.offHeap", "false");
+    final SparkConf conf = new SparkConf().set("spark.memory.offHeap.enabled", "false");
     final TaskMemoryManager memoryManager =
       new TaskMemoryManager(new TestMemoryManager(conf), 0);
     final MemoryBlock dataPage = memoryManager.allocatePage(2048, null);
diff --git a/core/src/test/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriterSuite.java b/core/src/test/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriterSuite.java
index bc85918c59aab..5fe64bde3604a 100644
--- a/core/src/test/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriterSuite.java
+++ b/core/src/test/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriterSuite.java
@@ -108,7 +108,7 @@ public void setUp() throws IOException {
     spillFilesCreated.clear();
     conf = new SparkConf()
       .set("spark.buffer.pageSize", "1m")
-      .set("spark.unsafe.offHeap", "false");
+      .set("spark.memory.offHeap.enabled", "false");
     taskMetrics = new TaskMetrics();
     memoryManager = new TestMemoryManager(conf);
     taskMemoryManager =  new TaskMemoryManager(memoryManager, 0);
diff --git a/core/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java b/core/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java
index d87a1d2a56d99..e4d857b0ce9fb 100644
--- a/core/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java
+++ b/core/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java
@@ -84,8 +84,8 @@ public void setup() {
     memoryManager =
       new TestMemoryManager(
         new SparkConf()
-          .set("spark.unsafe.offHeap", "" + useOffHeapMemoryAllocator())
-          .set("spark.memory.offHeapSize", "256mb"));
+          .set("spark.memory.offHeap.enabled", "" + useOffHeapMemoryAllocator())
+          .set("spark.memory.offHeap.size", "256mb"));
     taskMemoryManager = new TaskMemoryManager(memoryManager, 0);
 
     tempDir = Utils.createTempDir(System.getProperty("java.io.tmpdir"), "unsafe-test");
diff --git a/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java
index a1c9f6fab8e65..e0ee281e98b71 100644
--- a/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java
+++ b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java
@@ -58,7 +58,7 @@ public class UnsafeExternalSorterSuite {
 
   final LinkedList<File> spillFilesCreated = new LinkedList<File>();
   final TestMemoryManager memoryManager =
-    new TestMemoryManager(new SparkConf().set("spark.unsafe.offHeap", "false"));
+    new TestMemoryManager(new SparkConf().set("spark.memory.offHeap.enabled", "false"));
   final TaskMemoryManager taskMemoryManager = new TaskMemoryManager(memoryManager, 0);
   // Use integer comparison for comparing prefixes (which are partition ids, in this case)
   final PrefixComparator prefixComparator = new PrefixComparator() {
diff --git a/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorterSuite.java b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorterSuite.java
index a203a09648ac0..93efd033eb940 100644
--- a/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorterSuite.java
+++ b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorterSuite.java
@@ -46,7 +46,7 @@ private static String getStringFromDataPage(Object baseObject, long baseOffset,
   @Test
   public void testSortingEmptyInput() {
     final TaskMemoryManager memoryManager = new TaskMemoryManager(
-      new TestMemoryManager(new SparkConf().set("spark.unsafe.offHeap", "false")), 0);
+      new TestMemoryManager(new SparkConf().set("spark.memory.offHeap.enabled", "false")), 0);
     final TestMemoryConsumer consumer = new TestMemoryConsumer(memoryManager);
     final UnsafeInMemorySorter sorter = new UnsafeInMemorySorter(consumer,
       memoryManager,
@@ -71,7 +71,7 @@ public void testSortingOnlyByIntegerPrefix() throws Exception {
       "Mango"
     };
     final TaskMemoryManager memoryManager = new TaskMemoryManager(
-      new TestMemoryManager(new SparkConf().set("spark.unsafe.offHeap", "false")), 0);
+      new TestMemoryManager(new SparkConf().set("spark.memory.offHeap.enabled", "false")), 0);
     final TestMemoryConsumer consumer = new TestMemoryConsumer(memoryManager);
     final MemoryBlock dataPage = memoryManager.allocatePage(2048, null);
     final Object baseObject = dataPage.getBaseObject();
diff --git a/core/src/test/scala/org/apache/spark/memory/MemoryManagerSuite.scala b/core/src/test/scala/org/apache/spark/memory/MemoryManagerSuite.scala
index f55d435fa33a6..555b640cb4244 100644
--- a/core/src/test/scala/org/apache/spark/memory/MemoryManagerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/memory/MemoryManagerSuite.scala
@@ -24,9 +24,10 @@ import scala.concurrent.duration.Duration
 import scala.concurrent.{Await, ExecutionContext, Future}
 
 import org.mockito.Matchers.{any, anyLong}
-import org.mockito.Mockito.{mock, when}
+import org.mockito.Mockito.{mock, when, RETURNS_SMART_NULLS}
 import org.mockito.invocation.InvocationOnMock
 import org.mockito.stubbing.Answer
+import org.scalatest.BeforeAndAfterEach
 import org.scalatest.time.SpanSugar._
 
 import org.apache.spark.SparkFunSuite
@@ -36,105 +37,105 @@ import org.apache.spark.storage.{BlockId, BlockStatus, MemoryStore, StorageLevel
 /**
  * Helper trait for sharing code among [[MemoryManager]] tests.
  */
-private[memory] trait MemoryManagerSuite extends SparkFunSuite {
+private[memory] trait MemoryManagerSuite extends SparkFunSuite with BeforeAndAfterEach {
 
-  import MemoryManagerSuite.DEFAULT_ENSURE_FREE_SPACE_CALLED
+  protected val evictedBlocks = new mutable.ArrayBuffer[(BlockId, BlockStatus)]
+
+  import MemoryManagerSuite.DEFAULT_EVICT_BLOCKS_TO_FREE_SPACE_CALLED
 
   // Note: Mockito's verify mechanism does not provide a way to reset method call counts
   // without also resetting stubbed methods. Since our test code relies on the latter,
-  // we need to use our own variable to track invocations of `ensureFreeSpace`.
+  // we need to use our own variable to track invocations of `evictBlocksToFreeSpace`.
 
   /**
-   * The amount of free space requested in the last call to [[MemoryStore.ensureFreeSpace]]
+   * The amount of space requested in the last call to [[MemoryStore.evictBlocksToFreeSpace]].
    *
-   * This set whenever [[MemoryStore.ensureFreeSpace]] is called, and cleared when the test
-   * code makes explicit assertions on this variable through [[assertEnsureFreeSpaceCalled]].
+   * This is set whenever [[MemoryStore.evictBlocksToFreeSpace]] is called, and cleared when the
+   * test code makes explicit assertions on this variable through
+   * [[assertEvictBlocksToFreeSpaceCalled]].
    */
-  private val ensureFreeSpaceCalled = new AtomicLong(DEFAULT_ENSURE_FREE_SPACE_CALLED)
+  private val evictBlocksToFreeSpaceCalled = new AtomicLong(0)
+
+  override def beforeEach(): Unit = {
+    super.beforeEach()
+    evictedBlocks.clear()
+    evictBlocksToFreeSpaceCalled.set(DEFAULT_EVICT_BLOCKS_TO_FREE_SPACE_CALLED)
+  }
 
   /**
-   * Make a mocked [[MemoryStore]] whose [[MemoryStore.ensureFreeSpace]] method is stubbed.
+   * Make a mocked [[MemoryStore]] whose [[MemoryStore.evictBlocksToFreeSpace]] method is stubbed.
    *
-   * This allows our test code to release storage memory when [[MemoryStore.ensureFreeSpace]]
-   * is called without relying on [[org.apache.spark.storage.BlockManager]] and all of its
-   * dependencies.
+   * This allows our test code to release storage memory when this method is called
+   * without relying on [[org.apache.spark.storage.BlockManager]] and all of its dependencies.
    */
   protected def makeMemoryStore(mm: MemoryManager): MemoryStore = {
-    val ms = mock(classOf[MemoryStore])
-    when(ms.ensureFreeSpace(anyLong(), any())).thenAnswer(ensureFreeSpaceAnswer(mm, 0))
-    when(ms.ensureFreeSpace(any(), anyLong(), any())).thenAnswer(ensureFreeSpaceAnswer(mm, 1))
+    val ms = mock(classOf[MemoryStore], RETURNS_SMART_NULLS)
+    when(ms.evictBlocksToFreeSpace(any(), anyLong(), any()))
+      .thenAnswer(evictBlocksToFreeSpaceAnswer(mm))
     mm.setMemoryStore(ms)
     ms
   }
 
   /**
-   * Make an [[Answer]] that stubs [[MemoryStore.ensureFreeSpace]] with the right arguments.
-   */
-  private def ensureFreeSpaceAnswer(mm: MemoryManager, numBytesPos: Int): Answer[Boolean] = {
+    * Simulate the part of [[MemoryStore.evictBlocksToFreeSpace]] that releases storage memory.
+    *
+    * This is a significant simplification of the real method, which actually drops existing
+    * blocks based on the size of each block. Instead, here we simply release as many bytes
+    * as needed to ensure the requested amount of free space. This allows us to set up the
+    * test without relying on the [[org.apache.spark.storage.BlockManager]], which brings in
+    * many other dependencies.
+    *
+    * Every call to this method will set a global variable, [[evictBlocksToFreeSpaceCalled]], that
+    * records the number of bytes this is called with. This variable is expected to be cleared
+    * by the test code later through [[assertEvictBlocksToFreeSpaceCalled]].
+    */
+  private def evictBlocksToFreeSpaceAnswer(mm: MemoryManager): Answer[Boolean] = {
     new Answer[Boolean] {
       override def answer(invocation: InvocationOnMock): Boolean = {
         val args = invocation.getArguments
-        require(args.size > numBytesPos, s"bad test: expected >$numBytesPos arguments " +
-          s"in ensureFreeSpace, found ${args.size}")
-        require(args(numBytesPos).isInstanceOf[Long], s"bad test: expected ensureFreeSpace " +
-          s"argument at index $numBytesPos to be a Long: ${args.mkString(", ")}")
-        val numBytes = args(numBytesPos).asInstanceOf[Long]
-        val success = mockEnsureFreeSpace(mm, numBytes)
-        if (success) {
+        val numBytesToFree = args(1).asInstanceOf[Long]
+        assert(numBytesToFree > 0)
+        require(evictBlocksToFreeSpaceCalled.get() === DEFAULT_EVICT_BLOCKS_TO_FREE_SPACE_CALLED,
+          "bad test: evictBlocksToFreeSpace() variable was not reset")
+        evictBlocksToFreeSpaceCalled.set(numBytesToFree)
+        if (numBytesToFree <= mm.storageMemoryUsed) {
+          // We can evict enough blocks to fulfill the request for space
+          mm.releaseStorageMemory(numBytesToFree)
           args.last.asInstanceOf[mutable.Buffer[(BlockId, BlockStatus)]].append(
-            (null, BlockStatus(StorageLevel.MEMORY_ONLY, numBytes, 0L, 0L)))
+            (null, BlockStatus(StorageLevel.MEMORY_ONLY, numBytesToFree, 0L, 0L)))
+          // We need to add this call so that the suite-level `evictedBlocks` is updated when
+          // execution evicts storage; in that case, args.last will not be equal to evictedBlocks
+          // because it will be a temporary buffer created inside of the MemoryManager rather than
+          // being passed in by the test code.
+          if (!(evictedBlocks eq args.last)) {
+            evictedBlocks.append(
+              (null, BlockStatus(StorageLevel.MEMORY_ONLY, numBytesToFree, 0L, 0L)))
+          }
+          true
+        } else {
+          // No blocks were evicted because eviction would not free enough space.
+          false
         }
-        success
-      }
-    }
-  }
-
-  /**
-   * Simulate the part of [[MemoryStore.ensureFreeSpace]] that releases storage memory.
-   *
-   * This is a significant simplification of the real method, which actually drops existing
-   * blocks based on the size of each block. Instead, here we simply release as many bytes
-   * as needed to ensure the requested amount of free space. This allows us to set up the
-   * test without relying on the [[org.apache.spark.storage.BlockManager]], which brings in
-   * many other dependencies.
-   *
-   * Every call to this method will set a global variable, [[ensureFreeSpaceCalled]], that
-   * records the number of bytes this is called with. This variable is expected to be cleared
-   * by the test code later through [[assertEnsureFreeSpaceCalled]].
-   */
-  private def mockEnsureFreeSpace(mm: MemoryManager, numBytes: Long): Boolean = mm.synchronized {
-    require(ensureFreeSpaceCalled.get() === DEFAULT_ENSURE_FREE_SPACE_CALLED,
-      "bad test: ensure free space variable was not reset")
-    // Record the number of bytes we freed this call
-    ensureFreeSpaceCalled.set(numBytes)
-    if (numBytes <= mm.maxStorageMemory) {
-      def freeMemory = mm.maxStorageMemory - mm.storageMemoryUsed
-      val spaceToRelease = numBytes - freeMemory
-      if (spaceToRelease > 0) {
-        mm.releaseStorageMemory(spaceToRelease)
       }
-      freeMemory >= numBytes
-    } else {
-      // We attempted to free more bytes than our max allowable memory
-      false
     }
   }
 
   /**
-   * Assert that [[MemoryStore.ensureFreeSpace]] is called with the given parameters.
+   * Assert that [[MemoryStore.evictBlocksToFreeSpace]] is called with the given parameters.
    */
-  protected def assertEnsureFreeSpaceCalled(ms: MemoryStore, numBytes: Long): Unit = {
-    assert(ensureFreeSpaceCalled.get() === numBytes,
-      s"expected ensure free space to be called with $numBytes")
-    ensureFreeSpaceCalled.set(DEFAULT_ENSURE_FREE_SPACE_CALLED)
+  protected def assertEvictBlocksToFreeSpaceCalled(ms: MemoryStore, numBytes: Long): Unit = {
+    assert(evictBlocksToFreeSpaceCalled.get() === numBytes,
+      s"expected evictBlocksToFreeSpace() to be called with $numBytes")
+    evictBlocksToFreeSpaceCalled.set(DEFAULT_EVICT_BLOCKS_TO_FREE_SPACE_CALLED)
   }
 
   /**
-   * Assert that [[MemoryStore.ensureFreeSpace]] is NOT called.
+   * Assert that [[MemoryStore.evictBlocksToFreeSpace]] is NOT called.
    */
-  protected def assertEnsureFreeSpaceNotCalled[T](ms: MemoryStore): Unit = {
-    assert(ensureFreeSpaceCalled.get() === DEFAULT_ENSURE_FREE_SPACE_CALLED,
-      "ensure free space should not have been called!")
+  protected def assertEvictBlocksToFreeSpaceNotCalled[T](ms: MemoryStore): Unit = {
+    assert(evictBlocksToFreeSpaceCalled.get() === DEFAULT_EVICT_BLOCKS_TO_FREE_SPACE_CALLED,
+      "evictBlocksToFreeSpace() should not have been called!")
+    assert(evictedBlocks.isEmpty)
   }
 
   /**
@@ -291,5 +292,5 @@ private[memory] trait MemoryManagerSuite extends SparkFunSuite {
 }
 
 private object MemoryManagerSuite {
-  private val DEFAULT_ENSURE_FREE_SPACE_CALLED = -1L
+  private val DEFAULT_EVICT_BLOCKS_TO_FREE_SPACE_CALLED = -1L
 }
diff --git a/core/src/test/scala/org/apache/spark/memory/StaticMemoryManagerSuite.scala b/core/src/test/scala/org/apache/spark/memory/StaticMemoryManagerSuite.scala
index 54cb28c389c2f..68cf26fc3ed5d 100644
--- a/core/src/test/scala/org/apache/spark/memory/StaticMemoryManagerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/memory/StaticMemoryManagerSuite.scala
@@ -17,16 +17,13 @@
 
 package org.apache.spark.memory
 
-import scala.collection.mutable.ArrayBuffer
-
 import org.mockito.Mockito.when
 
 import org.apache.spark.SparkConf
-import org.apache.spark.storage.{BlockId, BlockStatus, MemoryStore, TestBlockId}
+import org.apache.spark.storage.{MemoryStore, TestBlockId}
 
 class StaticMemoryManagerSuite extends MemoryManagerSuite {
   private val conf = new SparkConf().set("spark.storage.unrollFraction", "0.4")
-  private val evictedBlocks = new ArrayBuffer[(BlockId, BlockStatus)]
 
   /**
    * Make a [[StaticMemoryManager]] and a [[MemoryStore]] with limited class dependencies.
@@ -50,7 +47,7 @@ class StaticMemoryManagerSuite extends MemoryManagerSuite {
       conf.clone
         .set("spark.memory.fraction", "1")
         .set("spark.testing.memory", maxOnHeapExecutionMemory.toString)
-        .set("spark.memory.offHeapSize", maxOffHeapExecutionMemory.toString),
+        .set("spark.memory.offHeap.size", maxOffHeapExecutionMemory.toString),
       maxOnHeapExecutionMemory = maxOnHeapExecutionMemory,
       maxStorageMemory = 0,
       numCores = 1)
@@ -85,33 +82,38 @@ class StaticMemoryManagerSuite extends MemoryManagerSuite {
     val (mm, ms) = makeThings(Long.MaxValue, maxStorageMem)
     assert(mm.storageMemoryUsed === 0L)
     assert(mm.acquireStorageMemory(dummyBlock, 10L, evictedBlocks))
-    // `ensureFreeSpace` should be called with the number of bytes requested
-    assertEnsureFreeSpaceCalled(ms, 10L)
+    assertEvictBlocksToFreeSpaceNotCalled(ms)
     assert(mm.storageMemoryUsed === 10L)
+
     assert(mm.acquireStorageMemory(dummyBlock, 100L, evictedBlocks))
-    assertEnsureFreeSpaceCalled(ms, 100L)
+    assertEvictBlocksToFreeSpaceNotCalled(ms)
     assert(mm.storageMemoryUsed === 110L)
     // Acquire more than the max, not granted
     assert(!mm.acquireStorageMemory(dummyBlock, maxStorageMem + 1L, evictedBlocks))
-    assertEnsureFreeSpaceCalled(ms, maxStorageMem + 1L)
+    assertEvictBlocksToFreeSpaceNotCalled(ms)
     assert(mm.storageMemoryUsed === 110L)
     // Acquire up to the max, requests after this are still granted due to LRU eviction
     assert(mm.acquireStorageMemory(dummyBlock, maxStorageMem, evictedBlocks))
-    assertEnsureFreeSpaceCalled(ms, 1000L)
+    assertEvictBlocksToFreeSpaceCalled(ms, 110L)
     assert(mm.storageMemoryUsed === 1000L)
     assert(mm.acquireStorageMemory(dummyBlock, 1L, evictedBlocks))
-    assertEnsureFreeSpaceCalled(ms, 1L)
+    assertEvictBlocksToFreeSpaceCalled(ms, 1L)
+    assert(evictedBlocks.nonEmpty)
+    evictedBlocks.clear()
+    // Note: We evicted 1 byte to put another 1-byte block in, so the storage memory used remains at
+    // 1000 bytes. This is different from real behavior, where the 1-byte block would have evicted
+    // the 1000-byte block entirely. This is set up differently so we can write finer-grained tests.
     assert(mm.storageMemoryUsed === 1000L)
     mm.releaseStorageMemory(800L)
     assert(mm.storageMemoryUsed === 200L)
     // Acquire after release
     assert(mm.acquireStorageMemory(dummyBlock, 1L, evictedBlocks))
-    assertEnsureFreeSpaceCalled(ms, 1L)
+    assertEvictBlocksToFreeSpaceNotCalled(ms)
     assert(mm.storageMemoryUsed === 201L)
     mm.releaseAllStorageMemory()
     assert(mm.storageMemoryUsed === 0L)
     assert(mm.acquireStorageMemory(dummyBlock, 1L, evictedBlocks))
-    assertEnsureFreeSpaceCalled(ms, 1L)
+    assertEvictBlocksToFreeSpaceNotCalled(ms)
     assert(mm.storageMemoryUsed === 1L)
     // Release beyond what was acquired
     mm.releaseStorageMemory(100L)
@@ -133,7 +135,7 @@ class StaticMemoryManagerSuite extends MemoryManagerSuite {
     assert(mm.executionMemoryUsed === 200L)
     // Only storage memory should increase
     assert(mm.acquireStorageMemory(dummyBlock, 50L, evictedBlocks))
-    assertEnsureFreeSpaceCalled(ms, 50L)
+    assertEvictBlocksToFreeSpaceNotCalled(ms)
     assert(mm.storageMemoryUsed === 50L)
     assert(mm.executionMemoryUsed === 200L)
     // Only execution memory should be released
@@ -151,21 +153,30 @@ class StaticMemoryManagerSuite extends MemoryManagerSuite {
     val dummyBlock = TestBlockId("lonely water")
     val (mm, ms) = makeThings(Long.MaxValue, maxStorageMem)
     assert(mm.acquireUnrollMemory(dummyBlock, 100L, evictedBlocks))
-    assertEnsureFreeSpaceCalled(ms, 100L)
+    when(ms.currentUnrollMemory).thenReturn(100L)
+    assertEvictBlocksToFreeSpaceNotCalled(ms)
     assert(mm.storageMemoryUsed === 100L)
     mm.releaseUnrollMemory(40L)
     assert(mm.storageMemoryUsed === 60L)
     when(ms.currentUnrollMemory).thenReturn(60L)
-    assert(mm.acquireUnrollMemory(dummyBlock, 500L, evictedBlocks))
+    assert(mm.acquireStorageMemory(dummyBlock, 800L, evictedBlocks))
+    assertEvictBlocksToFreeSpaceNotCalled(ms)
+    assert(mm.storageMemoryUsed === 860L)
     // `spark.storage.unrollFraction` is 0.4, so the max unroll space is 400 bytes.
-    // Since we already occupy 60 bytes, we will try to ensure only 400 - 60 = 340 bytes.
-    assertEnsureFreeSpaceCalled(ms, 340L)
-    assert(mm.storageMemoryUsed === 560L)
-    when(ms.currentUnrollMemory).thenReturn(560L)
-    assert(!mm.acquireUnrollMemory(dummyBlock, 800L, evictedBlocks))
-    assert(mm.storageMemoryUsed === 560L)
-    // We already have 560 bytes > the max unroll space of 400 bytes, so no bytes are freed
-    assertEnsureFreeSpaceCalled(ms, 0L)
+    // As of this point, cache memory is 800 bytes and current unroll memory is 60 bytes.
+    // Requesting 240 more bytes of unroll memory will leave our total unroll memory at
+    // 300 bytes, still under the 400-byte limit. Therefore, all 240 bytes are granted.
+    assert(mm.acquireUnrollMemory(dummyBlock, 240L, evictedBlocks))
+    assertEvictBlocksToFreeSpaceCalled(ms, 100L) // 860 + 240 - 1000
+    when(ms.currentUnrollMemory).thenReturn(300L) // 60 + 240
+    assert(mm.storageMemoryUsed === 1000L)
+    evictedBlocks.clear()
+    // We already have 300 bytes of unroll memory, so requesting 150 more will leave us
+    // above the 400-byte limit. Since there is not enough free memory, this request will
+    // fail even after evicting as much as we can (400 - 300 = 100 bytes).
+    assert(!mm.acquireUnrollMemory(dummyBlock, 150L, evictedBlocks))
+    assertEvictBlocksToFreeSpaceCalled(ms, 100L)
+    assert(mm.storageMemoryUsed === 900L)
     // Release beyond what was acquired
     mm.releaseUnrollMemory(maxStorageMem)
     assert(mm.storageMemoryUsed === 0L)
diff --git a/core/src/test/scala/org/apache/spark/memory/UnifiedMemoryManagerSuite.scala b/core/src/test/scala/org/apache/spark/memory/UnifiedMemoryManagerSuite.scala
index e97c898a44783..6cc48597d38f9 100644
--- a/core/src/test/scala/org/apache/spark/memory/UnifiedMemoryManagerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/memory/UnifiedMemoryManagerSuite.scala
@@ -17,16 +17,13 @@
 
 package org.apache.spark.memory
 
-import scala.collection.mutable.ArrayBuffer
-
 import org.scalatest.PrivateMethodTester
 
 import org.apache.spark.SparkConf
-import org.apache.spark.storage.{BlockId, BlockStatus, MemoryStore, TestBlockId}
+import org.apache.spark.storage.{MemoryStore, TestBlockId}
 
 class UnifiedMemoryManagerSuite extends MemoryManagerSuite with PrivateMethodTester {
   private val dummyBlock = TestBlockId("--")
-  private val evictedBlocks = new ArrayBuffer[(BlockId, BlockStatus)]
 
   private val storageFraction: Double = 0.5
 
@@ -45,7 +42,7 @@ class UnifiedMemoryManagerSuite extends MemoryManagerSuite with PrivateMethodTes
     val conf = new SparkConf()
       .set("spark.memory.fraction", "1")
       .set("spark.testing.memory", maxOnHeapExecutionMemory.toString)
-      .set("spark.memory.offHeapSize", maxOffHeapExecutionMemory.toString)
+      .set("spark.memory.offHeap.size", maxOffHeapExecutionMemory.toString)
       .set("spark.memory.storageFraction", storageFraction.toString)
     UnifiedMemoryManager(conf, numCores = 1)
   }
@@ -78,33 +75,40 @@ class UnifiedMemoryManagerSuite extends MemoryManagerSuite with PrivateMethodTes
     val (mm, ms) = makeThings(maxMemory)
     assert(mm.storageMemoryUsed === 0L)
     assert(mm.acquireStorageMemory(dummyBlock, 10L, evictedBlocks))
-    // `ensureFreeSpace` should be called with the number of bytes requested
-    assertEnsureFreeSpaceCalled(ms, 10L)
+    assertEvictBlocksToFreeSpaceNotCalled(ms)
     assert(mm.storageMemoryUsed === 10L)
+
     assert(mm.acquireStorageMemory(dummyBlock, 100L, evictedBlocks))
-    assertEnsureFreeSpaceCalled(ms, 100L)
+    assertEvictBlocksToFreeSpaceNotCalled(ms)
     assert(mm.storageMemoryUsed === 110L)
     // Acquire more than the max, not granted
     assert(!mm.acquireStorageMemory(dummyBlock, maxMemory + 1L, evictedBlocks))
-    assertEnsureFreeSpaceCalled(ms, maxMemory + 1L)
+    assertEvictBlocksToFreeSpaceNotCalled(ms)
     assert(mm.storageMemoryUsed === 110L)
     // Acquire up to the max, requests after this are still granted due to LRU eviction
     assert(mm.acquireStorageMemory(dummyBlock, maxMemory, evictedBlocks))
-    assertEnsureFreeSpaceCalled(ms, 1000L)
+    assertEvictBlocksToFreeSpaceCalled(ms, 110L)
     assert(mm.storageMemoryUsed === 1000L)
+    assert(evictedBlocks.nonEmpty)
+    evictedBlocks.clear()
     assert(mm.acquireStorageMemory(dummyBlock, 1L, evictedBlocks))
-    assertEnsureFreeSpaceCalled(ms, 1L)
+    assertEvictBlocksToFreeSpaceCalled(ms, 1L)
+    assert(evictedBlocks.nonEmpty)
+    evictedBlocks.clear()
+    // Note: We evicted 1 byte to put another 1-byte block in, so the storage memory used remains at
+    // 1000 bytes. This is different from real behavior, where the 1-byte block would have evicted
+    // the 1000-byte block entirely. This is set up differently so we can write finer-grained tests.
     assert(mm.storageMemoryUsed === 1000L)
     mm.releaseStorageMemory(800L)
     assert(mm.storageMemoryUsed === 200L)
     // Acquire after release
     assert(mm.acquireStorageMemory(dummyBlock, 1L, evictedBlocks))
-    assertEnsureFreeSpaceCalled(ms, 1L)
+    assertEvictBlocksToFreeSpaceNotCalled(ms)
     assert(mm.storageMemoryUsed === 201L)
     mm.releaseAllStorageMemory()
     assert(mm.storageMemoryUsed === 0L)
     assert(mm.acquireStorageMemory(dummyBlock, 1L, evictedBlocks))
-    assertEnsureFreeSpaceCalled(ms, 1L)
+    assertEvictBlocksToFreeSpaceNotCalled(ms)
     assert(mm.storageMemoryUsed === 1L)
     // Release beyond what was acquired
     mm.releaseStorageMemory(100L)
@@ -117,25 +121,27 @@ class UnifiedMemoryManagerSuite extends MemoryManagerSuite with PrivateMethodTes
     val (mm, ms) = makeThings(maxMemory)
     // Acquire enough storage memory to exceed the storage region
     assert(mm.acquireStorageMemory(dummyBlock, 750L, evictedBlocks))
-    assertEnsureFreeSpaceCalled(ms, 750L)
+    assertEvictBlocksToFreeSpaceNotCalled(ms)
     assert(mm.executionMemoryUsed === 0L)
     assert(mm.storageMemoryUsed === 750L)
     // Execution needs to request 250 bytes to evict storage memory
     assert(mm.acquireExecutionMemory(100L, taskAttemptId, MemoryMode.ON_HEAP) === 100L)
     assert(mm.executionMemoryUsed === 100L)
     assert(mm.storageMemoryUsed === 750L)
-    assertEnsureFreeSpaceNotCalled(ms)
+    assertEvictBlocksToFreeSpaceNotCalled(ms)
     // Execution wants 200 bytes but only 150 are free, so storage is evicted
     assert(mm.acquireExecutionMemory(200L, taskAttemptId, MemoryMode.ON_HEAP) === 200L)
     assert(mm.executionMemoryUsed === 300L)
-    assertEnsureFreeSpaceCalled(ms, 50L)
-    assert(mm.executionMemoryUsed === 300L)
+    assert(mm.storageMemoryUsed === 700L)
+    assertEvictBlocksToFreeSpaceCalled(ms, 50L)
+    assert(evictedBlocks.nonEmpty)
+    evictedBlocks.clear()
     mm.releaseAllStorageMemory()
     require(mm.executionMemoryUsed === 300L)
     require(mm.storageMemoryUsed === 0, "bad test: all storage memory should have been released")
     // Acquire some storage memory again, but this time keep it within the storage region
     assert(mm.acquireStorageMemory(dummyBlock, 400L, evictedBlocks))
-    assertEnsureFreeSpaceCalled(ms, 400L)
+    assertEvictBlocksToFreeSpaceNotCalled(ms)
     assert(mm.storageMemoryUsed === 400L)
     assert(mm.executionMemoryUsed === 300L)
     // Execution cannot evict storage because the latter is within the storage fraction,
@@ -143,7 +149,27 @@ class UnifiedMemoryManagerSuite extends MemoryManagerSuite with PrivateMethodTes
     assert(mm.acquireExecutionMemory(400L, taskAttemptId, MemoryMode.ON_HEAP) === 300L)
     assert(mm.executionMemoryUsed === 600L)
     assert(mm.storageMemoryUsed === 400L)
-    assertEnsureFreeSpaceNotCalled(ms)
+    assertEvictBlocksToFreeSpaceNotCalled(ms)
+  }
+
+  test("execution memory requests smaller than free memory should evict storage (SPARK-12165)") {
+    val maxMemory = 1000L
+    val taskAttemptId = 0L
+    val (mm, ms) = makeThings(maxMemory)
+    // Acquire enough storage memory to exceed the storage region size
+    assert(mm.acquireStorageMemory(dummyBlock, 700L, evictedBlocks))
+    assertEvictBlocksToFreeSpaceNotCalled(ms)
+    assert(mm.executionMemoryUsed === 0L)
+    assert(mm.storageMemoryUsed === 700L)
+    // SPARK-12165: previously, MemoryStore would not evict anything because it would
+    // mistakenly think that the 300 bytes of free space was still available even after
+    // using it to expand the execution pool. Consequently, no storage memory was released
+    // and the following call granted only 300 bytes to execution.
+    assert(mm.acquireExecutionMemory(500L, taskAttemptId, MemoryMode.ON_HEAP) === 500L)
+    assertEvictBlocksToFreeSpaceCalled(ms, 200L)
+    assert(mm.storageMemoryUsed === 500L)
+    assert(mm.executionMemoryUsed === 500L)
+    assert(evictedBlocks.nonEmpty)
   }
 
   test("storage does not evict execution") {
@@ -154,32 +180,34 @@ class UnifiedMemoryManagerSuite extends MemoryManagerSuite with PrivateMethodTes
     assert(mm.acquireExecutionMemory(800L, taskAttemptId, MemoryMode.ON_HEAP) === 800L)
     assert(mm.executionMemoryUsed === 800L)
     assert(mm.storageMemoryUsed === 0L)
-    assertEnsureFreeSpaceNotCalled(ms)
+    assertEvictBlocksToFreeSpaceNotCalled(ms)
     // Storage should not be able to evict execution
     assert(mm.acquireStorageMemory(dummyBlock, 100L, evictedBlocks))
     assert(mm.executionMemoryUsed === 800L)
     assert(mm.storageMemoryUsed === 100L)
-    assertEnsureFreeSpaceCalled(ms, 100L)
+    assertEvictBlocksToFreeSpaceNotCalled(ms)
     assert(!mm.acquireStorageMemory(dummyBlock, 250L, evictedBlocks))
     assert(mm.executionMemoryUsed === 800L)
     assert(mm.storageMemoryUsed === 100L)
-    assertEnsureFreeSpaceCalled(ms, 250L)
+    // Do not attempt to evict blocks, since evicting will not free enough memory:
+    assertEvictBlocksToFreeSpaceNotCalled(ms)
     mm.releaseExecutionMemory(maxMemory, taskAttemptId, MemoryMode.ON_HEAP)
     mm.releaseStorageMemory(maxMemory)
     // Acquire some execution memory again, but this time keep it within the execution region
     assert(mm.acquireExecutionMemory(200L, taskAttemptId, MemoryMode.ON_HEAP) === 200L)
     assert(mm.executionMemoryUsed === 200L)
     assert(mm.storageMemoryUsed === 0L)
-    assertEnsureFreeSpaceNotCalled(ms)
+    assertEvictBlocksToFreeSpaceNotCalled(ms)
     // Storage should still not be able to evict execution
     assert(mm.acquireStorageMemory(dummyBlock, 750L, evictedBlocks))
     assert(mm.executionMemoryUsed === 200L)
     assert(mm.storageMemoryUsed === 750L)
-    assertEnsureFreeSpaceCalled(ms, 750L)
+    assertEvictBlocksToFreeSpaceNotCalled(ms) // since there were 800 bytes free
     assert(!mm.acquireStorageMemory(dummyBlock, 850L, evictedBlocks))
     assert(mm.executionMemoryUsed === 200L)
     assert(mm.storageMemoryUsed === 750L)
-    assertEnsureFreeSpaceCalled(ms, 850L)
+    // Do not attempt to evict blocks, since evicting will not free enough memory:
+    assertEvictBlocksToFreeSpaceNotCalled(ms)
   }
 
   test("small heap") {
@@ -202,4 +230,29 @@ class UnifiedMemoryManagerSuite extends MemoryManagerSuite with PrivateMethodTes
     assert(exception.getMessage.contains("larger heap size"))
   }
 
+  test("execution can evict cached blocks when there are multiple active tasks (SPARK-12155)") {
+    val conf = new SparkConf()
+      .set("spark.memory.fraction", "1")
+      .set("spark.memory.storageFraction", "0")
+      .set("spark.testing.memory", "1000")
+    val mm = UnifiedMemoryManager(conf, numCores = 2)
+    val ms = makeMemoryStore(mm)
+    assert(mm.maxMemory === 1000)
+    // Have two tasks each acquire some execution memory so that the memory pool registers that
+    // there are two active tasks:
+    assert(mm.acquireExecutionMemory(100L, 0, MemoryMode.ON_HEAP) === 100L)
+    assert(mm.acquireExecutionMemory(100L, 1, MemoryMode.ON_HEAP) === 100L)
+    // Fill up all of the remaining memory with storage.
+    assert(mm.acquireStorageMemory(dummyBlock, 800L, evictedBlocks))
+    assertEvictBlocksToFreeSpaceNotCalled(ms)
+    assert(mm.storageMemoryUsed === 800)
+    assert(mm.executionMemoryUsed === 200)
+    // A task should still be able to allocate 100 bytes execution memory by evicting blocks
+    assert(mm.acquireExecutionMemory(100L, 0, MemoryMode.ON_HEAP) === 100L)
+    assertEvictBlocksToFreeSpaceCalled(ms, 100L)
+    assert(mm.executionMemoryUsed === 300)
+    assert(mm.storageMemoryUsed === 700)
+    assert(evictedBlocks.nonEmpty)
+  }
+
 }
diff --git a/docker-integration-tests/pom.xml b/docker-integration-tests/pom.xml
index 1a7fd363bff20..0f4c3af86f530 100644
--- a/docker-integration-tests/pom.xml
+++ b/docker-integration-tests/pom.xml
@@ -71,6 +71,18 @@
         </exclusion>
       </exclusions>
     </dependency>
+    <dependency>
+      <groupId>org.apache.httpcomponents</groupId>
+      <artifactId>httpclient</artifactId>
+      <version>4.5</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.httpcomponents</groupId>
+      <artifactId>httpcore</artifactId>
+      <version>4.4.1</version>
+      <scope>test</scope>
+    </dependency>
     <!-- Necessary in order to avoid errors in log messages: -->
     <dependency>
       <groupId>com.google.guava</groupId>
@@ -109,6 +121,16 @@
       <version>${project.version}</version>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>mysql</groupId>
+      <artifactId>mysql-connector-java</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.postgresql</groupId>
+      <artifactId>postgresql</artifactId>
+      <scope>test</scope>
+    </dependency>
     <!-- Jersey dependencies, used to override version.
      See https://github.com/apache/spark/pull/9503#issuecomment-154369560 for
      background on why we need to use a newer Jersey only in this test module;
diff --git a/docs/_data/menu-ml.yaml b/docs/_data/menu-ml.yaml
index fe37d0573e46b..2eea9a917a4cc 100644
--- a/docs/_data/menu-ml.yaml
+++ b/docs/_data/menu-ml.yaml
@@ -1,5 +1,5 @@
 - text: "Overview: estimators, transformers and pipelines"
-  url: ml-intro.html
+  url: ml-guide.html
 - text: Extracting, transforming and selecting features
   url: ml-features.html
 - text: Classification and Regression
diff --git a/docs/_includes/nav-left-wrapper-ml.html b/docs/_includes/nav-left-wrapper-ml.html
index 0103e890cc21a..e2d7eda027c6e 100644
--- a/docs/_includes/nav-left-wrapper-ml.html
+++ b/docs/_includes/nav-left-wrapper-ml.html
@@ -1,8 +1,8 @@
 <div class="left-menu-wrapper">
     <div class="left-menu">
-        <h3>spark.ml package</h3>
+        <h3><a href="ml-guide.html">spark.ml package</a></h3>
         {% include nav-left.html nav=include.nav-ml %}
-        <h3>spark.mllib package</h3>
+        <h3><a href="mllib-guide.html">spark.mllib package</a></h3>
         {% include nav-left.html nav=include.nav-mllib %}
     </div>
 </div>
\ No newline at end of file
diff --git a/docs/configuration.md b/docs/configuration.md
index ee20733120caa..488dc52ed61ed 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -738,6 +738,22 @@ Apart from these, the following properties are also available, and may be useful
     <a href="tuning.html#memory-management-overview">this description</a>.
   </td>
 </tr>
+<tr>
+  <td><code>spark.memory.offHeap.enabled</code></td>
+  <td>false</td>
+  <td>
+    If true, Spark will attempt to use off-heap memory for certain operations. If off-heap memory use is enabled, then <code>spark.memory.offHeap.size</code> must be positive.
+  </td>
+</tr>
+<tr>
+  <td><code>spark.memory.offHeap.size</code></td>
+  <td>0</td>
+  <td>
+    The absolute amount of memory which can be used for off-heap allocation.
+    This setting has no impact on heap memory usage, so if your executors' total memory consumption must fit within some hard limit then be sure to shrink your JVM heap size accordingly.
+    This must be set to a positive value when <code>spark.memory.offHeap.enabled=true</code>.
+  </td>
+</tr>
 <tr>
   <td><code>spark.memory.useLegacyMode</code></td>
   <td>false</td>
diff --git a/docs/graphx-programming-guide.md b/docs/graphx-programming-guide.md
index 6a512ab234bb2..9dea9b5904d2d 100644
--- a/docs/graphx-programming-guide.md
+++ b/docs/graphx-programming-guide.md
@@ -70,7 +70,7 @@ operators (e.g., [subgraph](#structural_operators), [joinVertices](#join_operato
 
 ## Migrating from Spark 1.1
 
-GraphX in Spark {{site.SPARK_VERSION}} contains a few user facing API changes:
+GraphX in Spark 1.2 contains a few user-facing API changes:
 
 1. To improve performance we have introduced a new version of
 [`mapReduceTriplets`][Graph.mapReduceTriplets] called
diff --git a/docs/ml-advanced.md b/docs/ml-advanced.md
index b005633e56c11..91731d78a2d43 100644
--- a/docs/ml-advanced.md
+++ b/docs/ml-advanced.md
@@ -1,7 +1,7 @@
 ---
 layout: global
 title: Advanced topics - spark.ml
-displayTitle: Advanced topics
+displayTitle: Advanced topics - spark.ml
 ---
 
 # Optimization of linear methods
diff --git a/docs/ml-ann.md b/docs/ml-ann.md
new file mode 100644
index 0000000000000..c2d9bd200f62f
--- /dev/null
+++ b/docs/ml-ann.md
@@ -0,0 +1,8 @@
+---
+layout: global
+title: Multilayer perceptron classifier - spark.ml
+displayTitle: Multilayer perceptron classifier - spark.ml
+---
+
+  > This section has been moved into the
+   [classification and regression section](ml-classification-regression.html#multilayer-perceptron-classifier).
diff --git a/docs/ml-classification-regression.md b/docs/ml-classification-regression.md
index 3663ffee32753..d63438bf74c17 100644
--- a/docs/ml-classification-regression.md
+++ b/docs/ml-classification-regression.md
@@ -1,7 +1,7 @@
 ---
 layout: global
 title: Classification and regression - spark.ml
-displayTitle: Classification and regression in spark.ml
+displayTitle: Classification and regression - spark.ml
 ---
 
 
@@ -27,10 +27,10 @@ displayTitle: Classification and regression in spark.ml
 * This will become a table of contents (this text will be scraped).
 {:toc}
 
-In MLlib, we implement popular linear methods such as logistic
+In `spark.ml`, we implement popular linear methods such as logistic
 regression and linear least squares with $L_1$ or $L_2$ regularization.
 Refer to [the linear methods in mllib](mllib-linear-methods.html) for
-details.  In `spark.ml`, we also include Pipelines API for [Elastic
+details about implementation and tuning.  We also include a DataFrame API for [Elastic
 net](http://en.wikipedia.org/wiki/Elastic_net_regularization), a hybrid
 of $L_1$ and $L_2$ regularization proposed in [Zou et al, Regularization
 and variable selection via the elastic
@@ -86,7 +86,7 @@ $\alpha$ and `regParam` corresponds to $\lambda$.
 
 The `spark.ml` implementation of logistic regression also supports
 extracting a summary of the model over the training set. Note that the
-predictions and metrics which are stored as `Dataframe` in
+predictions and metrics which are stored as `DataFrame` in
 `BinaryLogisticRegressionSummary` are annotated `@transient` and hence
 only available on the driver.
 
@@ -523,7 +523,7 @@ feature scaling, and are able to capture non-linearities and feature interaction
 algorithms such as random forests and boosting are among the top performers for classification and
 regression tasks.
 
-MLlib supports decision trees for binary and multiclass classification and for regression,
+The `spark.ml` implementation supports decision trees for binary and multiclass classification and for regression,
 using both continuous and categorical features. The implementation partitions data by rows,
 allowing distributed training with millions or even billions of instances.
 
@@ -611,24 +611,25 @@ All output columns are optional; to exclude an output column, set its correspond
 
 # Tree Ensembles
 
-The Pipelines API supports two major tree ensemble algorithms: [Random Forests](http://en.wikipedia.org/wiki/Random_forest) and [Gradient-Boosted Trees (GBTs)](http://en.wikipedia.org/wiki/Gradient_boosting).
-Both use [MLlib decision trees](ml-decision-tree.html) as their base models.
+The DataFrame API supports two major tree ensemble algorithms: [Random Forests](http://en.wikipedia.org/wiki/Random_forest) and [Gradient-Boosted Trees (GBTs)](http://en.wikipedia.org/wiki/Gradient_boosting).
+Both use [`spark.ml` decision trees](ml-classification-regression.html#decision-trees) as their base models.
 
-Users can find more information about ensemble algorithms in the [MLlib Ensemble guide](mllib-ensembles.html).  In this section, we demonstrate the Pipelines API for ensembles.
+Users can find more information about ensemble algorithms in the [MLlib Ensemble guide](mllib-ensembles.html).
+In this section, we demonstrate the DataFrame API for ensembles.
 
 The main differences between this API and the [original MLlib ensembles API](mllib-ensembles.html) are:
 
-* support for ML Pipelines
+* support for DataFrames and ML Pipelines
 * separation of classification vs. regression
 * use of DataFrame metadata to distinguish continuous and categorical features
-* a bit more functionality for random forests: estimates of feature importance, as well as the predicted probability of each class (a.k.a. class conditional probabilities) for classification.
+* more functionality for random forests: estimates of feature importance, as well as the predicted probability of each class (a.k.a. class conditional probabilities) for classification.
 
 ## Random Forests
 
 [Random forests](http://en.wikipedia.org/wiki/Random_forest)
 are ensembles of [decision trees](ml-decision-tree.html).
 Random forests combine many decision trees in order to reduce the risk of overfitting.
-MLlib supports random forests for binary and multiclass classification and for regression,
+The `spark.ml` implementation supports random forests for binary and multiclass classification and for regression,
 using both continuous and categorical features.
 
 For more information on the algorithm itself, please see the [`spark.mllib` documentation on random forests](mllib-ensembles.html).
@@ -709,7 +710,7 @@ All output columns are optional; to exclude an output column, set its correspond
 [Gradient-Boosted Trees (GBTs)](http://en.wikipedia.org/wiki/Gradient_boosting)
 are ensembles of [decision trees](ml-decision-tree.html).
 GBTs iteratively train decision trees in order to minimize a loss function.
-MLlib supports GBTs for binary classification and for regression,
+The `spark.ml` implementation supports GBTs for binary classification and for regression,
 using both continuous and categorical features.
 
 For more information on the algorithm itself, please see the [`spark.mllib` documentation on GBTs](mllib-ensembles.html).
diff --git a/docs/ml-clustering.md b/docs/ml-clustering.md
index 697777714b05b..a59f7e3005a3e 100644
--- a/docs/ml-clustering.md
+++ b/docs/ml-clustering.md
@@ -1,7 +1,7 @@
 ---
 layout: global
-title: Clustering - ML
-displayTitle: <a href="ml-guide.html">ML</a> - Clustering
+title: Clustering - spark.ml
+displayTitle: Clustering - spark.ml
 ---
 
 In this section, we introduce the pipeline API for [clustering in mllib](mllib-clustering.html).
diff --git a/docs/ml-decision-tree.md b/docs/ml-decision-tree.md
index 2bfac6f6c8378..a721d55bc675b 100644
--- a/docs/ml-decision-tree.md
+++ b/docs/ml-decision-tree.md
@@ -1,171 +1,8 @@
 ---
 layout: global
-title: Decision Trees - SparkML
-displayTitle: <a href="ml-guide.html">ML</a> - Decision Trees
+title: Decision trees - spark.ml
+displayTitle: Decision trees - spark.ml
 ---
 
-**Table of Contents**
-
-* This will become a table of contents (this text will be scraped).
-{:toc}
-
-
-# Overview
-
-[Decision trees](http://en.wikipedia.org/wiki/Decision_tree_learning)
-and their ensembles are popular methods for the machine learning tasks of
-classification and regression. Decision trees are widely used since they are easy to interpret,
-handle categorical features, extend to the multiclass classification setting, do not require
-feature scaling, and are able to capture non-linearities and feature interactions. Tree ensemble
-algorithms such as random forests and boosting are among the top performers for classification and
-regression tasks.
-
-MLlib supports decision trees for binary and multiclass classification and for regression,
-using both continuous and categorical features. The implementation partitions data by rows,
-allowing distributed training with millions or even billions of instances.
-
-Users can find more information about the decision tree algorithm in the [MLlib Decision Tree guide](mllib-decision-tree.html).  In this section, we demonstrate the Pipelines API for Decision Trees.
-
-The Pipelines API for Decision Trees offers a bit more functionality than the original API.  In particular, for classification, users can get the predicted probability of each class (a.k.a. class conditional probabilities).
-
-Ensembles of trees (Random Forests and Gradient-Boosted Trees) are described in the [Ensembles guide](ml-ensembles.html).
-
-# Inputs and Outputs
-
-We list the input and output (prediction) column types here.
-All output columns are optional; to exclude an output column, set its corresponding Param to an empty string.
-
-## Input Columns
-
-<table class="table">
-  <thead>
-    <tr>
-      <th align="left">Param name</th>
-      <th align="left">Type(s)</th>
-      <th align="left">Default</th>
-      <th align="left">Description</th>
-    </tr>
-  </thead>
-  <tbody>
-    <tr>
-      <td>labelCol</td>
-      <td>Double</td>
-      <td>"label"</td>
-      <td>Label to predict</td>
-    </tr>
-    <tr>
-      <td>featuresCol</td>
-      <td>Vector</td>
-      <td>"features"</td>
-      <td>Feature vector</td>
-    </tr>
-  </tbody>
-</table>
-
-## Output Columns
-
-<table class="table">
-  <thead>
-    <tr>
-      <th align="left">Param name</th>
-      <th align="left">Type(s)</th>
-      <th align="left">Default</th>
-      <th align="left">Description</th>
-      <th align="left">Notes</th>
-    </tr>
-  </thead>
-  <tbody>
-    <tr>
-      <td>predictionCol</td>
-      <td>Double</td>
-      <td>"prediction"</td>
-      <td>Predicted label</td>
-      <td></td>
-    </tr>
-    <tr>
-      <td>rawPredictionCol</td>
-      <td>Vector</td>
-      <td>"rawPrediction"</td>
-      <td>Vector of length # classes, with the counts of training instance labels at the tree node which makes the prediction</td>
-      <td>Classification only</td>
-    </tr>
-    <tr>
-      <td>probabilityCol</td>
-      <td>Vector</td>
-      <td>"probability"</td>
-      <td>Vector of length # classes equal to rawPrediction normalized to a multinomial distribution</td>
-      <td>Classification only</td>
-    </tr>
-  </tbody>
-</table>
-
-# Examples
-
-The below examples demonstrate the Pipelines API for Decision Trees. The main differences between this API and the [original MLlib Decision Tree API](mllib-decision-tree.html) are:
-
-* support for ML Pipelines
-* separation of Decision Trees for classification vs. regression
-* use of DataFrame metadata to distinguish continuous and categorical features
-
-
-## Classification
-
-The following examples load a dataset in LibSVM format, split it into training and test sets, train on the first dataset, and then evaluate on the held-out test set.
-We use two feature transformers to prepare the data; these help index categories for the label and categorical features, adding metadata to the `DataFrame` which the Decision Tree algorithm can recognize.
-
-<div class="codetabs">
-<div data-lang="scala" markdown="1">
-
-More details on parameters can be found in the [Scala API documentation](api/scala/index.html#org.apache.spark.ml.classification.DecisionTreeClassifier).
-
-{% include_example scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala %}
-
-</div>
-
-<div data-lang="java" markdown="1">
-
-More details on parameters can be found in the [Java API documentation](api/java/org/apache/spark/ml/classification/DecisionTreeClassifier.html).
-
-{% include_example java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java %}
-
-</div>
-
-<div data-lang="python" markdown="1">
-
-More details on parameters can be found in the [Python API documentation](api/python/pyspark.ml.html#pyspark.ml.classification.DecisionTreeClassifier).
-
-{% include_example python/ml/decision_tree_classification_example.py %}
-
-</div>
-
-</div>
-
-
-## Regression
-
-The following examples load a dataset in LibSVM format, split it into training and test sets, train on the first dataset, and then evaluate on the held-out test set.
-We use a feature transformer to index categorical features, adding metadata to the `DataFrame` which the Decision Tree algorithm can recognize.
-
-<div class="codetabs">
-<div data-lang="scala" markdown="1">
-
-More details on parameters can be found in the [Scala API documentation](api/scala/index.html#org.apache.spark.ml.regression.DecisionTreeRegressor).
-
-{% include_example scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala %}
-</div>
-
-<div data-lang="java" markdown="1">
-
-More details on parameters can be found in the [Java API documentation](api/java/org/apache/spark/ml/regression/DecisionTreeRegressor.html).
-
-{% include_example java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java %}
-</div>
-
-<div data-lang="python" markdown="1">
-
-More details on parameters can be found in the [Python API documentation](api/python/pyspark.ml.html#pyspark.ml.regression.DecisionTreeRegressor).
-
-{% include_example python/ml/decision_tree_regression_example.py %}
-</div>
-
-</div>
+  > This section has been moved into the
+   [classification and regression section](ml-classification-regression.html#decision-trees).
diff --git a/docs/ml-ensembles.md b/docs/ml-ensembles.md
index 14fef76f260ff..303773e8038fc 100644
--- a/docs/ml-ensembles.md
+++ b/docs/ml-ensembles.md
@@ -1,319 +1,8 @@
 ---
 layout: global
-title: Ensembles
-displayTitle: <a href="ml-guide.html">ML</a> - Ensembles
+title: Tree ensemble methods - spark.ml
+displayTitle: Tree ensemble methods - spark.ml
 ---
 
-**Table of Contents**
-
-* This will become a table of contents (this text will be scraped).
-{:toc}
-
-An [ensemble method](http://en.wikipedia.org/wiki/Ensemble_learning)
-is a learning algorithm which creates a model composed of a set of other base models.
-
-## Tree Ensembles
-
-The Pipelines API supports two major tree ensemble algorithms: [Random Forests](http://en.wikipedia.org/wiki/Random_forest) and [Gradient-Boosted Trees (GBTs)](http://en.wikipedia.org/wiki/Gradient_boosting).
-Both use [MLlib decision trees](ml-decision-tree.html) as their base models.
-
-Users can find more information about ensemble algorithms in the [MLlib Ensemble guide](mllib-ensembles.html).  In this section, we demonstrate the Pipelines API for ensembles.
-
-The main differences between this API and the [original MLlib ensembles API](mllib-ensembles.html) are:
-
-* support for ML Pipelines
-* separation of classification vs. regression
-* use of DataFrame metadata to distinguish continuous and categorical features
-* a bit more functionality for random forests: estimates of feature importance, as well as the predicted probability of each class (a.k.a. class conditional probabilities) for classification.
-
-### Random Forests
-
-[Random forests](http://en.wikipedia.org/wiki/Random_forest)
-are ensembles of [decision trees](ml-decision-tree.html).
-Random forests combine many decision trees in order to reduce the risk of overfitting.
-MLlib supports random forests for binary and multiclass classification and for regression,
-using both continuous and categorical features.
-
-This section gives examples of using random forests with the Pipelines API.
-For more information on the algorithm, please see the [main MLlib docs on random forests](mllib-ensembles.html).
-
-#### Inputs and Outputs
-
-We list the input and output (prediction) column types here.
-All output columns are optional; to exclude an output column, set its corresponding Param to an empty string.
-
-##### Input Columns
-
-<table class="table">
-  <thead>
-    <tr>
-      <th align="left">Param name</th>
-      <th align="left">Type(s)</th>
-      <th align="left">Default</th>
-      <th align="left">Description</th>
-    </tr>
-  </thead>
-  <tbody>
-    <tr>
-      <td>labelCol</td>
-      <td>Double</td>
-      <td>"label"</td>
-      <td>Label to predict</td>
-    </tr>
-    <tr>
-      <td>featuresCol</td>
-      <td>Vector</td>
-      <td>"features"</td>
-      <td>Feature vector</td>
-    </tr>
-  </tbody>
-</table>
-
-##### Output Columns (Predictions)
-
-<table class="table">
-  <thead>
-    <tr>
-      <th align="left">Param name</th>
-      <th align="left">Type(s)</th>
-      <th align="left">Default</th>
-      <th align="left">Description</th>
-      <th align="left">Notes</th>
-    </tr>
-  </thead>
-  <tbody>
-    <tr>
-      <td>predictionCol</td>
-      <td>Double</td>
-      <td>"prediction"</td>
-      <td>Predicted label</td>
-      <td></td>
-    </tr>
-    <tr>
-      <td>rawPredictionCol</td>
-      <td>Vector</td>
-      <td>"rawPrediction"</td>
-      <td>Vector of length # classes, with the counts of training instance labels at the tree node which makes the prediction</td>
-      <td>Classification only</td>
-    </tr>
-    <tr>
-      <td>probabilityCol</td>
-      <td>Vector</td>
-      <td>"probability"</td>
-      <td>Vector of length # classes equal to rawPrediction normalized to a multinomial distribution</td>
-      <td>Classification only</td>
-    </tr>
-  </tbody>
-</table>
-
-#### Example: Classification
-
-The following examples load a dataset in LibSVM format, split it into training and test sets, train on the first dataset, and then evaluate on the held-out test set.
-We use two feature transformers to prepare the data; these help index categories for the label and categorical features, adding metadata to the `DataFrame` which the tree-based algorithms can recognize.
-
-<div class="codetabs">
-<div data-lang="scala" markdown="1">
-
-Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.classification.RandomForestClassifier) for more details.
-
-{% include_example scala/org/apache/spark/examples/ml/RandomForestClassifierExample.scala %}
-</div>
-
-<div data-lang="java" markdown="1">
-
-Refer to the [Java API docs](api/java/org/apache/spark/ml/classification/RandomForestClassifier.html) for more details.
-
-{% include_example java/org/apache/spark/examples/ml/JavaRandomForestClassifierExample.java %}
-</div>
-
-<div data-lang="python" markdown="1">
-
-Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.classification.RandomForestClassifier) for more details.
-
-{% include_example python/ml/random_forest_classifier_example.py %}
-</div>
-</div>
-
-#### Example: Regression
-
-The following examples load a dataset in LibSVM format, split it into training and test sets, train on the first dataset, and then evaluate on the held-out test set.
-We use a feature transformer to index categorical features, adding metadata to the `DataFrame` which the tree-based algorithms can recognize.
-
-<div class="codetabs">
-<div data-lang="scala" markdown="1">
-
-Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.regression.RandomForestRegressor) for more details.
-
-{% include_example scala/org/apache/spark/examples/ml/RandomForestRegressorExample.scala %}
-</div>
-
-<div data-lang="java" markdown="1">
-
-Refer to the [Java API docs](api/java/org/apache/spark/ml/regression/RandomForestRegressor.html) for more details.
-
-{% include_example java/org/apache/spark/examples/ml/JavaRandomForestRegressorExample.java %}
-</div>
-
-<div data-lang="python" markdown="1">
-
-Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.regression.RandomForestRegressor) for more details.
-
-{% include_example python/ml/random_forest_regressor_example.py %}
-</div>
-</div>
-
-### Gradient-Boosted Trees (GBTs)
-
-[Gradient-Boosted Trees (GBTs)](http://en.wikipedia.org/wiki/Gradient_boosting)
-are ensembles of [decision trees](ml-decision-tree.html).
-GBTs iteratively train decision trees in order to minimize a loss function.
-MLlib supports GBTs for binary classification and for regression,
-using both continuous and categorical features.
-
-This section gives examples of using GBTs with the Pipelines API.
-For more information on the algorithm, please see the [main MLlib docs on GBTs](mllib-ensembles.html).
-
-#### Inputs and Outputs
-
-We list the input and output (prediction) column types here.
-All output columns are optional; to exclude an output column, set its corresponding Param to an empty string.
-
-##### Input Columns
-
-<table class="table">
-  <thead>
-    <tr>
-      <th align="left">Param name</th>
-      <th align="left">Type(s)</th>
-      <th align="left">Default</th>
-      <th align="left">Description</th>
-    </tr>
-  </thead>
-  <tbody>
-    <tr>
-      <td>labelCol</td>
-      <td>Double</td>
-      <td>"label"</td>
-      <td>Label to predict</td>
-    </tr>
-    <tr>
-      <td>featuresCol</td>
-      <td>Vector</td>
-      <td>"features"</td>
-      <td>Feature vector</td>
-    </tr>
-  </tbody>
-</table>
-
-Note that `GBTClassifier` currently only supports binary labels.
-
-##### Output Columns (Predictions)
-
-<table class="table">
-  <thead>
-    <tr>
-      <th align="left">Param name</th>
-      <th align="left">Type(s)</th>
-      <th align="left">Default</th>
-      <th align="left">Description</th>
-      <th align="left">Notes</th>
-    </tr>
-  </thead>
-  <tbody>
-    <tr>
-      <td>predictionCol</td>
-      <td>Double</td>
-      <td>"prediction"</td>
-      <td>Predicted label</td>
-      <td></td>
-    </tr>
-  </tbody>
-</table>
-
-In the future, `GBTClassifier` will also output columns for `rawPrediction` and `probability`, just as `RandomForestClassifier` does.
-
-#### Example: Classification
-
-The following examples load a dataset in LibSVM format, split it into training and test sets, train on the first dataset, and then evaluate on the held-out test set.
-We use two feature transformers to prepare the data; these help index categories for the label and categorical features, adding metadata to the `DataFrame` which the tree-based algorithms can recognize.
-
-<div class="codetabs">
-<div data-lang="scala" markdown="1">
-
-Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.classification.GBTClassifier) for more details.
-
-{% include_example scala/org/apache/spark/examples/ml/GradientBoostedTreeClassifierExample.scala %}
-</div>
-
-<div data-lang="java" markdown="1">
-
-Refer to the [Java API docs](api/java/org/apache/spark/ml/classification/GBTClassifier.html) for more details.
-
-{% include_example java/org/apache/spark/examples/ml/JavaGradientBoostedTreeClassifierExample.java %}
-</div>
-
-<div data-lang="python" markdown="1">
-
-Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.classification.GBTClassifier) for more details.
-
-{% include_example python/ml/gradient_boosted_tree_classifier_example.py %}
-</div>
-</div>
-
-#### Example: Regression
-
-Note: For this example dataset, `GBTRegressor` actually only needs 1 iteration, but that will not
-be true in general.
-
-<div class="codetabs">
-<div data-lang="scala" markdown="1">
-
-Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.regression.GBTRegressor) for more details.
-
-{% include_example scala/org/apache/spark/examples/ml/GradientBoostedTreeRegressorExample.scala %}
-</div>
-
-<div data-lang="java" markdown="1">
-
-Refer to the [Java API docs](api/java/org/apache/spark/ml/regression/GBTRegressor.html) for more details.
-
-{% include_example java/org/apache/spark/examples/ml/JavaGradientBoostedTreeRegressorExample.java %}
-</div>
-
-<div data-lang="python" markdown="1">
-
-Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.regression.GBTRegressor) for more details.
-
-{% include_example python/ml/gradient_boosted_tree_regressor_example.py %}
-</div>
-</div>
-
-
-## One-vs-Rest (a.k.a. One-vs-All)
-
-[OneVsRest](http://en.wikipedia.org/wiki/Multiclass_classification#One-vs.-rest) is an example of a machine learning reduction for performing multiclass classification given a base classifier that can perform binary classification efficiently.  It is also known as "One-vs-All."
-
-`OneVsRest` is implemented as an `Estimator`. For the base classifier it takes instances of `Classifier` and creates a binary classification problem for each of the k classes. The classifier for class i is trained to predict whether the label is i or not, distinguishing class i from all other classes.
-
-Predictions are done by evaluating each binary classifier and the index of the most confident classifier is output as label.
-
-### Example
-
-The example below demonstrates how to load the
-[Iris dataset](http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/iris.scale), parse it as a DataFrame and perform multiclass classification using `OneVsRest`. The test error is calculated to measure the algorithm accuracy.
-
-<div class="codetabs">
-<div data-lang="scala" markdown="1">
-
-Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.classifier.OneVsRest) for more details.
-
-{% include_example scala/org/apache/spark/examples/ml/OneVsRestExample.scala %}
-</div>
-
-<div data-lang="java" markdown="1">
-
-Refer to the [Java API docs](api/java/org/apache/spark/ml/classification/OneVsRest.html) for more details.
-
-{% include_example java/org/apache/spark/examples/ml/JavaOneVsRestExample.java %}
-</div>
-</div>
+  > This section has been moved into the
+   [classification and regression section](ml-classification-regression.html#tree-ensembles).
diff --git a/docs/ml-features.md b/docs/ml-features.md
index 55e401221917e..6494fed0a01e5 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -1,7 +1,7 @@
 ---
 layout: global
-title: Extracting, transforming and selecting features
-displayTitle: Extracting, transforming and selecting features
+title: Extracting, transforming and selecting features - spark.ml
+displayTitle: Extracting, transforming and selecting features - spark.ml
 ---
 
 This section covers algorithms for working with features, roughly divided into these groups:
@@ -170,25 +170,7 @@ Refer to the [Tokenizer Scala docs](api/scala/index.html#org.apache.spark.ml.fea
 and the [RegexTokenizer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.Tokenizer)
 for more details on the API.
 
-{% highlight scala %}
-import org.apache.spark.ml.feature.{Tokenizer, RegexTokenizer}
-
-val sentenceDataFrame = sqlContext.createDataFrame(Seq(
-  (0, "Hi I heard about Spark"),
-  (1, "I wish Java could use case classes"),
-  (2, "Logistic,regression,models,are,neat")
-)).toDF("label", "sentence")
-val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
-val regexTokenizer = new RegexTokenizer()
-  .setInputCol("sentence")
-  .setOutputCol("words")
-  .setPattern("\\W")  // alternatively .setPattern("\\w+").setGaps(false)
-
-val tokenized = tokenizer.transform(sentenceDataFrame)
-tokenized.select("words", "label").take(3).foreach(println)
-val regexTokenized = regexTokenizer.transform(sentenceDataFrame)
-regexTokenized.select("words", "label").take(3).foreach(println)
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/TokenizerExample.scala %}
 </div>
 
 <div data-lang="java" markdown="1">
@@ -197,44 +179,7 @@ Refer to the [Tokenizer Java docs](api/java/org/apache/spark/ml/feature/Tokenize
 and the [RegexTokenizer Java docs](api/java/org/apache/spark/ml/feature/RegexTokenizer.html)
 for more details on the API.
 
-{% highlight java %}
-import java.util.Arrays;
-
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.ml.feature.RegexTokenizer;
-import org.apache.spark.ml.feature.Tokenizer;
-import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.types.DataTypes;
-import org.apache.spark.sql.types.Metadata;
-import org.apache.spark.sql.types.StructField;
-import org.apache.spark.sql.types.StructType;
-
-JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
-  RowFactory.create(0, "Hi I heard about Spark"),
-  RowFactory.create(1, "I wish Java could use case classes"),
-  RowFactory.create(2, "Logistic,regression,models,are,neat")
-));
-StructType schema = new StructType(new StructField[]{
-  new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
-  new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
-});
-DataFrame sentenceDataFrame = sqlContext.createDataFrame(jrdd, schema);
-Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
-DataFrame wordsDataFrame = tokenizer.transform(sentenceDataFrame);
-for (Row r : wordsDataFrame.select("words", "label").take(3)) {
-  java.util.List<String> words = r.getList(0);
-  for (String word : words) System.out.print(word + " ");
-  System.out.println();
-}
-
-RegexTokenizer regexTokenizer = new RegexTokenizer()
-  .setInputCol("sentence")
-  .setOutputCol("words")
-  .setPattern("\\W");  // alternatively .setPattern("\\w+").setGaps(false);
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaTokenizerExample.java %}
 </div>
 
 <div data-lang="python" markdown="1">
@@ -243,21 +188,7 @@ Refer to the [Tokenizer Python docs](api/python/pyspark.ml.html#pyspark.ml.featu
 the the [RegexTokenizer Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.RegexTokenizer)
 for more details on the API.
 
-{% highlight python %}
-from pyspark.ml.feature import Tokenizer, RegexTokenizer
-
-sentenceDataFrame = sqlContext.createDataFrame([
-  (0, "Hi I heard about Spark"),
-  (1, "I wish Java could use case classes"),
-  (2, "Logistic,regression,models,are,neat")
-], ["label", "sentence"])
-tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
-wordsDataFrame = tokenizer.transform(sentenceDataFrame)
-for words_label in wordsDataFrame.select("words", "label").take(3):
-  print(words_label)
-regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
-# alternatively, pattern="\\w+", gaps(False)
-{% endhighlight %}
+{% include_example python/ml/tokenizer_example.py %}
 </div>
 </div>
 
@@ -306,19 +237,7 @@ filtered out.
 Refer to the [StopWordsRemover Scala docs](api/scala/index.html#org.apache.spark.ml.feature.StopWordsRemover)
 for more details on the API.
 
-{% highlight scala %}
-import org.apache.spark.ml.feature.StopWordsRemover
-
-val remover = new StopWordsRemover()
-  .setInputCol("raw")
-  .setOutputCol("filtered")
-val dataSet = sqlContext.createDataFrame(Seq(
-  (0, Seq("I", "saw", "the", "red", "baloon")),
-  (1, Seq("Mary", "had", "a", "little", "lamb"))
-)).toDF("id", "raw")
-
-remover.transform(dataSet).show()
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala %}
 </div>
 
 <div data-lang="java" markdown="1">
@@ -326,34 +245,7 @@ remover.transform(dataSet).show()
 Refer to the [StopWordsRemover Java docs](api/java/org/apache/spark/ml/feature/StopWordsRemover.html)
 for more details on the API.
 
-{% highlight java %}
-import java.util.Arrays;
-
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.ml.feature.StopWordsRemover;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.types.DataTypes;
-import org.apache.spark.sql.types.Metadata;
-import org.apache.spark.sql.types.StructField;
-import org.apache.spark.sql.types.StructType;
-
-StopWordsRemover remover = new StopWordsRemover()
-  .setInputCol("raw")
-  .setOutputCol("filtered");
-
-JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(
-  RowFactory.create(Arrays.asList("I", "saw", "the", "red", "baloon")),
-  RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb"))
-));
-StructType schema = new StructType(new StructField[] {
-  new StructField("raw", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty())
-});
-DataFrame dataset = jsql.createDataFrame(rdd, schema);
-
-remover.transform(dataset).show();
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java %}
 </div>
 
 <div data-lang="python" markdown="1">
@@ -361,17 +253,7 @@ remover.transform(dataset).show();
 Refer to the [StopWordsRemover Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.StopWordsRemover)
 for more details on the API.
 
-{% highlight python %}
-from pyspark.ml.feature import StopWordsRemover
-
-sentenceData = sqlContext.createDataFrame([
-  (0, ["I", "saw", "the", "red", "baloon"]),
-  (1, ["Mary", "had", "a", "little", "lamb"])
-], ["label", "raw"])
-
-remover = StopWordsRemover(inputCol="raw", outputCol="filtered")
-remover.transform(sentenceData).show(truncate=False)
-{% endhighlight %}
+{% include_example python/ml/stopwords_remover_example.py %}
 </div>
 </div>
 
@@ -388,19 +270,7 @@ An [n-gram](https://en.wikipedia.org/wiki/N-gram) is a sequence of $n$ tokens (t
 Refer to the [NGram Scala docs](api/scala/index.html#org.apache.spark.ml.feature.NGram)
 for more details on the API.
 
-{% highlight scala %}
-import org.apache.spark.ml.feature.NGram
-
-val wordDataFrame = sqlContext.createDataFrame(Seq(
-  (0, Array("Hi", "I", "heard", "about", "Spark")),
-  (1, Array("I", "wish", "Java", "could", "use", "case", "classes")),
-  (2, Array("Logistic", "regression", "models", "are", "neat"))
-)).toDF("label", "words")
-
-val ngram = new NGram().setInputCol("words").setOutputCol("ngrams")
-val ngramDataFrame = ngram.transform(wordDataFrame)
-ngramDataFrame.take(3).map(_.getAs[Stream[String]]("ngrams").toList).foreach(println)
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/NGramExample.scala %}
 </div>
 
 <div data-lang="java" markdown="1">
@@ -408,38 +278,7 @@ ngramDataFrame.take(3).map(_.getAs[Stream[String]]("ngrams").toList).foreach(pri
 Refer to the [NGram Java docs](api/java/org/apache/spark/ml/feature/NGram.html)
 for more details on the API.
 
-{% highlight java %}
-import java.util.Arrays;
-
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.ml.feature.NGram;
-import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.types.DataTypes;
-import org.apache.spark.sql.types.Metadata;
-import org.apache.spark.sql.types.StructField;
-import org.apache.spark.sql.types.StructType;
-
-JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
-  RowFactory.create(0.0, Arrays.asList("Hi", "I", "heard", "about", "Spark")),
-  RowFactory.create(1.0, Arrays.asList("I", "wish", "Java", "could", "use", "case", "classes")),
-  RowFactory.create(2.0, Arrays.asList("Logistic", "regression", "models", "are", "neat"))
-));
-StructType schema = new StructType(new StructField[]{
-  new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
-  new StructField("words", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty())
-});
-DataFrame wordDataFrame = sqlContext.createDataFrame(jrdd, schema);
-NGram ngramTransformer = new NGram().setInputCol("words").setOutputCol("ngrams");
-DataFrame ngramDataFrame = ngramTransformer.transform(wordDataFrame);
-for (Row r : ngramDataFrame.select("ngrams", "label").take(3)) {
-  java.util.List<String> ngrams = r.getList(0);
-  for (String ngram : ngrams) System.out.print(ngram + " --- ");
-  System.out.println();
-}
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaNGramExample.java %}
 </div>
 
 <div data-lang="python" markdown="1">
@@ -447,19 +286,7 @@ for (Row r : ngramDataFrame.select("ngrams", "label").take(3)) {
 Refer to the [NGram Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.NGram)
 for more details on the API.
 
-{% highlight python %}
-from pyspark.ml.feature import NGram
-
-wordDataFrame = sqlContext.createDataFrame([
-  (0, ["Hi", "I", "heard", "about", "Spark"]),
-  (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
-  (2, ["Logistic", "regression", "models", "are", "neat"])
-], ["label", "words"])
-ngram = NGram(inputCol="words", outputCol="ngrams")
-ngramDataFrame = ngram.transform(wordDataFrame)
-for ngrams_label in ngramDataFrame.select("ngrams", "label").take(3):
-  print(ngrams_label)
-{% endhighlight %}
+{% include_example python/ml/n_gram_example.py %}
 </div>
 </div>
 
@@ -476,26 +303,7 @@ Binarization is the process of thresholding numerical features to binary (0/1) f
 Refer to the [Binarizer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.Binarizer)
 for more details on the API.
 
-{% highlight scala %}
-import org.apache.spark.ml.feature.Binarizer
-import org.apache.spark.sql.DataFrame
-
-val data = Array(
-  (0, 0.1),
-  (1, 0.8),
-  (2, 0.2)
-)
-val dataFrame: DataFrame = sqlContext.createDataFrame(data).toDF("label", "feature")
-
-val binarizer: Binarizer = new Binarizer()
-  .setInputCol("feature")
-  .setOutputCol("binarized_feature")
-  .setThreshold(0.5)
-
-val binarizedDataFrame = binarizer.transform(dataFrame)
-val binarizedFeatures = binarizedDataFrame.select("binarized_feature")
-binarizedFeatures.collect().foreach(println)
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/BinarizerExample.scala %}
 </div>
 
 <div data-lang="java" markdown="1">
@@ -503,40 +311,7 @@ binarizedFeatures.collect().foreach(println)
 Refer to the [Binarizer Java docs](api/java/org/apache/spark/ml/feature/Binarizer.html)
 for more details on the API.
 
-{% highlight java %}
-import java.util.Arrays;
-
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.ml.feature.Binarizer;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.types.DataTypes;
-import org.apache.spark.sql.types.Metadata;
-import org.apache.spark.sql.types.StructField;
-import org.apache.spark.sql.types.StructType;
-
-JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
-  RowFactory.create(0, 0.1),
-  RowFactory.create(1, 0.8),
-  RowFactory.create(2, 0.2)
-));
-StructType schema = new StructType(new StructField[]{
-  new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
-  new StructField("feature", DataTypes.DoubleType, false, Metadata.empty())
-});
-DataFrame continuousDataFrame = jsql.createDataFrame(jrdd, schema);
-Binarizer binarizer = new Binarizer()
-  .setInputCol("feature")
-  .setOutputCol("binarized_feature")
-  .setThreshold(0.5);
-DataFrame binarizedDataFrame = binarizer.transform(continuousDataFrame);
-DataFrame binarizedFeatures = binarizedDataFrame.select("binarized_feature");
-for (Row r : binarizedFeatures.collect()) {
-  Double binarized_value = r.getDouble(0);
-  System.out.println(binarized_value);
-}
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaBinarizerExample.java %}
 </div>
 
 <div data-lang="python" markdown="1">
@@ -544,20 +319,7 @@ for (Row r : binarizedFeatures.collect()) {
 Refer to the [Binarizer Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.Binarizer)
 for more details on the API.
 
-{% highlight python %}
-from pyspark.ml.feature import Binarizer
-
-continuousDataFrame = sqlContext.createDataFrame([
-  (0, 0.1),
-  (1, 0.8),
-  (2, 0.2)
-], ["label", "feature"])
-binarizer = Binarizer(threshold=0.5, inputCol="feature", outputCol="binarized_feature")
-binarizedDataFrame = binarizer.transform(continuousDataFrame)
-binarizedFeatures = binarizedDataFrame.select("binarized_feature")
-for binarized_feature, in binarizedFeatures.collect():
-  print(binarized_feature)
-{% endhighlight %}
+{% include_example python/ml/binarizer_example.py %}
 </div>
 </div>
 
@@ -571,25 +333,7 @@ for binarized_feature, in binarizedFeatures.collect():
 Refer to the [PCA Scala docs](api/scala/index.html#org.apache.spark.ml.feature.PCA)
 for more details on the API.
 
-{% highlight scala %}
-import org.apache.spark.ml.feature.PCA
-import org.apache.spark.mllib.linalg.Vectors
-
-val data = Array(
-  Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
-  Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
-  Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
-)
-val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
-val pca = new PCA()
-  .setInputCol("features")
-  .setOutputCol("pcaFeatures")
-  .setK(3)
-  .fit(df)
-val pcaDF = pca.transform(df)
-val result = pcaDF.select("pcaFeatures")
-result.show()
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/PCAExample.scala %}
 </div>
 
 <div data-lang="java" markdown="1">
@@ -597,42 +341,7 @@ result.show()
 Refer to the [PCA Java docs](api/java/org/apache/spark/ml/feature/PCA.html)
 for more details on the API.
 
-{% highlight java %}
-import java.util.Arrays;
-
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.ml.feature.PCA
-import org.apache.spark.ml.feature.PCAModel
-import org.apache.spark.mllib.linalg.VectorUDT;
-import org.apache.spark.mllib.linalg.Vectors;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.SQLContext;
-import org.apache.spark.sql.types.Metadata;
-import org.apache.spark.sql.types.StructField;
-import org.apache.spark.sql.types.StructType;
-
-JavaSparkContext jsc = ...
-SQLContext jsql = ...
-JavaRDD<Row> data = jsc.parallelize(Arrays.asList(
-  RowFactory.create(Vectors.sparse(5, new int[]{1, 3}, new double[]{1.0, 7.0})),
-  RowFactory.create(Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0)),
-  RowFactory.create(Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0))
-));
-StructType schema = new StructType(new StructField[] {
-  new StructField("features", new VectorUDT(), false, Metadata.empty()),
-});
-DataFrame df = jsql.createDataFrame(data, schema);
-PCAModel pca = new PCA()
-  .setInputCol("features")
-  .setOutputCol("pcaFeatures")
-  .setK(3)
-  .fit(df);
-DataFrame result = pca.transform(df).select("pcaFeatures");
-result.show();
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaPCAExample.java %}
 </div>
 
 <div data-lang="python" markdown="1">
@@ -640,19 +349,7 @@ result.show();
 Refer to the [PCA Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.PCA)
 for more details on the API.
 
-{% highlight python %}
-from pyspark.ml.feature import PCA
-from pyspark.mllib.linalg import Vectors
-
-data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
-  (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
-  (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]
-df = sqlContext.createDataFrame(data,["features"])
-pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
-model = pca.fit(df)
-result = model.transform(df).select("pcaFeatures")
-result.show(truncate=False)
-{% endhighlight %}
+{% include_example python/ml/pca_example.py %}
 </div>
 </div>
 
@@ -666,23 +363,7 @@ result.show(truncate=False)
 Refer to the [PolynomialExpansion Scala docs](api/scala/index.html#org.apache.spark.ml.feature.PolynomialExpansion)
 for more details on the API.
 
-{% highlight scala %}
-import org.apache.spark.ml.feature.PolynomialExpansion
-import org.apache.spark.mllib.linalg.Vectors
-
-val data = Array(
-  Vectors.dense(-2.0, 2.3),
-  Vectors.dense(0.0, 0.0),
-  Vectors.dense(0.6, -1.1)
-)
-val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
-val polynomialExpansion = new PolynomialExpansion()
-  .setInputCol("features")
-  .setOutputCol("polyFeatures")
-  .setDegree(3)
-val polyDF = polynomialExpansion.transform(df)
-polyDF.select("polyFeatures").take(3).foreach(println)
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala %}
 </div>
 
 <div data-lang="java" markdown="1">
@@ -690,43 +371,7 @@ polyDF.select("polyFeatures").take(3).foreach(println)
 Refer to the [PolynomialExpansion Java docs](api/java/org/apache/spark/ml/feature/PolynomialExpansion.html)
 for more details on the API.
 
-{% highlight java %}
-import java.util.Arrays;
-
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.mllib.linalg.VectorUDT;
-import org.apache.spark.mllib.linalg.Vectors;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.SQLContext;
-import org.apache.spark.sql.types.Metadata;
-import org.apache.spark.sql.types.StructField;
-import org.apache.spark.sql.types.StructType;
-
-JavaSparkContext jsc = ...
-SQLContext jsql = ...
-PolynomialExpansion polyExpansion = new PolynomialExpansion()
-  .setInputCol("features")
-  .setOutputCol("polyFeatures")
-  .setDegree(3);
-JavaRDD<Row> data = jsc.parallelize(Arrays.asList(
-  RowFactory.create(Vectors.dense(-2.0, 2.3)),
-  RowFactory.create(Vectors.dense(0.0, 0.0)),
-  RowFactory.create(Vectors.dense(0.6, -1.1))
-));
-StructType schema = new StructType(new StructField[] {
-  new StructField("features", new VectorUDT(), false, Metadata.empty()),
-});
-DataFrame df = jsql.createDataFrame(data, schema);
-DataFrame polyDF = polyExpansion.transform(df);
-Row[] row = polyDF.select("polyFeatures").take(3);
-for (Row r : row) {
-  System.out.println(r.get(0));
-}
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java %}
 </div>
 
 <div data-lang="python" markdown="1">
@@ -734,20 +379,7 @@ for (Row r : row) {
 Refer to the [PolynomialExpansion Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.PolynomialExpansion)
 for more details on the API.
 
-{% highlight python %}
-from pyspark.ml.feature import PolynomialExpansion
-from pyspark.mllib.linalg import Vectors
-
-df = sqlContext.createDataFrame(
-  [(Vectors.dense([-2.0, 2.3]), ),
-  (Vectors.dense([0.0, 0.0]), ),
-  (Vectors.dense([0.6, -1.1]), )],
-  ["features"])
-px = PolynomialExpansion(degree=2, inputCol="features", outputCol="polyFeatures")
-polyDF = px.transform(df)
-for expanded in polyDF.select("polyFeatures").take(3):
-  print(expanded)
-{% endhighlight %}
+{% include_example python/ml/polynomial_expansion_example.py %}
 </div>
 </div>
 
@@ -771,22 +403,7 @@ $0$th DCT coefficient and _not_ the $N/2$th).
 Refer to the [DCT Scala docs](api/scala/index.html#org.apache.spark.ml.feature.DCT)
 for more details on the API.
 
-{% highlight scala %}
-import org.apache.spark.ml.feature.DCT
-import org.apache.spark.mllib.linalg.Vectors
-
-val data = Seq(
-  Vectors.dense(0.0, 1.0, -2.0, 3.0),
-  Vectors.dense(-1.0, 2.0, 4.0, -7.0),
-  Vectors.dense(14.0, -2.0, -5.0, 1.0))
-val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
-val dct = new DCT()
-  .setInputCol("features")
-  .setOutputCol("featuresDCT")
-  .setInverse(false)
-val dctDf = dct.transform(df)
-dctDf.select("featuresDCT").show(3)
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/DCTExample.scala %}
 </div>
 
 <div data-lang="java" markdown="1">
@@ -794,39 +411,7 @@ dctDf.select("featuresDCT").show(3)
 Refer to the [DCT Java docs](api/java/org/apache/spark/ml/feature/DCT.html)
 for more details on the API.
 
-{% highlight java %}
-import java.util.Arrays;
-
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.ml.feature.DCT;
-import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.mllib.linalg.VectorUDT;
-import org.apache.spark.mllib.linalg.Vectors;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.SQLContext;
-import org.apache.spark.sql.types.Metadata;
-import org.apache.spark.sql.types.StructField;
-import org.apache.spark.sql.types.StructType;
-
-JavaRDD<Row> data = jsc.parallelize(Arrays.asList(
-  RowFactory.create(Vectors.dense(0.0, 1.0, -2.0, 3.0)),
-  RowFactory.create(Vectors.dense(-1.0, 2.0, 4.0, -7.0)),
-  RowFactory.create(Vectors.dense(14.0, -2.0, -5.0, 1.0))
-));
-StructType schema = new StructType(new StructField[] {
-  new StructField("features", new VectorUDT(), false, Metadata.empty()),
-});
-DataFrame df = jsql.createDataFrame(data, schema);
-DCT dct = new DCT()
-  .setInputCol("features")
-  .setOutputCol("featuresDCT")
-  .setInverse(false);
-DataFrame dctDf = dct.transform(df);
-dctDf.select("featuresDCT").show(3);
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaDCTExample.java %}
 </div>
 </div>
 
@@ -881,18 +466,7 @@ index `2`.
 Refer to the [StringIndexer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.StringIndexer)
 for more details on the API.
 
-{% highlight scala %}
-import org.apache.spark.ml.feature.StringIndexer
-
-val df = sqlContext.createDataFrame(
-  Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c"))
-).toDF("id", "category")
-val indexer = new StringIndexer()
-  .setInputCol("category")
-  .setOutputCol("categoryIndex")
-val indexed = indexer.fit(df).transform(df)
-indexed.show()
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/StringIndexerExample.scala %}
 </div>
 
 <div data-lang="java" markdown="1">
@@ -900,37 +474,7 @@ indexed.show()
 Refer to the [StringIndexer Java docs](api/java/org/apache/spark/ml/feature/StringIndexer.html)
 for more details on the API.
 
-{% highlight java %}
-import java.util.Arrays;
-
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.ml.feature.StringIndexer;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.types.StructField;
-import org.apache.spark.sql.types.StructType;
-import static org.apache.spark.sql.types.DataTypes.*;
-
-JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
-  RowFactory.create(0, "a"),
-  RowFactory.create(1, "b"),
-  RowFactory.create(2, "c"),
-  RowFactory.create(3, "a"),
-  RowFactory.create(4, "a"),
-  RowFactory.create(5, "c")
-));
-StructType schema = new StructType(new StructField[] {
-  createStructField("id", DoubleType, false),
-  createStructField("category", StringType, false)
-});
-DataFrame df = sqlContext.createDataFrame(jrdd, schema);
-StringIndexer indexer = new StringIndexer()
-  .setInputCol("category")
-  .setOutputCol("categoryIndex");
-DataFrame indexed = indexer.fit(df).transform(df);
-indexed.show();
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaStringIndexerExample.java %}
 </div>
 
 <div data-lang="python" markdown="1">
@@ -938,16 +482,7 @@ indexed.show();
 Refer to the [StringIndexer Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.StringIndexer)
 for more details on the API.
 
-{% highlight python %}
-from pyspark.ml.feature import StringIndexer
-
-df = sqlContext.createDataFrame(
-    [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
-    ["id", "category"])
-indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
-indexed = indexer.fit(df).transform(df)
-indexed.show()
-{% endhighlight %}
+{% include_example python/ml/string_indexer_example.py %}
 </div>
 </div>
 
@@ -1030,30 +565,7 @@ for more details on the API.
 Refer to the [OneHotEncoder Scala docs](api/scala/index.html#org.apache.spark.ml.feature.OneHotEncoder)
 for more details on the API.
 
-{% highlight scala %}
-import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer}
-
-val df = sqlContext.createDataFrame(Seq(
-  (0, "a"),
-  (1, "b"),
-  (2, "c"),
-  (3, "a"),
-  (4, "a"),
-  (5, "c")
-)).toDF("id", "category")
-
-val indexer = new StringIndexer()
-  .setInputCol("category")
-  .setOutputCol("categoryIndex")
-  .fit(df)
-val indexed = indexer.transform(df)
-
-val encoder = new OneHotEncoder()
-  .setInputCol("categoryIndex")
-  .setOutputCol("categoryVec")
-val encoded = encoder.transform(indexed)
-encoded.select("id", "categoryVec").show()
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala %}
 </div>
 
 <div data-lang="java" markdown="1">
@@ -1061,46 +573,7 @@ encoded.select("id", "categoryVec").show()
 Refer to the [OneHotEncoder Java docs](api/java/org/apache/spark/ml/feature/OneHotEncoder.html)
 for more details on the API.
 
-{% highlight java %}
-import java.util.Arrays;
-
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.ml.feature.OneHotEncoder;
-import org.apache.spark.ml.feature.StringIndexer;
-import org.apache.spark.ml.feature.StringIndexerModel;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.types.DataTypes;
-import org.apache.spark.sql.types.Metadata;
-import org.apache.spark.sql.types.StructField;
-import org.apache.spark.sql.types.StructType;
-
-JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
-    RowFactory.create(0, "a"),
-    RowFactory.create(1, "b"),
-    RowFactory.create(2, "c"),
-    RowFactory.create(3, "a"),
-    RowFactory.create(4, "a"),
-    RowFactory.create(5, "c")
-));
-StructType schema = new StructType(new StructField[]{
-    new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
-    new StructField("category", DataTypes.StringType, false, Metadata.empty())
-});
-DataFrame df = sqlContext.createDataFrame(jrdd, schema);
-StringIndexerModel indexer = new StringIndexer()
-  .setInputCol("category")
-  .setOutputCol("categoryIndex")
-  .fit(df);
-DataFrame indexed = indexer.transform(df);
-
-OneHotEncoder encoder = new OneHotEncoder()
-  .setInputCol("categoryIndex")
-  .setOutputCol("categoryVec");
-DataFrame encoded = encoder.transform(indexed);
-encoded.select("id", "categoryVec").show();
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java %}
 </div>
 
 <div data-lang="python" markdown="1">
@@ -1108,25 +581,7 @@ encoded.select("id", "categoryVec").show();
 Refer to the [OneHotEncoder Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.OneHotEncoder)
 for more details on the API.
 
-{% highlight python %}
-from pyspark.ml.feature import OneHotEncoder, StringIndexer
-
-df = sqlContext.createDataFrame([
-  (0, "a"),
-  (1, "b"),
-  (2, "c"),
-  (3, "a"),
-  (4, "a"),
-  (5, "c")
-], ["id", "category"])
-
-stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
-model = stringIndexer.fit(df)
-indexed = model.transform(df)
-encoder = OneHotEncoder(includeFirst=False, inputCol="categoryIndex", outputCol="categoryVec")
-encoded = encoder.transform(indexed)
-encoded.select("id", "categoryVec").show()
-{% endhighlight %}
+{% include_example python/ml/onehot_encoder_example.py %}
 </div>
 </div>
 
@@ -1150,23 +605,7 @@ In the example below, we read in a dataset of labeled points and then use `Vecto
 Refer to the [VectorIndexer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.VectorIndexer)
 for more details on the API.
 
-{% highlight scala %}
-import org.apache.spark.ml.feature.VectorIndexer
-
-val data = sqlContext.read.format("libsvm")
-  .load("data/mllib/sample_libsvm_data.txt")
-val indexer = new VectorIndexer()
-  .setInputCol("features")
-  .setOutputCol("indexed")
-  .setMaxCategories(10)
-val indexerModel = indexer.fit(data)
-val categoricalFeatures: Set[Int] = indexerModel.categoryMaps.keys.toSet
-println(s"Chose ${categoricalFeatures.size} categorical features: " +
-  categoricalFeatures.mkString(", "))
-
-// Create new column "indexed" with categorical values transformed to indices
-val indexedData = indexerModel.transform(data)
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/VectorIndexerExample.scala %}
 </div>
 
 <div data-lang="java" markdown="1">
@@ -1174,30 +613,7 @@ val indexedData = indexerModel.transform(data)
 Refer to the [VectorIndexer Java docs](api/java/org/apache/spark/ml/feature/VectorIndexer.html)
 for more details on the API.
 
-{% highlight java %}
-import java.util.Map;
-
-import org.apache.spark.ml.feature.VectorIndexer;
-import org.apache.spark.ml.feature.VectorIndexerModel;
-import org.apache.spark.sql.DataFrame;
-
-DataFrame data = sqlContext.read().format("libsvm")
-  .load("data/mllib/sample_libsvm_data.txt");
-VectorIndexer indexer = new VectorIndexer()
-  .setInputCol("features")
-  .setOutputCol("indexed")
-  .setMaxCategories(10);
-VectorIndexerModel indexerModel = indexer.fit(data);
-Map<Integer, Map<Double, Integer>> categoryMaps = indexerModel.javaCategoryMaps();
-System.out.print("Chose " + categoryMaps.size() + "categorical features:");
-for (Integer feature : categoryMaps.keySet()) {
-  System.out.print(" " + feature);
-}
-System.out.println();
-
-// Create new column "indexed" with categorical values transformed to indices
-DataFrame indexedData = indexerModel.transform(data);
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaVectorIndexerExample.java %}
 </div>
 
 <div data-lang="python" markdown="1">
@@ -1205,17 +621,7 @@ DataFrame indexedData = indexerModel.transform(data);
 Refer to the [VectorIndexer Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.VectorIndexer)
 for more details on the API.
 
-{% highlight python %}
-from pyspark.ml.feature import VectorIndexer
-
-data = sqlContext.read.format("libsvm")
-  .load("data/mllib/sample_libsvm_data.txt")
-indexer = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=10)
-indexerModel = indexer.fit(data)
-
-# Create new column "indexed" with categorical values transformed to indices
-indexedData = indexerModel.transform(data)
-{% endhighlight %}
+{% include_example python/ml/vector_indexer_example.py %}
 </div>
 </div>
 
@@ -1232,22 +638,7 @@ The following example demonstrates how to load a dataset in libsvm format and th
 Refer to the [Normalizer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.Normalizer)
 for more details on the API.
 
-{% highlight scala %}
-import org.apache.spark.ml.feature.Normalizer
-
-val dataFrame = sqlContext.read.format("libsvm")
-  .load("data/mllib/sample_libsvm_data.txt")
-
-// Normalize each Vector using $L^1$ norm.
-val normalizer = new Normalizer()
-  .setInputCol("features")
-  .setOutputCol("normFeatures")
-  .setP(1.0)
-val l1NormData = normalizer.transform(dataFrame)
-
-// Normalize each Vector using $L^\infty$ norm.
-val lInfNormData = normalizer.transform(dataFrame, normalizer.p -> Double.PositiveInfinity)
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/NormalizerExample.scala %}
 </div>
 
 <div data-lang="java">
@@ -1255,24 +646,7 @@ val lInfNormData = normalizer.transform(dataFrame, normalizer.p -> Double.Positi
 Refer to the [Normalizer Java docs](api/java/org/apache/spark/ml/feature/Normalizer.html)
 for more details on the API.
 
-{% highlight java %}
-import org.apache.spark.ml.feature.Normalizer;
-import org.apache.spark.sql.DataFrame;
-
-DataFrame dataFrame = sqlContext.read().format("libsvm")
-  .load("data/mllib/sample_libsvm_data.txt");
-
-// Normalize each Vector using $L^1$ norm.
-Normalizer normalizer = new Normalizer()
-  .setInputCol("features")
-  .setOutputCol("normFeatures")
-  .setP(1.0);
-DataFrame l1NormData = normalizer.transform(dataFrame);
-
-// Normalize each Vector using $L^\infty$ norm.
-DataFrame lInfNormData =
-  normalizer.transform(dataFrame, normalizer.p().w(Double.POSITIVE_INFINITY));
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaNormalizerExample.java %}
 </div>
 
 <div data-lang="python">
@@ -1280,19 +654,7 @@ DataFrame lInfNormData =
 Refer to the [Normalizer Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.Normalizer)
 for more details on the API.
 
-{% highlight python %}
-from pyspark.ml.feature import Normalizer
-
-dataFrame = sqlContext.read.format("libsvm")
-  .load("data/mllib/sample_libsvm_data.txt")
-
-# Normalize each Vector using $L^1$ norm.
-normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
-l1NormData = normalizer.transform(dataFrame)
-
-# Normalize each Vector using $L^\infty$ norm.
-lInfNormData = normalizer.transform(dataFrame, {normalizer.p: float("inf")})
-{% endhighlight %}
+{% include_example python/ml/normalizer_example.py %}
 </div>
 </div>
 
@@ -1316,23 +678,7 @@ The following example demonstrates how to load a dataset in libsvm format and th
 Refer to the [StandardScaler Scala docs](api/scala/index.html#org.apache.spark.ml.feature.StandardScaler)
 for more details on the API.
 
-{% highlight scala %}
-import org.apache.spark.ml.feature.StandardScaler
-
-val dataFrame = sqlContext.read.format("libsvm")
-  .load("data/mllib/sample_libsvm_data.txt")
-val scaler = new StandardScaler()
-  .setInputCol("features")
-  .setOutputCol("scaledFeatures")
-  .setWithStd(true)
-  .setWithMean(false)
-
-// Compute summary statistics by fitting the StandardScaler
-val scalerModel = scaler.fit(dataFrame)
-
-// Normalize each feature to have unit standard deviation.
-val scaledData = scalerModel.transform(dataFrame)
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/StandardScalerExample.scala %}
 </div>
 
 <div data-lang="java">
@@ -1340,25 +686,7 @@ val scaledData = scalerModel.transform(dataFrame)
 Refer to the [StandardScaler Java docs](api/java/org/apache/spark/ml/feature/StandardScaler.html)
 for more details on the API.
 
-{% highlight java %}
-import org.apache.spark.ml.feature.StandardScaler;
-import org.apache.spark.ml.feature.StandardScalerModel;
-import org.apache.spark.sql.DataFrame;
-
-DataFrame dataFrame = sqlContext.read().format("libsvm")
-  .load("data/mllib/sample_libsvm_data.txt");
-StandardScaler scaler = new StandardScaler()
-  .setInputCol("features")
-  .setOutputCol("scaledFeatures")
-  .setWithStd(true)
-  .setWithMean(false);
-
-// Compute summary statistics by fitting the StandardScaler
-StandardScalerModel scalerModel = scaler.fit(dataFrame);
-
-// Normalize each feature to have unit standard deviation.
-DataFrame scaledData = scalerModel.transform(dataFrame);
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaStandardScalerExample.java %}
 </div>
 
 <div data-lang="python">
@@ -1366,20 +694,7 @@ DataFrame scaledData = scalerModel.transform(dataFrame);
 Refer to the [StandardScaler Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.StandardScaler)
 for more details on the API.
 
-{% highlight python %}
-from pyspark.ml.feature import StandardScaler
-
-dataFrame = sqlContext.read.format("libsvm")
-  .load("data/mllib/sample_libsvm_data.txt")
-scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
-                        withStd=True, withMean=False)
-
-# Compute summary statistics by fitting the StandardScaler
-scalerModel = scaler.fit(dataFrame)
-
-# Normalize each feature to have unit standard deviation.
-scaledData = scalerModel.transform(dataFrame)
-{% endhighlight %}
+{% include_example python/ml/standard_scaler_example.py %}
 </div>
 </div>
 
@@ -1409,21 +724,7 @@ Refer to the [MinMaxScaler Scala docs](api/scala/index.html#org.apache.spark.ml.
 and the [MinMaxScalerModel Scala docs](api/scala/index.html#org.apache.spark.ml.feature.MinMaxScalerModel)
 for more details on the API.
 
-{% highlight scala %}
-import org.apache.spark.ml.feature.MinMaxScaler
-
-val dataFrame = sqlContext.read.format("libsvm")
-  .load("data/mllib/sample_libsvm_data.txt")
-val scaler = new MinMaxScaler()
-  .setInputCol("features")
-  .setOutputCol("scaledFeatures")
-
-// Compute summary statistics and generate MinMaxScalerModel
-val scalerModel = scaler.fit(dataFrame)
-
-// rescale each feature to range [min, max].
-val scaledData = scalerModel.transform(dataFrame)
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala %}
 </div>
 
 <div data-lang="java" markdown="1">
@@ -1432,24 +733,7 @@ Refer to the [MinMaxScaler Java docs](api/java/org/apache/spark/ml/feature/MinMa
 and the [MinMaxScalerModel Java docs](api/java/org/apache/spark/ml/feature/MinMaxScalerModel.html)
 for more details on the API.
 
-{% highlight java %}
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.ml.feature.MinMaxScaler;
-import org.apache.spark.ml.feature.MinMaxScalerModel;
-import org.apache.spark.sql.DataFrame;
-
-DataFrame dataFrame = sqlContext.read().format("libsvm")
-  .load("data/mllib/sample_libsvm_data.txt");
-MinMaxScaler scaler = new MinMaxScaler()
-  .setInputCol("features")
-  .setOutputCol("scaledFeatures");
-
-// Compute summary statistics and generate MinMaxScalerModel
-MinMaxScalerModel scalerModel = scaler.fit(dataFrame);
-
-// rescale each feature to range [min, max].
-DataFrame scaledData = scalerModel.transform(dataFrame);
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java %}
 </div>
 </div>
 
@@ -1473,23 +757,7 @@ The following example demonstrates how to bucketize a column of `Double`s into a
 Refer to the [Bucketizer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.Bucketizer)
 for more details on the API.
 
-{% highlight scala %}
-import org.apache.spark.ml.feature.Bucketizer
-import org.apache.spark.sql.DataFrame
-
-val splits = Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity)
-
-val data = Array(-0.5, -0.3, 0.0, 0.2)
-val dataFrame = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
-
-val bucketizer = new Bucketizer()
-  .setInputCol("features")
-  .setOutputCol("bucketedFeatures")
-  .setSplits(splits)
-
-// Transform original data into its bucket index.
-val bucketedData = bucketizer.transform(dataFrame)
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/BucketizerExample.scala %}
 </div>
 
 <div data-lang="java">
@@ -1497,38 +765,7 @@ val bucketedData = bucketizer.transform(dataFrame)
 Refer to the [Bucketizer Java docs](api/java/org/apache/spark/ml/feature/Bucketizer.html)
 for more details on the API.
 
-{% highlight java %}
-import java.util.Arrays;
-
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.types.DataTypes;
-import org.apache.spark.sql.types.Metadata;
-import org.apache.spark.sql.types.StructField;
-import org.apache.spark.sql.types.StructType;
-
-double[] splits = {Double.NEGATIVE_INFINITY, -0.5, 0.0, 0.5, Double.POSITIVE_INFINITY};
-
-JavaRDD<Row> data = jsc.parallelize(Arrays.asList(
-  RowFactory.create(-0.5),
-  RowFactory.create(-0.3),
-  RowFactory.create(0.0),
-  RowFactory.create(0.2)
-));
-StructType schema = new StructType(new StructField[] {
-  new StructField("features", DataTypes.DoubleType, false, Metadata.empty())
-});
-DataFrame dataFrame = jsql.createDataFrame(data, schema);
-
-Bucketizer bucketizer = new Bucketizer()
-  .setInputCol("features")
-  .setOutputCol("bucketedFeatures")
-  .setSplits(splits);
-
-// Transform original data into its bucket index.
-DataFrame bucketedData = bucketizer.transform(dataFrame);
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaBucketizerExample.java %}
 </div>
 
 <div data-lang="python">
@@ -1536,19 +773,7 @@ DataFrame bucketedData = bucketizer.transform(dataFrame);
 Refer to the [Bucketizer Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.Bucketizer)
 for more details on the API.
 
-{% highlight python %}
-from pyspark.ml.feature import Bucketizer
-
-splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]
-
-data = [(-0.5,), (-0.3,), (0.0,), (0.2,)]
-dataFrame = sqlContext.createDataFrame(data, ["features"])
-
-bucketizer = Bucketizer(splits=splits, inputCol="features", outputCol="bucketedFeatures")
-
-# Transform original data into its bucket index.
-bucketedData = bucketizer.transform(dataFrame)
-{% endhighlight %}
+{% include_example python/ml/bucketizer_example.py %}
 </div>
 </div>
 
@@ -1580,25 +805,7 @@ This example below demonstrates how to transform vectors using a transforming ve
 Refer to the [ElementwiseProduct Scala docs](api/scala/index.html#org.apache.spark.ml.feature.ElementwiseProduct)
 for more details on the API.
 
-{% highlight scala %}
-import org.apache.spark.ml.feature.ElementwiseProduct
-import org.apache.spark.mllib.linalg.Vectors
-
-// Create some vector data; also works for sparse vectors
-val dataFrame = sqlContext.createDataFrame(Seq(
-  ("a", Vectors.dense(1.0, 2.0, 3.0)),
-  ("b", Vectors.dense(4.0, 5.0, 6.0)))).toDF("id", "vector")
-
-val transformingVector = Vectors.dense(0.0, 1.0, 2.0)
-val transformer = new ElementwiseProduct()
-  .setScalingVec(transformingVector)
-  .setInputCol("vector")
-  .setOutputCol("transformedVector")
-
-// Batch transform the vectors to create new column:
-transformer.transform(dataFrame).show()
-
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/ElementwiseProductExample.scala %}
 </div>
 
 <div data-lang="java" markdown="1">
@@ -1606,41 +813,7 @@ transformer.transform(dataFrame).show()
 Refer to the [ElementwiseProduct Java docs](api/java/org/apache/spark/ml/feature/ElementwiseProduct.html)
 for more details on the API.
 
-{% highlight java %}
-import java.util.Arrays;
-
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.ml.feature.ElementwiseProduct;
-import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.mllib.linalg.Vectors;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.SQLContext;
-import org.apache.spark.sql.types.DataTypes;
-import org.apache.spark.sql.types.Metadata;
-import org.apache.spark.sql.types.StructField;
-import org.apache.spark.sql.types.StructType;
-
-// Create some vector data; also works for sparse vectors
-JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
-  RowFactory.create("a", Vectors.dense(1.0, 2.0, 3.0)),
-  RowFactory.create("b", Vectors.dense(4.0, 5.0, 6.0))
-));
-List<StructField> fields = new ArrayList<StructField>(2);
-fields.add(DataTypes.createStructField("id", DataTypes.StringType, false));
-fields.add(DataTypes.createStructField("vector", DataTypes.StringType, false));
-StructType schema = DataTypes.createStructType(fields);
-DataFrame dataFrame = sqlContext.createDataFrame(jrdd, schema);
-Vector transformingVector = Vectors.dense(0.0, 1.0, 2.0);
-ElementwiseProduct transformer = new ElementwiseProduct()
-  .setScalingVec(transformingVector)
-  .setInputCol("vector")
-  .setOutputCol("transformedVector");
-// Batch transform the vectors to create new column:
-transformer.transform(dataFrame).show();
-
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaElementwiseProductExample.java %}
 </div>
 
 <div data-lang="python" markdown="1">
@@ -1648,19 +821,8 @@ transformer.transform(dataFrame).show();
 Refer to the [ElementwiseProduct Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.ElementwiseProduct)
 for more details on the API.
 
-{% highlight python %}
-from pyspark.ml.feature import ElementwiseProduct
-from pyspark.mllib.linalg import Vectors
-
-data = [(Vectors.dense([1.0, 2.0, 3.0]),), (Vectors.dense([4.0, 5.0, 6.0]),)]
-df = sqlContext.createDataFrame(data, ["vector"])
-transformer = ElementwiseProduct(scalingVec=Vectors.dense([0.0, 1.0, 2.0]),
-                                 inputCol="vector", outputCol="transformedVector")
-transformer.transform(df).show()
-
-{% endhighlight %}
+{% include_example python/ml/elementwise_product_example.py %}
 </div>
-
 </div>
 
 ## SQLTransformer
@@ -1763,19 +925,7 @@ output column to `features`, after transformation we should get the following Da
 Refer to the [VectorAssembler Scala docs](api/scala/index.html#org.apache.spark.ml.feature.VectorAssembler)
 for more details on the API.
 
-{% highlight scala %}
-import org.apache.spark.mllib.linalg.Vectors
-import org.apache.spark.ml.feature.VectorAssembler
-
-val dataset = sqlContext.createDataFrame(
-  Seq((0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0))
-).toDF("id", "hour", "mobile", "userFeatures", "clicked")
-val assembler = new VectorAssembler()
-  .setInputCols(Array("hour", "mobile", "userFeatures"))
-  .setOutputCol("features")
-val output = assembler.transform(dataset)
-println(output.select("features", "clicked").first())
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala %}
 </div>
 
 <div data-lang="java" markdown="1">
@@ -1783,36 +933,7 @@ println(output.select("features", "clicked").first())
 Refer to the [VectorAssembler Java docs](api/java/org/apache/spark/ml/feature/VectorAssembler.html)
 for more details on the API.
 
-{% highlight java %}
-import java.util.Arrays;
-
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.mllib.linalg.VectorUDT;
-import org.apache.spark.mllib.linalg.Vectors;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.types.*;
-import static org.apache.spark.sql.types.DataTypes.*;
-
-StructType schema = createStructType(new StructField[] {
-  createStructField("id", IntegerType, false),
-  createStructField("hour", IntegerType, false),
-  createStructField("mobile", DoubleType, false),
-  createStructField("userFeatures", new VectorUDT(), false),
-  createStructField("clicked", DoubleType, false)
-});
-Row row = RowFactory.create(0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0);
-JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(row));
-DataFrame dataset = sqlContext.createDataFrame(rdd, schema);
-
-VectorAssembler assembler = new VectorAssembler()
-  .setInputCols(new String[] {"hour", "mobile", "userFeatures"})
-  .setOutputCol("features");
-
-DataFrame output = assembler.transform(dataset);
-System.out.println(output.select("features", "clicked").first());
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java %}
 </div>
 
 <div data-lang="python" markdown="1">
@@ -1820,19 +941,7 @@ System.out.println(output.select("features", "clicked").first());
 Refer to the [VectorAssembler Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.VectorAssembler)
 for more details on the API.
 
-{% highlight python %}
-from pyspark.mllib.linalg import Vectors
-from pyspark.ml.feature import VectorAssembler
-
-dataset = sqlContext.createDataFrame(
-    [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)],
-    ["id", "hour", "mobile", "userFeatures", "clicked"])
-assembler = VectorAssembler(
-    inputCols=["hour", "mobile", "userFeatures"],
-    outputCol="features")
-output = assembler.transform(dataset)
-print(output.select("features", "clicked").first())
-{% endhighlight %}
+{% include_example python/ml/vector_assembler_example.py %}
 </div>
 </div>
 
@@ -1962,33 +1071,7 @@ Suppose also that we have a potential input attributes for the `userFeatures`, i
 Refer to the [VectorSlicer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.VectorSlicer)
 for more details on the API.
 
-{% highlight scala %}
-import org.apache.spark.mllib.linalg.Vectors
-import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
-import org.apache.spark.ml.feature.VectorSlicer
-import org.apache.spark.sql.types.StructType
-import org.apache.spark.sql.{DataFrame, Row, SQLContext}
-
-val data = Array(
-  Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))),
-  Vectors.dense(-2.0, 2.3, 0.0)
-)
-
-val defaultAttr = NumericAttribute.defaultAttr
-val attrs = Array("f1", "f2", "f3").map(defaultAttr.withName)
-val attrGroup = new AttributeGroup("userFeatures", attrs.asInstanceOf[Array[Attribute]])
-
-val dataRDD = sc.parallelize(data).map(Row.apply)
-val dataset = sqlContext.createDataFrame(dataRDD, StructType(attrGroup.toStructField()))
-
-val slicer = new VectorSlicer().setInputCol("userFeatures").setOutputCol("features")
-
-slicer.setIndices(1).setNames("f3")
-// or slicer.setIndices(Array(1, 2)), or slicer.setNames(Array("f2", "f3"))
-
-val output = slicer.transform(dataset)
-println(output.select("userFeatures", "features").first())
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/VectorSlicerExample.scala %}
 </div>
 
 <div data-lang="java" markdown="1">
@@ -1996,41 +1079,7 @@ println(output.select("userFeatures", "features").first())
 Refer to the [VectorSlicer Java docs](api/java/org/apache/spark/ml/feature/VectorSlicer.html)
 for more details on the API.
 
-{% highlight java %}
-import java.util.Arrays;
-
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.mllib.linalg.Vectors;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.types.*;
-import static org.apache.spark.sql.types.DataTypes.*;
-
-Attribute[] attrs = new Attribute[]{
-  NumericAttribute.defaultAttr().withName("f1"),
-  NumericAttribute.defaultAttr().withName("f2"),
-  NumericAttribute.defaultAttr().withName("f3")
-};
-AttributeGroup group = new AttributeGroup("userFeatures", attrs);
-
-JavaRDD<Row> jrdd = jsc.parallelize(Lists.newArrayList(
-  RowFactory.create(Vectors.sparse(3, new int[]{0, 1}, new double[]{-2.0, 2.3})),
-  RowFactory.create(Vectors.dense(-2.0, 2.3, 0.0))
-));
-
-DataFrame dataset = jsql.createDataFrame(jrdd, (new StructType()).add(group.toStructField()));
-
-VectorSlicer vectorSlicer = new VectorSlicer()
-  .setInputCol("userFeatures").setOutputCol("features");
-
-vectorSlicer.setIndices(new int[]{1}).setNames(new String[]{"f3"});
-// or slicer.setIndices(new int[]{1, 2}), or slicer.setNames(new String[]{"f2", "f3"})
-
-DataFrame output = vectorSlicer.transform(dataset);
-
-System.out.println(output.select("userFeatures", "features").first());
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java %}
 </div>
 </div>
 
@@ -2067,21 +1116,7 @@ id | country | hour | clicked | features         | label
 Refer to the [RFormula Scala docs](api/scala/index.html#org.apache.spark.ml.feature.RFormula)
 for more details on the API.
 
-{% highlight scala %}
-import org.apache.spark.ml.feature.RFormula
-
-val dataset = sqlContext.createDataFrame(Seq(
-  (7, "US", 18, 1.0),
-  (8, "CA", 12, 0.0),
-  (9, "NZ", 15, 0.0)
-)).toDF("id", "country", "hour", "clicked")
-val formula = new RFormula()
-  .setFormula("clicked ~ country + hour")
-  .setFeaturesCol("features")
-  .setLabelCol("label")
-val output = formula.fit(dataset).transform(dataset)
-output.select("features", "label").show()
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/RFormulaExample.scala %}
 </div>
 
 <div data-lang="java" markdown="1">
@@ -2089,38 +1124,7 @@ output.select("features", "label").show()
 Refer to the [RFormula Java docs](api/java/org/apache/spark/ml/feature/RFormula.html)
 for more details on the API.
 
-{% highlight java %}
-import java.util.Arrays;
-
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.ml.feature.RFormula;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.types.*;
-import static org.apache.spark.sql.types.DataTypes.*;
-
-StructType schema = createStructType(new StructField[] {
-  createStructField("id", IntegerType, false),
-  createStructField("country", StringType, false),
-  createStructField("hour", IntegerType, false),
-  createStructField("clicked", DoubleType, false)
-});
-JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(
-  RowFactory.create(7, "US", 18, 1.0),
-  RowFactory.create(8, "CA", 12, 0.0),
-  RowFactory.create(9, "NZ", 15, 0.0)
-));
-DataFrame dataset = sqlContext.createDataFrame(rdd, schema);
-
-RFormula formula = new RFormula()
-  .setFormula("clicked ~ country + hour")
-  .setFeaturesCol("features")
-  .setLabelCol("label");
-
-DataFrame output = formula.fit(dataset).transform(dataset);
-output.select("features", "label").show();
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaRFormulaExample.java %}
 </div>
 
 <div data-lang="python" markdown="1">
@@ -2128,21 +1132,7 @@ output.select("features", "label").show();
 Refer to the [RFormula Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.RFormula)
 for more details on the API.
 
-{% highlight python %}
-from pyspark.ml.feature import RFormula
-
-dataset = sqlContext.createDataFrame(
-    [(7, "US", 18, 1.0),
-     (8, "CA", 12, 0.0),
-     (9, "NZ", 15, 0.0)],
-    ["id", "country", "hour", "clicked"])
-formula = RFormula(
-    formula="clicked ~ country + hour",
-    featuresCol="features",
-    labelCol="label")
-output = formula.fit(dataset).transform(dataset)
-output.select("features", "label").show()
-{% endhighlight %}
+{% include_example python/ml/rformula_example.py %}
 </div>
 </div>
 
diff --git a/docs/ml-guide.md b/docs/ml-guide.md
index 6f35b30c3d4df..5c96c2b7d5cc9 100644
--- a/docs/ml-guide.md
+++ b/docs/ml-guide.md
@@ -1,8 +1,10 @@
 ---
 layout: global
-title: Spark ML Programming Guide
+title: "Overview: estimators, transformers and pipelines - spark.ml"
+displayTitle: "Overview: estimators, transformers and pipelines - spark.ml"
 ---
 
+
 `\[
 \newcommand{\R}{\mathbb{R}}
 \newcommand{\E}{\mathbb{E}}
@@ -32,21 +34,6 @@ See the [algorithm guides](#algorithm-guides) section below for guides on sub-pa
 * This will become a table of contents (this text will be scraped).
 {:toc}
 
-# Algorithm guides
-
-We provide several algorithm guides specific to the Pipelines API.
-Several of these algorithms, such as certain feature transformers, are not in the `spark.mllib` API.
-Also, some algorithms have additional capabilities in the `spark.ml` API; e.g., random forests
-provide class probabilities, and linear models provide model summaries.
-
-* [Feature extraction, transformation, and selection](ml-features.html)
-* [Clustering](ml-clustering.html)
-* [Decision Trees for classification and regression](ml-decision-tree.html)
-* [Ensembles](ml-ensembles.html)
-* [Linear methods with elastic net regularization](ml-linear-methods.html)
-* [Multilayer perceptron classifier](ml-ann.html)
-* [Survival Regression](ml-survival-regression.html)
-
 
 # Main concepts in Pipelines
 
diff --git a/docs/ml-intro.md b/docs/ml-intro.md
deleted file mode 100644
index d95a66ba23566..0000000000000
--- a/docs/ml-intro.md
+++ /dev/null
@@ -1,941 +0,0 @@
----
-layout: global
-title: "Overview: estimators, transformers and pipelines - spark.ml"
-displayTitle: "Overview: estimators, transformers and pipelines"
----
-
-
-`\[
-\newcommand{\R}{\mathbb{R}}
-\newcommand{\E}{\mathbb{E}}
-\newcommand{\x}{\mathbf{x}}
-\newcommand{\y}{\mathbf{y}}
-\newcommand{\wv}{\mathbf{w}}
-\newcommand{\av}{\mathbf{\alpha}}
-\newcommand{\bv}{\mathbf{b}}
-\newcommand{\N}{\mathbb{N}}
-\newcommand{\id}{\mathbf{I}}
-\newcommand{\ind}{\mathbf{1}}
-\newcommand{\0}{\mathbf{0}}
-\newcommand{\unit}{\mathbf{e}}
-\newcommand{\one}{\mathbf{1}}
-\newcommand{\zero}{\mathbf{0}}
-\]`
-
-
-The `spark.ml` package aims to provide a uniform set of high-level APIs built on top of
-[DataFrames](sql-programming-guide.html#dataframes) that help users create and tune practical
-machine learning pipelines.
-See the [algorithm guides](#algorithm-guides) section below for guides on sub-packages of
-`spark.ml`, including feature transformers unique to the Pipelines API, ensembles, and more.
-
-**Table of contents**
-
-* This will become a table of contents (this text will be scraped).
-{:toc}
-
-
-# Main concepts in Pipelines
-
-Spark ML standardizes APIs for machine learning algorithms to make it easier to combine multiple
-algorithms into a single pipeline, or workflow.
-This section covers the key concepts introduced by the Spark ML API, where the pipeline concept is
-mostly inspired by the [scikit-learn](http://scikit-learn.org/) project.
-
-* **[`DataFrame`](ml-guide.html#dataframe)**: Spark ML uses `DataFrame` from Spark SQL as an ML
-  dataset, which can hold a variety of data types.
-  E.g., a `DataFrame` could have different columns storing text, feature vectors, true labels, and predictions.
-
-* **[`Transformer`](ml-guide.html#transformers)**: A `Transformer` is an algorithm which can transform one `DataFrame` into another `DataFrame`.
-E.g., an ML model is a `Transformer` which transforms `DataFrame` with features into a `DataFrame` with predictions.
-
-* **[`Estimator`](ml-guide.html#estimators)**: An `Estimator` is an algorithm which can be fit on a `DataFrame` to produce a `Transformer`.
-E.g., a learning algorithm is an `Estimator` which trains on a `DataFrame` and produces a model.
-
-* **[`Pipeline`](ml-guide.html#pipeline)**: A `Pipeline` chains multiple `Transformer`s and `Estimator`s together to specify an ML workflow.
-
-* **[`Parameter`](ml-guide.html#parameters)**: All `Transformer`s and `Estimator`s now share a common API for specifying parameters.
-
-## DataFrame
-
-Machine learning can be applied to a wide variety of data types, such as vectors, text, images, and structured data.
-Spark ML adopts the `DataFrame` from Spark SQL in order to support a variety of data types.
-
-`DataFrame` supports many basic and structured types; see the [Spark SQL datatype reference](sql-programming-guide.html#spark-sql-datatype-reference) for a list of supported types.
-In addition to the types listed in the Spark SQL guide, `DataFrame` can use ML [`Vector`](mllib-data-types.html#local-vector) types.
-
-A `DataFrame` can be created either implicitly or explicitly from a regular `RDD`.  See the code examples below and the [Spark SQL programming guide](sql-programming-guide.html) for examples.
-
-Columns in a `DataFrame` are named.  The code examples below use names such as "text," "features," and "label."
-
-## Pipeline components
-
-### Transformers
-
-A `Transformer` is an abstraction that includes feature transformers and learned models.
-Technically, a `Transformer` implements a method `transform()`, which converts one `DataFrame` into
-another, generally by appending one or more columns.
-For example:
-
-* A feature transformer might take a `DataFrame`, read a column (e.g., text), map it into a new
-  column (e.g., feature vectors), and output a new `DataFrame` with the mapped column appended.
-* A learning model might take a `DataFrame`, read the column containing feature vectors, predict the
-  label for each feature vector, and output a new `DataFrame` with predicted labels appended as a
-  column.
-
-### Estimators
-
-An `Estimator` abstracts the concept of a learning algorithm or any algorithm that fits or trains on
-data.
-Technically, an `Estimator` implements a method `fit()`, which accepts a `DataFrame` and produces a
-`Model`, which is a `Transformer`.
-For example, a learning algorithm such as `LogisticRegression` is an `Estimator`, and calling
-`fit()` trains a `LogisticRegressionModel`, which is a `Model` and hence a `Transformer`.
-
-### Properties of pipeline components
-
-`Transformer.transform()`s and `Estimator.fit()`s are both stateless.  In the future, stateful algorithms may be supported via alternative concepts.
-
-Each instance of a `Transformer` or `Estimator` has a unique ID, which is useful in specifying parameters (discussed below).
-
-## Pipeline
-
-In machine learning, it is common to run a sequence of algorithms to process and learn from data.
-E.g., a simple text document processing workflow might include several stages:
-
-* Split each document's text into words.
-* Convert each document's words into a numerical feature vector.
-* Learn a prediction model using the feature vectors and labels.
-
-Spark ML represents such a workflow as a `Pipeline`, which consists of a sequence of
-`PipelineStage`s (`Transformer`s and `Estimator`s) to be run in a specific order.
-We will use this simple workflow as a running example in this section.
-
-### How it works
-
-A `Pipeline` is specified as a sequence of stages, and each stage is either a `Transformer` or an `Estimator`.
-These stages are run in order, and the input `DataFrame` is transformed as it passes through each stage.
-For `Transformer` stages, the `transform()` method is called on the `DataFrame`.
-For `Estimator` stages, the `fit()` method is called to produce a `Transformer` (which becomes part of the `PipelineModel`, or fitted `Pipeline`), and that `Transformer`'s `transform()` method is called on the `DataFrame`.
-
-We illustrate this for the simple text document workflow.  The figure below is for the *training time* usage of a `Pipeline`.
-
-<p style="text-align: center;">
-  <img
-    src="img/ml-Pipeline.png"
-    title="Spark ML Pipeline Example"
-    alt="Spark ML Pipeline Example"
-    width="80%"
-  />
-</p>
-
-Above, the top row represents a `Pipeline` with three stages.
-The first two (`Tokenizer` and `HashingTF`) are `Transformer`s (blue), and the third (`LogisticRegression`) is an `Estimator` (red).
-The bottom row represents data flowing through the pipeline, where cylinders indicate `DataFrame`s.
-The `Pipeline.fit()` method is called on the original `DataFrame`, which has raw text documents and labels.
-The `Tokenizer.transform()` method splits the raw text documents into words, adding a new column with words to the `DataFrame`.
-The `HashingTF.transform()` method converts the words column into feature vectors, adding a new column with those vectors to the `DataFrame`.
-Now, since `LogisticRegression` is an `Estimator`, the `Pipeline` first calls `LogisticRegression.fit()` to produce a `LogisticRegressionModel`.
-If the `Pipeline` had more stages, it would call the `LogisticRegressionModel`'s `transform()`
-method on the `DataFrame` before passing the `DataFrame` to the next stage.
-
-A `Pipeline` is an `Estimator`.
-Thus, after a `Pipeline`'s `fit()` method runs, it produces a `PipelineModel`, which is a
-`Transformer`.
-This `PipelineModel` is used at *test time*; the figure below illustrates this usage.
-
-<p style="text-align: center;">
-  <img
-    src="img/ml-PipelineModel.png"
-    title="Spark ML PipelineModel Example"
-    alt="Spark ML PipelineModel Example"
-    width="80%"
-  />
-</p>
-
-In the figure above, the `PipelineModel` has the same number of stages as the original `Pipeline`, but all `Estimator`s in the original `Pipeline` have become `Transformer`s.
-When the `PipelineModel`'s `transform()` method is called on a test dataset, the data are passed
-through the fitted pipeline in order.
-Each stage's `transform()` method updates the dataset and passes it to the next stage.
-
-`Pipeline`s and `PipelineModel`s help to ensure that training and test data go through identical feature processing steps.
-
-### Details
-
-*DAG `Pipeline`s*: A `Pipeline`'s stages are specified as an ordered array.  The examples given here are all for linear `Pipeline`s, i.e., `Pipeline`s in which each stage uses data produced by the previous stage.  It is possible to create non-linear `Pipeline`s as long as the data flow graph forms a Directed Acyclic Graph (DAG).  This graph is currently specified implicitly based on the input and output column names of each stage (generally specified as parameters).  If the `Pipeline` forms a DAG, then the stages must be specified in topological order.
-
-*Runtime checking*: Since `Pipeline`s can operate on `DataFrame`s with varied types, they cannot use
-compile-time type checking.
-`Pipeline`s and `PipelineModel`s instead do runtime checking before actually running the `Pipeline`.
-This type checking is done using the `DataFrame` *schema*, a description of the data types of columns in the `DataFrame`.
-
-*Unique Pipeline stages*: A `Pipeline`'s stages should be unique instances.  E.g., the same instance
-`myHashingTF` should not be inserted into the `Pipeline` twice since `Pipeline` stages must have
-unique IDs.  However, different instances `myHashingTF1` and `myHashingTF2` (both of type `HashingTF`)
-can be put into the same `Pipeline` since different instances will be created with different IDs.
-
-## Parameters
-
-Spark ML `Estimator`s and `Transformer`s use a uniform API for specifying parameters.
-
-A `Param` is a named parameter with self-contained documentation.
-A `ParamMap` is a set of (parameter, value) pairs.
-
-There are two main ways to pass parameters to an algorithm:
-
-1. Set parameters for an instance.  E.g., if `lr` is an instance of `LogisticRegression`, one could
-   call `lr.setMaxIter(10)` to make `lr.fit()` use at most 10 iterations.
-   This API resembles the API used in `spark.mllib` package.
-2. Pass a `ParamMap` to `fit()` or `transform()`.  Any parameters in the `ParamMap` will override parameters previously specified via setter methods.
-
-Parameters belong to specific instances of `Estimator`s and `Transformer`s.
-For example, if we have two `LogisticRegression` instances `lr1` and `lr2`, then we can build a `ParamMap` with both `maxIter` parameters specified: `ParamMap(lr1.maxIter -> 10, lr2.maxIter -> 20)`.
-This is useful if there are two algorithms with the `maxIter` parameter in a `Pipeline`.
-
-# Code examples
-
-This section gives code examples illustrating the functionality discussed above.
-For more info, please refer to the API documentation
-([Scala](api/scala/index.html#org.apache.spark.ml.package),
-[Java](api/java/org/apache/spark/ml/package-summary.html),
-and [Python](api/python/pyspark.ml.html)).
-Some Spark ML algorithms are wrappers for `spark.mllib` algorithms, and the
-[MLlib programming guide](mllib-guide.html) has details on specific algorithms.
-
-## Example: Estimator, Transformer, and Param
-
-This example covers the concepts of `Estimator`, `Transformer`, and `Param`.
-
-<div class="codetabs">
-
-<div data-lang="scala">
-{% highlight scala %}
-import org.apache.spark.ml.classification.LogisticRegression
-import org.apache.spark.ml.param.ParamMap
-import org.apache.spark.mllib.linalg.{Vector, Vectors}
-import org.apache.spark.sql.Row
-
-// Prepare training data from a list of (label, features) tuples.
-val training = sqlContext.createDataFrame(Seq(
-  (1.0, Vectors.dense(0.0, 1.1, 0.1)),
-  (0.0, Vectors.dense(2.0, 1.0, -1.0)),
-  (0.0, Vectors.dense(2.0, 1.3, 1.0)),
-  (1.0, Vectors.dense(0.0, 1.2, -0.5))
-)).toDF("label", "features")
-
-// Create a LogisticRegression instance.  This instance is an Estimator.
-val lr = new LogisticRegression()
-// Print out the parameters, documentation, and any default values.
-println("LogisticRegression parameters:\n" + lr.explainParams() + "\n")
-
-// We may set parameters using setter methods.
-lr.setMaxIter(10)
-  .setRegParam(0.01)
-
-// Learn a LogisticRegression model.  This uses the parameters stored in lr.
-val model1 = lr.fit(training)
-// Since model1 is a Model (i.e., a Transformer produced by an Estimator),
-// we can view the parameters it used during fit().
-// This prints the parameter (name: value) pairs, where names are unique IDs for this
-// LogisticRegression instance.
-println("Model 1 was fit using parameters: " + model1.parent.extractParamMap)
-
-// We may alternatively specify parameters using a ParamMap,
-// which supports several methods for specifying parameters.
-val paramMap = ParamMap(lr.maxIter -> 20)
-  .put(lr.maxIter, 30) // Specify 1 Param.  This overwrites the original maxIter.
-  .put(lr.regParam -> 0.1, lr.threshold -> 0.55) // Specify multiple Params.
-
-// One can also combine ParamMaps.
-val paramMap2 = ParamMap(lr.probabilityCol -> "myProbability") // Change output column name
-val paramMapCombined = paramMap ++ paramMap2
-
-// Now learn a new model using the paramMapCombined parameters.
-// paramMapCombined overrides all parameters set earlier via lr.set* methods.
-val model2 = lr.fit(training, paramMapCombined)
-println("Model 2 was fit using parameters: " + model2.parent.extractParamMap)
-
-// Prepare test data.
-val test = sqlContext.createDataFrame(Seq(
-  (1.0, Vectors.dense(-1.0, 1.5, 1.3)),
-  (0.0, Vectors.dense(3.0, 2.0, -0.1)),
-  (1.0, Vectors.dense(0.0, 2.2, -1.5))
-)).toDF("label", "features")
-
-// Make predictions on test data using the Transformer.transform() method.
-// LogisticRegression.transform will only use the 'features' column.
-// Note that model2.transform() outputs a 'myProbability' column instead of the usual
-// 'probability' column since we renamed the lr.probabilityCol parameter previously.
-model2.transform(test)
-  .select("features", "label", "myProbability", "prediction")
-  .collect()
-  .foreach { case Row(features: Vector, label: Double, prob: Vector, prediction: Double) =>
-    println(s"($features, $label) -> prob=$prob, prediction=$prediction")
-  }
-
-{% endhighlight %}
-</div>
-
-<div data-lang="java">
-{% highlight java %}
-import java.util.Arrays;
-import java.util.List;
-
-import org.apache.spark.ml.classification.LogisticRegressionModel;
-import org.apache.spark.ml.param.ParamMap;
-import org.apache.spark.ml.classification.LogisticRegression;
-import org.apache.spark.mllib.linalg.Vectors;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-
-// Prepare training data.
-// We use LabeledPoint, which is a JavaBean.  Spark SQL can convert RDDs of JavaBeans
-// into DataFrames, where it uses the bean metadata to infer the schema.
-DataFrame training = sqlContext.createDataFrame(Arrays.asList(
-  new LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)),
-  new LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)),
-  new LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)),
-  new LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5))
-), LabeledPoint.class);
-
-// Create a LogisticRegression instance.  This instance is an Estimator.
-LogisticRegression lr = new LogisticRegression();
-// Print out the parameters, documentation, and any default values.
-System.out.println("LogisticRegression parameters:\n" + lr.explainParams() + "\n");
-
-// We may set parameters using setter methods.
-lr.setMaxIter(10)
-  .setRegParam(0.01);
-
-// Learn a LogisticRegression model.  This uses the parameters stored in lr.
-LogisticRegressionModel model1 = lr.fit(training);
-// Since model1 is a Model (i.e., a Transformer produced by an Estimator),
-// we can view the parameters it used during fit().
-// This prints the parameter (name: value) pairs, where names are unique IDs for this
-// LogisticRegression instance.
-System.out.println("Model 1 was fit using parameters: " + model1.parent().extractParamMap());
-
-// We may alternatively specify parameters using a ParamMap.
-ParamMap paramMap = new ParamMap()
-  .put(lr.maxIter().w(20)) // Specify 1 Param.
-  .put(lr.maxIter(), 30) // This overwrites the original maxIter.
-  .put(lr.regParam().w(0.1), lr.threshold().w(0.55)); // Specify multiple Params.
-
-// One can also combine ParamMaps.
-ParamMap paramMap2 = new ParamMap()
-  .put(lr.probabilityCol().w("myProbability")); // Change output column name
-ParamMap paramMapCombined = paramMap.$plus$plus(paramMap2);
-
-// Now learn a new model using the paramMapCombined parameters.
-// paramMapCombined overrides all parameters set earlier via lr.set* methods.
-LogisticRegressionModel model2 = lr.fit(training, paramMapCombined);
-System.out.println("Model 2 was fit using parameters: " + model2.parent().extractParamMap());
-
-// Prepare test documents.
-DataFrame test = sqlContext.createDataFrame(Arrays.asList(
-  new LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)),
-  new LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)),
-  new LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5))
-), LabeledPoint.class);
-
-// Make predictions on test documents using the Transformer.transform() method.
-// LogisticRegression.transform will only use the 'features' column.
-// Note that model2.transform() outputs a 'myProbability' column instead of the usual
-// 'probability' column since we renamed the lr.probabilityCol parameter previously.
-DataFrame results = model2.transform(test);
-for (Row r: results.select("features", "label", "myProbability", "prediction").collect()) {
-  System.out.println("(" + r.get(0) + ", " + r.get(1) + ") -> prob=" + r.get(2)
-      + ", prediction=" + r.get(3));
-}
-
-{% endhighlight %}
-</div>
-
-<div data-lang="python">
-{% highlight python %}
-from pyspark.mllib.linalg import Vectors
-from pyspark.ml.classification import LogisticRegression
-from pyspark.ml.param import Param, Params
-
-# Prepare training data from a list of (label, features) tuples.
-training = sqlContext.createDataFrame([
-    (1.0, Vectors.dense([0.0, 1.1, 0.1])),
-    (0.0, Vectors.dense([2.0, 1.0, -1.0])),
-    (0.0, Vectors.dense([2.0, 1.3, 1.0])),
-    (1.0, Vectors.dense([0.0, 1.2, -0.5]))], ["label", "features"])
-
-# Create a LogisticRegression instance. This instance is an Estimator.
-lr = LogisticRegression(maxIter=10, regParam=0.01)
-# Print out the parameters, documentation, and any default values.
-print "LogisticRegression parameters:\n" + lr.explainParams() + "\n"
-
-# Learn a LogisticRegression model. This uses the parameters stored in lr.
-model1 = lr.fit(training)
-
-# Since model1 is a Model (i.e., a transformer produced by an Estimator),
-# we can view the parameters it used during fit().
-# This prints the parameter (name: value) pairs, where names are unique IDs for this
-# LogisticRegression instance.
-print "Model 1 was fit using parameters: "
-print model1.extractParamMap()
-
-# We may alternatively specify parameters using a Python dictionary as a paramMap
-paramMap = {lr.maxIter: 20}
-paramMap[lr.maxIter] = 30 # Specify 1 Param, overwriting the original maxIter.
-paramMap.update({lr.regParam: 0.1, lr.threshold: 0.55}) # Specify multiple Params.
-
-# You can combine paramMaps, which are python dictionaries.
-paramMap2 = {lr.probabilityCol: "myProbability"} # Change output column name
-paramMapCombined = paramMap.copy()
-paramMapCombined.update(paramMap2)
-
-# Now learn a new model using the paramMapCombined parameters.
-# paramMapCombined overrides all parameters set earlier via lr.set* methods.
-model2 = lr.fit(training, paramMapCombined)
-print "Model 2 was fit using parameters: "
-print model2.extractParamMap()
-
-# Prepare test data
-test = sqlContext.createDataFrame([
-    (1.0, Vectors.dense([-1.0, 1.5, 1.3])),
-    (0.0, Vectors.dense([3.0, 2.0, -0.1])),
-    (1.0, Vectors.dense([0.0, 2.2, -1.5]))], ["label", "features"])
-
-# Make predictions on test data using the Transformer.transform() method.
-# LogisticRegression.transform will only use the 'features' column.
-# Note that model2.transform() outputs a "myProbability" column instead of the usual
-# 'probability' column since we renamed the lr.probabilityCol parameter previously.
-prediction = model2.transform(test)
-selected = prediction.select("features", "label", "myProbability", "prediction")
-for row in selected.collect():
-    print row
-
-{% endhighlight %}
-</div>
-
-</div>
-
-## Example: Pipeline
-
-This example follows the simple text document `Pipeline` illustrated in the figures above.
-
-<div class="codetabs">
-
-<div data-lang="scala">
-{% highlight scala %}
-import org.apache.spark.ml.Pipeline
-import org.apache.spark.ml.classification.LogisticRegression
-import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
-import org.apache.spark.mllib.linalg.Vector
-import org.apache.spark.sql.Row
-
-// Prepare training documents from a list of (id, text, label) tuples.
-val training = sqlContext.createDataFrame(Seq(
-  (0L, "a b c d e spark", 1.0),
-  (1L, "b d", 0.0),
-  (2L, "spark f g h", 1.0),
-  (3L, "hadoop mapreduce", 0.0)
-)).toDF("id", "text", "label")
-
-// Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
-val tokenizer = new Tokenizer()
-  .setInputCol("text")
-  .setOutputCol("words")
-val hashingTF = new HashingTF()
-  .setNumFeatures(1000)
-  .setInputCol(tokenizer.getOutputCol)
-  .setOutputCol("features")
-val lr = new LogisticRegression()
-  .setMaxIter(10)
-  .setRegParam(0.01)
-val pipeline = new Pipeline()
-  .setStages(Array(tokenizer, hashingTF, lr))
-
-// Fit the pipeline to training documents.
-val model = pipeline.fit(training)
-
-// Prepare test documents, which are unlabeled (id, text) tuples.
-val test = sqlContext.createDataFrame(Seq(
-  (4L, "spark i j k"),
-  (5L, "l m n"),
-  (6L, "mapreduce spark"),
-  (7L, "apache hadoop")
-)).toDF("id", "text")
-
-// Make predictions on test documents.
-model.transform(test)
-  .select("id", "text", "probability", "prediction")
-  .collect()
-  .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) =>
-    println(s"($id, $text) --> prob=$prob, prediction=$prediction")
-  }
-
-{% endhighlight %}
-</div>
-
-<div data-lang="java">
-{% highlight java %}
-import java.util.Arrays;
-import java.util.List;
-
-import org.apache.spark.ml.Pipeline;
-import org.apache.spark.ml.PipelineModel;
-import org.apache.spark.ml.PipelineStage;
-import org.apache.spark.ml.classification.LogisticRegression;
-import org.apache.spark.ml.feature.HashingTF;
-import org.apache.spark.ml.feature.Tokenizer;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-
-// Labeled and unlabeled instance types.
-// Spark SQL can infer schema from Java Beans.
-public class Document implements Serializable {
-  private long id;
-  private String text;
-
-  public Document(long id, String text) {
-    this.id = id;
-    this.text = text;
-  }
-
-  public long getId() { return this.id; }
-  public void setId(long id) { this.id = id; }
-
-  public String getText() { return this.text; }
-  public void setText(String text) { this.text = text; }
-}
-
-public class LabeledDocument extends Document implements Serializable {
-  private double label;
-
-  public LabeledDocument(long id, String text, double label) {
-    super(id, text);
-    this.label = label;
-  }
-
-  public double getLabel() { return this.label; }
-  public void setLabel(double label) { this.label = label; }
-}
-
-// Prepare training documents, which are labeled.
-DataFrame training = sqlContext.createDataFrame(Arrays.asList(
-  new LabeledDocument(0L, "a b c d e spark", 1.0),
-  new LabeledDocument(1L, "b d", 0.0),
-  new LabeledDocument(2L, "spark f g h", 1.0),
-  new LabeledDocument(3L, "hadoop mapreduce", 0.0)
-), LabeledDocument.class);
-
-// Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
-Tokenizer tokenizer = new Tokenizer()
-  .setInputCol("text")
-  .setOutputCol("words");
-HashingTF hashingTF = new HashingTF()
-  .setNumFeatures(1000)
-  .setInputCol(tokenizer.getOutputCol())
-  .setOutputCol("features");
-LogisticRegression lr = new LogisticRegression()
-  .setMaxIter(10)
-  .setRegParam(0.01);
-Pipeline pipeline = new Pipeline()
-  .setStages(new PipelineStage[] {tokenizer, hashingTF, lr});
-
-// Fit the pipeline to training documents.
-PipelineModel model = pipeline.fit(training);
-
-// Prepare test documents, which are unlabeled.
-DataFrame test = sqlContext.createDataFrame(Arrays.asList(
-  new Document(4L, "spark i j k"),
-  new Document(5L, "l m n"),
-  new Document(6L, "mapreduce spark"),
-  new Document(7L, "apache hadoop")
-), Document.class);
-
-// Make predictions on test documents.
-DataFrame predictions = model.transform(test);
-for (Row r: predictions.select("id", "text", "probability", "prediction").collect()) {
-  System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2)
-      + ", prediction=" + r.get(3));
-}
-
-{% endhighlight %}
-</div>
-
-<div data-lang="python">
-{% highlight python %}
-from pyspark.ml import Pipeline
-from pyspark.ml.classification import LogisticRegression
-from pyspark.ml.feature import HashingTF, Tokenizer
-from pyspark.sql import Row
-
-# Prepare training documents from a list of (id, text, label) tuples.
-LabeledDocument = Row("id", "text", "label")
-training = sqlContext.createDataFrame([
-    (0L, "a b c d e spark", 1.0),
-    (1L, "b d", 0.0),
-    (2L, "spark f g h", 1.0),
-    (3L, "hadoop mapreduce", 0.0)], ["id", "text", "label"])
-
-# Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and lr.
-tokenizer = Tokenizer(inputCol="text", outputCol="words")
-hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
-lr = LogisticRegression(maxIter=10, regParam=0.01)
-pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
-
-# Fit the pipeline to training documents.
-model = pipeline.fit(training)
-
-# Prepare test documents, which are unlabeled (id, text) tuples.
-test = sqlContext.createDataFrame([
-    (4L, "spark i j k"),
-    (5L, "l m n"),
-    (6L, "mapreduce spark"),
-    (7L, "apache hadoop")], ["id", "text"])
-
-# Make predictions on test documents and print columns of interest.
-prediction = model.transform(test)
-selected = prediction.select("id", "text", "prediction")
-for row in selected.collect():
-    print(row)
-
-{% endhighlight %}
-</div>
-
-</div>
-
-## Example: model selection via cross-validation
-
-An important task in ML is *model selection*, or using data to find the best model or parameters for a given task.  This is also called *tuning*.
-`Pipeline`s facilitate model selection by making it easy to tune an entire `Pipeline` at once, rather than tuning each element in the `Pipeline` separately.
-
-Currently, `spark.ml` supports model selection using the [`CrossValidator`](api/scala/index.html#org.apache.spark.ml.tuning.CrossValidator) class, which takes an `Estimator`, a set of `ParamMap`s, and an [`Evaluator`](api/scala/index.html#org.apache.spark.ml.evaluation.Evaluator).
-`CrossValidator` begins by splitting the dataset into a set of *folds* which are used as separate training and test datasets; e.g., with `$k=3$` folds, `CrossValidator` will generate 3 (training, test) dataset pairs, each of which uses 2/3 of the data for training and 1/3 for testing.
-`CrossValidator` iterates through the set of `ParamMap`s. For each `ParamMap`, it trains the given `Estimator` and evaluates it using the given `Evaluator`.
-
-The `Evaluator` can be a [`RegressionEvaluator`](api/scala/index.html#org.apache.spark.ml.evaluation.RegressionEvaluator)
-for regression problems, a [`BinaryClassificationEvaluator`](api/scala/index.html#org.apache.spark.ml.evaluation.BinaryClassificationEvaluator)
-for binary data, or a [`MultiClassClassificationEvaluator`](api/scala/index.html#org.apache.spark.ml.evaluation.MultiClassClassificationEvaluator)
-for multiclass problems. The default metric used to choose the best `ParamMap` can be overriden by the `setMetric`
-method in each of these evaluators.
-
-The `ParamMap` which produces the best evaluation metric (averaged over the `$k$` folds) is selected as the best model.
-`CrossValidator` finally fits the `Estimator` using the best `ParamMap` and the entire dataset.
-
-The following example demonstrates using `CrossValidator` to select from a grid of parameters.
-To help construct the parameter grid, we use the [`ParamGridBuilder`](api/scala/index.html#org.apache.spark.ml.tuning.ParamGridBuilder) utility.
-
-Note that cross-validation over a grid of parameters is expensive.
-E.g., in the example below, the parameter grid has 3 values for `hashingTF.numFeatures` and 2 values for `lr.regParam`, and `CrossValidator` uses 2 folds.  This multiplies out to `$(3 \times 2) \times 2 = 12$` different models being trained.
-In realistic settings, it can be common to try many more parameters and use more folds (`$k=3$` and `$k=10$` are common).
-In other words, using `CrossValidator` can be very expensive.
-However, it is also a well-established method for choosing parameters which is more statistically sound than heuristic hand-tuning.
-
-<div class="codetabs">
-
-<div data-lang="scala">
-{% highlight scala %}
-import org.apache.spark.ml.Pipeline
-import org.apache.spark.ml.classification.LogisticRegression
-import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
-import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
-import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
-import org.apache.spark.mllib.linalg.Vector
-import org.apache.spark.sql.Row
-
-// Prepare training data from a list of (id, text, label) tuples.
-val training = sqlContext.createDataFrame(Seq(
-  (0L, "a b c d e spark", 1.0),
-  (1L, "b d", 0.0),
-  (2L, "spark f g h", 1.0),
-  (3L, "hadoop mapreduce", 0.0),
-  (4L, "b spark who", 1.0),
-  (5L, "g d a y", 0.0),
-  (6L, "spark fly", 1.0),
-  (7L, "was mapreduce", 0.0),
-  (8L, "e spark program", 1.0),
-  (9L, "a e c l", 0.0),
-  (10L, "spark compile", 1.0),
-  (11L, "hadoop software", 0.0)
-)).toDF("id", "text", "label")
-
-// Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
-val tokenizer = new Tokenizer()
-  .setInputCol("text")
-  .setOutputCol("words")
-val hashingTF = new HashingTF()
-  .setInputCol(tokenizer.getOutputCol)
-  .setOutputCol("features")
-val lr = new LogisticRegression()
-  .setMaxIter(10)
-val pipeline = new Pipeline()
-  .setStages(Array(tokenizer, hashingTF, lr))
-
-// We use a ParamGridBuilder to construct a grid of parameters to search over.
-// With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,
-// this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from.
-val paramGrid = new ParamGridBuilder()
-  .addGrid(hashingTF.numFeatures, Array(10, 100, 1000))
-  .addGrid(lr.regParam, Array(0.1, 0.01))
-  .build()
-
-// We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
-// This will allow us to jointly choose parameters for all Pipeline stages.
-// A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
-// Note that the evaluator here is a BinaryClassificationEvaluator and its default metric
-// is areaUnderROC.
-val cv = new CrossValidator()
-  .setEstimator(pipeline)
-  .setEvaluator(new BinaryClassificationEvaluator)
-  .setEstimatorParamMaps(paramGrid)
-  .setNumFolds(2) // Use 3+ in practice
-
-// Run cross-validation, and choose the best set of parameters.
-val cvModel = cv.fit(training)
-
-// Prepare test documents, which are unlabeled (id, text) tuples.
-val test = sqlContext.createDataFrame(Seq(
-  (4L, "spark i j k"),
-  (5L, "l m n"),
-  (6L, "mapreduce spark"),
-  (7L, "apache hadoop")
-)).toDF("id", "text")
-
-// Make predictions on test documents. cvModel uses the best model found (lrModel).
-cvModel.transform(test)
-  .select("id", "text", "probability", "prediction")
-  .collect()
-  .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) =>
-    println(s"($id, $text) --> prob=$prob, prediction=$prediction")
-  }
-
-{% endhighlight %}
-</div>
-
-<div data-lang="java">
-{% highlight java %}
-import java.util.Arrays;
-import java.util.List;
-
-import org.apache.spark.ml.Pipeline;
-import org.apache.spark.ml.PipelineStage;
-import org.apache.spark.ml.classification.LogisticRegression;
-import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator;
-import org.apache.spark.ml.feature.HashingTF;
-import org.apache.spark.ml.feature.Tokenizer;
-import org.apache.spark.ml.param.ParamMap;
-import org.apache.spark.ml.tuning.CrossValidator;
-import org.apache.spark.ml.tuning.CrossValidatorModel;
-import org.apache.spark.ml.tuning.ParamGridBuilder;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-
-// Labeled and unlabeled instance types.
-// Spark SQL can infer schema from Java Beans.
-public class Document implements Serializable {
-  private long id;
-  private String text;
-
-  public Document(long id, String text) {
-    this.id = id;
-    this.text = text;
-  }
-
-  public long getId() { return this.id; }
-  public void setId(long id) { this.id = id; }
-
-  public String getText() { return this.text; }
-  public void setText(String text) { this.text = text; }
-}
-
-public class LabeledDocument extends Document implements Serializable {
-  private double label;
-
-  public LabeledDocument(long id, String text, double label) {
-    super(id, text);
-    this.label = label;
-  }
-
-  public double getLabel() { return this.label; }
-  public void setLabel(double label) { this.label = label; }
-}
-
-
-// Prepare training documents, which are labeled.
-DataFrame training = sqlContext.createDataFrame(Arrays.asList(
-  new LabeledDocument(0L, "a b c d e spark", 1.0),
-  new LabeledDocument(1L, "b d", 0.0),
-  new LabeledDocument(2L, "spark f g h", 1.0),
-  new LabeledDocument(3L, "hadoop mapreduce", 0.0),
-  new LabeledDocument(4L, "b spark who", 1.0),
-  new LabeledDocument(5L, "g d a y", 0.0),
-  new LabeledDocument(6L, "spark fly", 1.0),
-  new LabeledDocument(7L, "was mapreduce", 0.0),
-  new LabeledDocument(8L, "e spark program", 1.0),
-  new LabeledDocument(9L, "a e c l", 0.0),
-  new LabeledDocument(10L, "spark compile", 1.0),
-  new LabeledDocument(11L, "hadoop software", 0.0)
-), LabeledDocument.class);
-
-// Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
-Tokenizer tokenizer = new Tokenizer()
-  .setInputCol("text")
-  .setOutputCol("words");
-HashingTF hashingTF = new HashingTF()
-  .setNumFeatures(1000)
-  .setInputCol(tokenizer.getOutputCol())
-  .setOutputCol("features");
-LogisticRegression lr = new LogisticRegression()
-  .setMaxIter(10)
-  .setRegParam(0.01);
-Pipeline pipeline = new Pipeline()
-  .setStages(new PipelineStage[] {tokenizer, hashingTF, lr});
-
-// We use a ParamGridBuilder to construct a grid of parameters to search over.
-// With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,
-// this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from.
-ParamMap[] paramGrid = new ParamGridBuilder()
-    .addGrid(hashingTF.numFeatures(), new int[]{10, 100, 1000})
-    .addGrid(lr.regParam(), new double[]{0.1, 0.01})
-    .build();
-
-// We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
-// This will allow us to jointly choose parameters for all Pipeline stages.
-// A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
-// Note that the evaluator here is a BinaryClassificationEvaluator and its default metric
-// is areaUnderROC.
-CrossValidator cv = new CrossValidator()
-  .setEstimator(pipeline)
-  .setEvaluator(new BinaryClassificationEvaluator())
-  .setEstimatorParamMaps(paramGrid)
-  .setNumFolds(2); // Use 3+ in practice
-
-// Run cross-validation, and choose the best set of parameters.
-CrossValidatorModel cvModel = cv.fit(training);
-
-// Prepare test documents, which are unlabeled.
-DataFrame test = sqlContext.createDataFrame(Arrays.asList(
-  new Document(4L, "spark i j k"),
-  new Document(5L, "l m n"),
-  new Document(6L, "mapreduce spark"),
-  new Document(7L, "apache hadoop")
-), Document.class);
-
-// Make predictions on test documents. cvModel uses the best model found (lrModel).
-DataFrame predictions = cvModel.transform(test);
-for (Row r: predictions.select("id", "text", "probability", "prediction").collect()) {
-  System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2)
-      + ", prediction=" + r.get(3));
-}
-
-{% endhighlight %}
-</div>
-
-</div>
-
-## Example: model selection via train validation split
-In addition to  `CrossValidator` Spark also offers `TrainValidationSplit` for hyper-parameter tuning.
-`TrainValidationSplit` only evaluates each combination of parameters once as opposed to k times in
- case of `CrossValidator`. It is therefore less expensive,
- but will not produce as reliable results when the training dataset is not sufficiently large.
-
-`TrainValidationSplit` takes an `Estimator`, a set of `ParamMap`s provided in the `estimatorParamMaps` parameter,
-and an `Evaluator`.
-It begins by splitting the dataset into two parts using `trainRatio` parameter
-which are used as separate training and test datasets. For example with `$trainRatio=0.75$` (default),
-`TrainValidationSplit` will generate a training and test dataset pair where 75% of the data is used for training and 25% for validation.
-Similar to `CrossValidator`, `TrainValidationSplit` also iterates through the set of `ParamMap`s.
-For each combination of parameters, it trains the given `Estimator` and evaluates it using the given `Evaluator`.
-The `ParamMap` which produces the best evaluation metric is selected as the best option.
-`TrainValidationSplit` finally fits the `Estimator` using the best `ParamMap` and the entire dataset.
-
-<div class="codetabs">
-
-<div data-lang="scala" markdown="1">
-{% highlight scala %}
-import org.apache.spark.ml.evaluation.RegressionEvaluator
-import org.apache.spark.ml.regression.LinearRegression
-import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
-
-// Prepare training and test data.
-val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
-val Array(training, test) = data.randomSplit(Array(0.9, 0.1), seed = 12345)
-
-val lr = new LinearRegression()
-
-// We use a ParamGridBuilder to construct a grid of parameters to search over.
-// TrainValidationSplit will try all combinations of values and determine best model using
-// the evaluator.
-val paramGrid = new ParamGridBuilder()
-  .addGrid(lr.regParam, Array(0.1, 0.01))
-  .addGrid(lr.fitIntercept)
-  .addGrid(lr.elasticNetParam, Array(0.0, 0.5, 1.0))
-  .build()
-
-// In this case the estimator is simply the linear regression.
-// A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
-val trainValidationSplit = new TrainValidationSplit()
-  .setEstimator(lr)
-  .setEvaluator(new RegressionEvaluator)
-  .setEstimatorParamMaps(paramGrid)
-  // 80% of the data will be used for training and the remaining 20% for validation.
-  .setTrainRatio(0.8)
-
-// Run train validation split, and choose the best set of parameters.
-val model = trainValidationSplit.fit(training)
-
-// Make predictions on test data. model is the model with combination of parameters
-// that performed best.
-model.transform(test)
-  .select("features", "label", "prediction")
-  .show()
-
-{% endhighlight %}
-</div>
-
-<div data-lang="java" markdown="1">
-{% highlight java %}
-import org.apache.spark.ml.evaluation.RegressionEvaluator;
-import org.apache.spark.ml.param.ParamMap;
-import org.apache.spark.ml.regression.LinearRegression;
-import org.apache.spark.ml.tuning.*;
-import org.apache.spark.sql.DataFrame;
-
-DataFrame data = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
-
-// Prepare training and test data.
-DataFrame[] splits = data.randomSplit(new double[] {0.9, 0.1}, 12345);
-DataFrame training = splits[0];
-DataFrame test = splits[1];
-
-LinearRegression lr = new LinearRegression();
-
-// We use a ParamGridBuilder to construct a grid of parameters to search over.
-// TrainValidationSplit will try all combinations of values and determine best model using
-// the evaluator.
-ParamMap[] paramGrid = new ParamGridBuilder()
-  .addGrid(lr.regParam(), new double[] {0.1, 0.01})
-  .addGrid(lr.fitIntercept())
-  .addGrid(lr.elasticNetParam(), new double[] {0.0, 0.5, 1.0})
-  .build();
-
-// In this case the estimator is simply the linear regression.
-// A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
-TrainValidationSplit trainValidationSplit = new TrainValidationSplit()
-  .setEstimator(lr)
-  .setEvaluator(new RegressionEvaluator())
-  .setEstimatorParamMaps(paramGrid)
-  .setTrainRatio(0.8); // 80% for training and the remaining 20% for validation
-
-// Run train validation split, and choose the best set of parameters.
-TrainValidationSplitModel model = trainValidationSplit.fit(training);
-
-// Make predictions on test data. model is the model with combination of parameters
-// that performed best.
-model.transform(test)
-  .select("features", "label", "prediction")
-  .show();
-
-{% endhighlight %}
-</div>
-
-</div>
\ No newline at end of file
diff --git a/docs/ml-linear-methods.md b/docs/ml-linear-methods.md
index 0c13d7d0c82b3..a8754835cab95 100644
--- a/docs/ml-linear-methods.md
+++ b/docs/ml-linear-methods.md
@@ -1,148 +1,8 @@
 ---
 layout: global
-title: Linear Methods - ML
-displayTitle: <a href="ml-guide.html">ML</a> - Linear Methods
+title: Linear Methods - spark.ml
+displayTitle: Linear Methods - spark.ml
 ---
 
-
-`\[
-\newcommand{\R}{\mathbb{R}}
-\newcommand{\E}{\mathbb{E}}
-\newcommand{\x}{\mathbf{x}}
-\newcommand{\y}{\mathbf{y}}
-\newcommand{\wv}{\mathbf{w}}
-\newcommand{\av}{\mathbf{\alpha}}
-\newcommand{\bv}{\mathbf{b}}
-\newcommand{\N}{\mathbb{N}}
-\newcommand{\id}{\mathbf{I}}
-\newcommand{\ind}{\mathbf{1}}
-\newcommand{\0}{\mathbf{0}}
-\newcommand{\unit}{\mathbf{e}}
-\newcommand{\one}{\mathbf{1}}
-\newcommand{\zero}{\mathbf{0}}
-\]`
-
-
-In MLlib, we implement popular linear methods such as logistic
-regression and linear least squares with $L_1$ or $L_2$ regularization.
-Refer to [the linear methods in mllib](mllib-linear-methods.html) for
-details.  In `spark.ml`, we also include Pipelines API for [Elastic
-net](http://en.wikipedia.org/wiki/Elastic_net_regularization), a hybrid
-of $L_1$ and $L_2$ regularization proposed in [Zou et al, Regularization
-and variable selection via the elastic
-net](http://users.stat.umn.edu/~zouxx019/Papers/elasticnet.pdf).
-Mathematically, it is defined as a convex combination of the $L_1$ and
-the $L_2$ regularization terms:
-`\[
-\alpha \left( \lambda \|\wv\|_1 \right) + (1-\alpha) \left( \frac{\lambda}{2}\|\wv\|_2^2 \right) , \alpha \in [0, 1], \lambda \geq 0
-\]`
-By setting $\alpha$ properly, elastic net contains both $L_1$ and $L_2$
-regularization as special cases. For example, if a [linear
-regression](https://en.wikipedia.org/wiki/Linear_regression) model is
-trained with the elastic net parameter $\alpha$ set to $1$, it is
-equivalent to a
-[Lasso](http://en.wikipedia.org/wiki/Least_squares#Lasso_method) model.
-On the other hand, if $\alpha$ is set to $0$, the trained model reduces
-to a [ridge
-regression](http://en.wikipedia.org/wiki/Tikhonov_regularization) model.
-We implement Pipelines API for both linear regression and logistic
-regression with elastic net regularization.
-
-## Example: Logistic Regression
-
-The following example shows how to train a logistic regression model
-with elastic net regularization. `elasticNetParam` corresponds to
-$\alpha$ and `regParam` corresponds to $\lambda$.
-
-<div class="codetabs">
-
-<div data-lang="scala" markdown="1">
-{% include_example scala/org/apache/spark/examples/ml/LogisticRegressionWithElasticNetExample.scala %}
-</div>
-
-<div data-lang="java" markdown="1">
-{% include_example java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java %}
-</div>
-
-<div data-lang="python" markdown="1">
-{% include_example python/ml/logistic_regression_with_elastic_net.py %}
-</div>
-
-</div>
-
-The `spark.ml` implementation of logistic regression also supports
-extracting a summary of the model over the training set. Note that the
-predictions and metrics which are stored as `Dataframe` in
-`BinaryLogisticRegressionSummary` are annotated `@transient` and hence
-only available on the driver.
-
-<div class="codetabs">
-
-<div data-lang="scala" markdown="1">
-
-[`LogisticRegressionTrainingSummary`](api/scala/index.html#org.apache.spark.ml.classification.LogisticRegressionTrainingSummary)
-provides a summary for a
-[`LogisticRegressionModel`](api/scala/index.html#org.apache.spark.ml.classification.LogisticRegressionModel).
-Currently, only binary classification is supported and the
-summary must be explicitly cast to
-[`BinaryLogisticRegressionTrainingSummary`](api/scala/index.html#org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummary).
-This will likely change when multiclass classification is supported.
-
-Continuing the earlier example:
-
-{% include_example scala/org/apache/spark/examples/ml/LogisticRegressionSummaryExample.scala %}
-</div>
-
-<div data-lang="java" markdown="1">
-[`LogisticRegressionTrainingSummary`](api/java/org/apache/spark/ml/classification/LogisticRegressionTrainingSummary.html)
-provides a summary for a
-[`LogisticRegressionModel`](api/java/org/apache/spark/ml/classification/LogisticRegressionModel.html).
-Currently, only binary classification is supported and the
-summary must be explicitly cast to
-[`BinaryLogisticRegressionTrainingSummary`](api/java/org/apache/spark/ml/classification/BinaryLogisticRegressionTrainingSummary.html).
-This will likely change when multiclass classification is supported.
-
-Continuing the earlier example:
-
-{% include_example java/org/apache/spark/examples/ml/JavaLogisticRegressionSummaryExample.java %}
-</div>
-
-<!--- TODO: Add python model summaries once implemented -->
-<div data-lang="python" markdown="1">
-Logistic regression model summary is not yet supported in Python.
-</div>
-
-</div>
-
-## Example: Linear Regression
-
-The interface for working with linear regression models and model
-summaries is similar to the logistic regression case. The following
-example demonstrates training an elastic net regularized linear
-regression model and extracting model summary statistics.
-
-<div class="codetabs">
-
-<div data-lang="scala" markdown="1">
-{% include_example scala/org/apache/spark/examples/ml/LinearRegressionWithElasticNetExample.scala %}
-</div>
-
-<div data-lang="java" markdown="1">
-{% include_example java/org/apache/spark/examples/ml/JavaLinearRegressionWithElasticNetExample.java %}
-</div>
-
-<div data-lang="python" markdown="1">
-<!--- TODO: Add python model summaries once implemented -->
-{% include_example python/ml/linear_regression_with_elastic_net.py %}
-</div>
-
-</div>
-
-# Optimization
-
-The optimization algorithm underlying the implementation is called
-[Orthant-Wise Limited-memory
-QuasiNewton](http://research-srv.microsoft.com/en-us/um/people/jfgao/paper/icml07scalable.pdf)
-(OWL-QN). It is an extension of L-BFGS that can effectively handle L1
-regularization and elastic net.
-
+  > This section has been moved into the
+   [classification and regression section](ml-classification-regression.html).
diff --git a/docs/ml-survival-regression.md b/docs/ml-survival-regression.md
index ab275213b9a84..856ceb2f4e7f6 100644
--- a/docs/ml-survival-regression.md
+++ b/docs/ml-survival-regression.md
@@ -1,96 +1,8 @@
 ---
 layout: global
-title: Survival Regression - ML
-displayTitle: <a href="ml-guide.html">ML</a> - Survival Regression
+title: Survival Regression - spark.ml
+displayTitle: Survival Regression - spark.ml
 ---
 
-
-`\[
-\newcommand{\R}{\mathbb{R}}
-\newcommand{\E}{\mathbb{E}}
-\newcommand{\x}{\mathbf{x}}
-\newcommand{\y}{\mathbf{y}}
-\newcommand{\wv}{\mathbf{w}}
-\newcommand{\av}{\mathbf{\alpha}}
-\newcommand{\bv}{\mathbf{b}}
-\newcommand{\N}{\mathbb{N}}
-\newcommand{\id}{\mathbf{I}}
-\newcommand{\ind}{\mathbf{1}}
-\newcommand{\0}{\mathbf{0}}
-\newcommand{\unit}{\mathbf{e}}
-\newcommand{\one}{\mathbf{1}}
-\newcommand{\zero}{\mathbf{0}}
-\]`
-
-
-In `spark.ml`, we implement the [Accelerated failure time (AFT)](https://en.wikipedia.org/wiki/Accelerated_failure_time_model) 
-model which is a parametric survival regression model for censored data. 
-It describes a model for the log of survival time, so it's often called 
-log-linear model for survival analysis. Different from 
-[Proportional hazards](https://en.wikipedia.org/wiki/Proportional_hazards_model) model
-designed for the same purpose, the AFT model is more easily to parallelize 
-because each instance contribute to the objective function independently.
-
-Given the values of the covariates $x^{'}$, for random lifetime $t_{i}$ of 
-subjects i = 1, ..., n, with possible right-censoring, 
-the likelihood function under the AFT model is given as:
-`\[
-L(\beta,\sigma)=\prod_{i=1}^n[\frac{1}{\sigma}f_{0}(\frac{\log{t_{i}}-x^{'}\beta}{\sigma})]^{\delta_{i}}S_{0}(\frac{\log{t_{i}}-x^{'}\beta}{\sigma})^{1-\delta_{i}}
-\]`
-Where $\delta_{i}$ is the indicator of the event has occurred i.e. uncensored or not.
-Using $\epsilon_{i}=\frac{\log{t_{i}}-x^{'}\beta}{\sigma}$, the log-likelihood function
-assumes the form:
-`\[
-\iota(\beta,\sigma)=\sum_{i=1}^{n}[-\delta_{i}\log\sigma+\delta_{i}\log{f_{0}}(\epsilon_{i})+(1-\delta_{i})\log{S_{0}(\epsilon_{i})}]
-\]`
-Where $S_{0}(\epsilon_{i})$ is the baseline survivor function,
-and $f_{0}(\epsilon_{i})$ is corresponding density function.
-
-The most commonly used AFT model is based on the Weibull distribution of the survival time. 
-The Weibull distribution for lifetime corresponding to extreme value distribution for 
-log of the lifetime, and the $S_{0}(\epsilon)$ function is:
-`\[   
-S_{0}(\epsilon_{i})=\exp(-e^{\epsilon_{i}})
-\]`
-the $f_{0}(\epsilon_{i})$ function is:
-`\[
-f_{0}(\epsilon_{i})=e^{\epsilon_{i}}\exp(-e^{\epsilon_{i}})
-\]`
-The log-likelihood function for AFT model with Weibull distribution of lifetime is:
-`\[
-\iota(\beta,\sigma)= -\sum_{i=1}^n[\delta_{i}\log\sigma-\delta_{i}\epsilon_{i}+e^{\epsilon_{i}}]
-\]`
-Due to minimizing the negative log-likelihood equivalent to maximum a posteriori probability,
-the loss function we use to optimize is $-\iota(\beta,\sigma)$.
-The gradient functions for $\beta$ and $\log\sigma$ respectively are:
-`\[   
-\frac{\partial (-\iota)}{\partial \beta}=\sum_{1=1}^{n}[\delta_{i}-e^{\epsilon_{i}}]\frac{x_{i}}{\sigma}
-\]`
-`\[ 
-\frac{\partial (-\iota)}{\partial (\log\sigma)}=\sum_{i=1}^{n}[\delta_{i}+(\delta_{i}-e^{\epsilon_{i}})\epsilon_{i}]
-\]`
-
-The AFT model can be formulated as a convex optimization problem, 
-i.e. the task of finding a minimizer of a convex function $-\iota(\beta,\sigma)$ 
-that depends coefficients vector $\beta$ and the log of scale parameter $\log\sigma$.
-The optimization algorithm underlying the implementation is L-BFGS.
-The implementation matches the result from R's survival function 
-[survreg](https://stat.ethz.ch/R-manual/R-devel/library/survival/html/survreg.html)
-
-## Example:
-
-<div class="codetabs">
-
-<div data-lang="scala" markdown="1">
-{% include_example scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala %}
-</div>
-
-<div data-lang="java" markdown="1">
-{% include_example java/org/apache/spark/examples/ml/JavaAFTSurvivalRegressionExample.java %}
-</div>
-
-<div data-lang="python" markdown="1">
-{% include_example python/ml/aft_survival_regression.py %}
-</div>
-
-</div>
\ No newline at end of file
+  > This section has been moved into the
+   [classification and regression section](ml-classification-regression.html#survival-regression).
diff --git a/docs/mllib-classification-regression.md b/docs/mllib-classification-regression.md
index 0210950b89906..aaf8bd465c9ab 100644
--- a/docs/mllib-classification-regression.md
+++ b/docs/mllib-classification-regression.md
@@ -1,10 +1,10 @@
 ---
 layout: global
-title: Classification and Regression - MLlib
-displayTitle: <a href="mllib-guide.html">MLlib</a> - Classification and Regression
+title: Classification and Regression - spark.mllib
+displayTitle: Classification and Regression - spark.mllib
 ---
 
-MLlib supports various methods for 
+The `spark.mllib` package supports various methods for 
 [binary classification](http://en.wikipedia.org/wiki/Binary_classification),
 [multiclass
 classification](http://en.wikipedia.org/wiki/Multiclass_classification), and
diff --git a/docs/mllib-clustering.md b/docs/mllib-clustering.md
index 8fbced6c87d9f..48d64cd402b11 100644
--- a/docs/mllib-clustering.md
+++ b/docs/mllib-clustering.md
@@ -1,7 +1,7 @@
 ---
 layout: global
-title: Clustering - MLlib
-displayTitle: <a href="mllib-guide.html">MLlib</a> - Clustering
+title: Clustering - spark.mllib
+displayTitle: Clustering - spark.mllib
 ---
 
 [Clustering](https://en.wikipedia.org/wiki/Cluster_analysis) is an unsupervised learning problem whereby we aim to group subsets
@@ -10,19 +10,19 @@ often used for exploratory analysis and/or as a component of a hierarchical
 [supervised learning](https://en.wikipedia.org/wiki/Supervised_learning) pipeline (in which distinct classifiers or regression
 models are trained for each cluster).
 
-MLlib supports the following models:
+The `spark.mllib` package supports the following models:
 
 * Table of contents
 {:toc}
 
 ## K-means
 
-[k-means](http://en.wikipedia.org/wiki/K-means_clustering) is one of the
+[K-means](http://en.wikipedia.org/wiki/K-means_clustering) is one of the
 most commonly used clustering algorithms that clusters the data points into a
-predefined number of clusters. The MLlib implementation includes a parallelized
+predefined number of clusters. The `spark.mllib` implementation includes a parallelized
 variant of the [k-means++](http://en.wikipedia.org/wiki/K-means%2B%2B) method
 called [kmeans||](http://theory.stanford.edu/~sergei/papers/vldb12-kmpar.pdf).
-The implementation in MLlib has the following parameters:
+The implementation in `spark.mllib` has the following parameters:
 
 * *k* is the number of desired clusters.
 * *maxIterations* is the maximum number of iterations to run.
@@ -171,7 +171,7 @@ sameModel = KMeansModel.load(sc, "myModelPath")
 
 A [Gaussian Mixture Model](http://en.wikipedia.org/wiki/Mixture_model#Multivariate_Gaussian_mixture_model)
 represents a composite distribution whereby points are drawn from one of *k* Gaussian sub-distributions,
-each with its own probability.  The MLlib implementation uses the
+each with its own probability.  The `spark.mllib` implementation uses the
 [expectation-maximization](http://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm)
  algorithm to induce the maximum-likelihood model given a set of samples.  The implementation
 has the following parameters:
@@ -308,13 +308,13 @@ graph given pairwise similarties as edge properties,
 described in [Lin and Cohen, Power Iteration Clustering](http://www.icml2010.org/papers/387.pdf).
 It computes a pseudo-eigenvector of the normalized affinity matrix of the graph via
 [power iteration](http://en.wikipedia.org/wiki/Power_iteration)  and uses it to cluster vertices.
-MLlib includes an implementation of PIC using GraphX as its backend.
+`spark.mllib` includes an implementation of PIC using GraphX as its backend.
 It takes an `RDD` of `(srcId, dstId, similarity)` tuples and outputs a model with the clustering assignments.
 The similarities must be nonnegative.
 PIC assumes that the similarity measure is symmetric.
 A pair `(srcId, dstId)` regardless of the ordering should appear at most once in the input data.
 If a pair is missing from input, their similarity is treated as zero.
-MLlib's PIC implementation takes the following (hyper-)parameters:
+`spark.mllib`'s PIC implementation takes the following (hyper-)parameters:
 
 * `k`: number of clusters
 * `maxIterations`: maximum number of power iterations
@@ -323,7 +323,7 @@ MLlib's PIC implementation takes the following (hyper-)parameters:
 
 **Examples**
 
-In the following, we show code snippets to demonstrate how to use PIC in MLlib.
+In the following, we show code snippets to demonstrate how to use PIC in `spark.mllib`.
 
 <div class="codetabs">
 <div data-lang="scala" markdown="1">
@@ -493,7 +493,7 @@ checkpointing can help reduce shuffle file sizes on disk and help with
 failure recovery.
 
 
-All of MLlib's LDA models support:
+All of `spark.mllib`'s LDA models support:
 
 * `describeTopics`: Returns topics as arrays of most important terms and
 term weights
@@ -721,7 +721,7 @@ sameModel = LDAModel.load(sc, "myModelPath")
 ## Streaming k-means
 
 When data arrive in a stream, we may want to estimate clusters dynamically,
-updating them as new data arrive. MLlib provides support for streaming k-means clustering,
+updating them as new data arrive. `spark.mllib` provides support for streaming k-means clustering,
 with parameters to control the decay (or "forgetfulness") of the estimates. The algorithm
 uses a generalization of the mini-batch k-means update rule. For each batch of data, we assign
 all points to their nearest cluster, compute new cluster centers, then update each cluster using:
diff --git a/docs/mllib-collaborative-filtering.md b/docs/mllib-collaborative-filtering.md
index 7cd1b894e7cb5..1ebb4654aef12 100644
--- a/docs/mllib-collaborative-filtering.md
+++ b/docs/mllib-collaborative-filtering.md
@@ -1,7 +1,7 @@
 ---
 layout: global
-title: Collaborative Filtering - MLlib
-displayTitle: <a href="mllib-guide.html">MLlib</a> - Collaborative Filtering 
+title: Collaborative Filtering - spark.mllib
+displayTitle: Collaborative Filtering - spark.mllib
 ---
 
 * Table of contents
@@ -11,12 +11,12 @@ displayTitle: <a href="mllib-guide.html">MLlib</a> - Collaborative Filtering
 
 [Collaborative filtering](http://en.wikipedia.org/wiki/Recommender_system#Collaborative_filtering)
 is commonly used for recommender systems.  These techniques aim to fill in the
-missing entries of a user-item association matrix.  MLlib currently supports
+missing entries of a user-item association matrix.  `spark.mllib` currently supports
 model-based collaborative filtering, in which users and products are described
 by a small set of latent factors that can be used to predict missing entries.
-MLlib uses the [alternating least squares
+`spark.mllib` uses the [alternating least squares
 (ALS)](http://dl.acm.org/citation.cfm?id=1608614)
-algorithm to learn these latent factors. The implementation in MLlib has the
+algorithm to learn these latent factors. The implementation in `spark.mllib` has the
 following parameters:
 
 * *numBlocks* is the number of blocks used to parallelize computation (set to -1 to auto-configure).
@@ -34,7 +34,7 @@ The standard approach to matrix factorization based collaborative filtering trea
 the entries in the user-item matrix as *explicit* preferences given by the user to the item.
 
 It is common in many real-world use cases to only have access to *implicit feedback* (e.g. views,
-clicks, purchases, likes, shares etc.). The approach used in MLlib to deal with such data is taken
+clicks, purchases, likes, shares etc.). The approach used in `spark.mllib` to deal with such data is taken
 from
 [Collaborative Filtering for Implicit Feedback Datasets](http://dx.doi.org/10.1109/ICDM.2008.22).
 Essentially instead of trying to model the matrix of ratings directly, this approach treats the data
@@ -119,4 +119,4 @@ a dependency.
 ## Tutorial
 
 The [training exercises](https://databricks-training.s3.amazonaws.com/index.html) from the Spark Summit 2014 include a hands-on tutorial for
-[personalized movie recommendation with MLlib](https://databricks-training.s3.amazonaws.com/movie-recommendation-with-mllib.html).
+[personalized movie recommendation with `spark.mllib`](https://databricks-training.s3.amazonaws.com/movie-recommendation-with-mllib.html).
diff --git a/docs/mllib-data-types.md b/docs/mllib-data-types.md
index 3c0c0479674df..363dc7c13b306 100644
--- a/docs/mllib-data-types.md
+++ b/docs/mllib-data-types.md
@@ -1,7 +1,7 @@
 ---
 layout: global
 title: Data Types - MLlib
-displayTitle: <a href="mllib-guide.html">MLlib</a> - Data Types
+displayTitle: Data Types - MLlib
 ---
 
 * Table of contents
diff --git a/docs/mllib-decision-tree.md b/docs/mllib-decision-tree.md
index 77ce34e91af3c..a8612b6c84fe9 100644
--- a/docs/mllib-decision-tree.md
+++ b/docs/mllib-decision-tree.md
@@ -1,7 +1,7 @@
 ---
 layout: global
-title: Decision Trees - MLlib
-displayTitle: <a href="mllib-guide.html">MLlib</a> - Decision Trees
+title: Decision Trees - spark.mllib
+displayTitle: Decision Trees - spark.mllib
 ---
 
 * Table of contents
@@ -15,7 +15,7 @@ feature scaling, and are able to capture non-linearities and feature interaction
 algorithms such as random forests and boosting are among the top performers for classification and
 regression tasks.
 
-MLlib supports decision trees for binary and multiclass classification and for regression,
+`spark.mllib` supports decision trees for binary and multiclass classification and for regression,
 using both continuous and categorical features. The implementation partitions data by rows,
 allowing distributed training with millions of instances.
 
diff --git a/docs/mllib-dimensionality-reduction.md b/docs/mllib-dimensionality-reduction.md
index ac3526908a9f4..11d8e0bd1d23d 100644
--- a/docs/mllib-dimensionality-reduction.md
+++ b/docs/mllib-dimensionality-reduction.md
@@ -1,7 +1,7 @@
 ---
 layout: global
-title: Dimensionality Reduction - MLlib
-displayTitle: <a href="mllib-guide.html">MLlib</a> - Dimensionality Reduction
+title: Dimensionality Reduction - spark.mllib
+displayTitle: Dimensionality Reduction - spark.mllib
 ---
 
 * Table of contents
@@ -11,7 +11,7 @@ displayTitle: <a href="mllib-guide.html">MLlib</a> - Dimensionality Reduction
 of reducing the number of variables under consideration.
 It can be used to extract latent features from raw and noisy features
 or compress data while maintaining the structure.
-MLlib provides support for dimensionality reduction on the <a href="mllib-data-types.html#rowmatrix">RowMatrix</a> class.
+`spark.mllib` provides support for dimensionality reduction on the <a href="mllib-data-types.html#rowmatrix">RowMatrix</a> class.
 
 ## Singular value decomposition (SVD)
 
@@ -57,7 +57,7 @@ passes, $O(n)$ storage on each executor, and $O(n k)$ storage on the driver.
 
 ### SVD Example
  
-MLlib provides SVD functionality to row-oriented matrices, provided in the
+`spark.mllib` provides SVD functionality to row-oriented matrices, provided in the
 <a href="mllib-data-types.html#rowmatrix">RowMatrix</a> class. 
 
 <div class="codetabs">
@@ -141,7 +141,7 @@ statistical method to find a rotation such that the first coordinate has the lar
 possible, and each succeeding coordinate in turn has the largest variance possible. The columns of
 the rotation matrix are called principal components. PCA is used widely in dimensionality reduction.
 
-MLlib supports PCA for tall-and-skinny matrices stored in row-oriented format and any Vectors.
+`spark.mllib` supports PCA for tall-and-skinny matrices stored in row-oriented format and any Vectors.
 
 <div class="codetabs">
 <div data-lang="scala" markdown="1">
diff --git a/docs/mllib-ensembles.md b/docs/mllib-ensembles.md
index 50450e05d2abb..2416b6fa0aeb3 100644
--- a/docs/mllib-ensembles.md
+++ b/docs/mllib-ensembles.md
@@ -1,7 +1,7 @@
 ---
 layout: global
-title: Ensembles - MLlib
-displayTitle: <a href="mllib-guide.html">MLlib</a> - Ensembles
+title: Ensembles - spark.mllib
+displayTitle: Ensembles - spark.mllib
 ---
 
 * Table of contents
@@ -9,7 +9,7 @@ displayTitle: <a href="mllib-guide.html">MLlib</a> - Ensembles
 
 An [ensemble method](http://en.wikipedia.org/wiki/Ensemble_learning)
 is a learning algorithm which creates a model composed of a set of other base models.
-MLlib supports two major ensemble algorithms: [`GradientBoostedTrees`](api/scala/index.html#org.apache.spark.mllib.tree.GradientBoostedTrees) and [`RandomForest`](api/scala/index.html#org.apache.spark.mllib.tree.RandomForest).
+`spark.mllib` supports two major ensemble algorithms: [`GradientBoostedTrees`](api/scala/index.html#org.apache.spark.mllib.tree.GradientBoostedTrees) and [`RandomForest`](api/scala/index.html#org.apache.spark.mllib.tree.RandomForest).
 Both use [decision trees](mllib-decision-tree.html) as their base models.
 
 ## Gradient-Boosted Trees vs. Random Forests
@@ -33,9 +33,9 @@ Like decision trees, random forests handle categorical features,
 extend to the multiclass classification setting, do not require
 feature scaling, and are able to capture non-linearities and feature interactions.
 
-MLlib supports random forests for binary and multiclass classification and for regression,
+`spark.mllib` supports random forests for binary and multiclass classification and for regression,
 using both continuous and categorical features.
-MLlib implements random forests using the existing [decision tree](mllib-decision-tree.html)
+`spark.mllib` implements random forests using the existing [decision tree](mllib-decision-tree.html)
 implementation.  Please see the decision tree guide for more information on trees.
 
 ### Basic algorithm
@@ -155,9 +155,9 @@ Like decision trees, GBTs handle categorical features,
 extend to the multiclass classification setting, do not require
 feature scaling, and are able to capture non-linearities and feature interactions.
 
-MLlib supports GBTs for binary classification and for regression,
+`spark.mllib` supports GBTs for binary classification and for regression,
 using both continuous and categorical features.
-MLlib implements GBTs using the existing [decision tree](mllib-decision-tree.html) implementation.  Please see the decision tree guide for more information on trees.
+`spark.mllib` implements GBTs using the existing [decision tree](mllib-decision-tree.html) implementation.  Please see the decision tree guide for more information on trees.
 
 *Note*: GBTs do not yet support multiclass classification.  For multiclass problems, please use
 [decision trees](mllib-decision-tree.html) or [Random Forests](mllib-ensembles.html#Random-Forest).
@@ -171,7 +171,7 @@ The specific mechanism for re-labeling instances is defined by a loss function (
 
 #### Losses
 
-The table below lists the losses currently supported by GBTs in MLlib.
+The table below lists the losses currently supported by GBTs in `spark.mllib`.
 Note that each loss is applicable to one of classification or regression, not both.
 
 Notation: $N$ = number of instances. $y_i$ = label of instance $i$.  $x_i$ = features of instance $i$.  $F(x_i)$ = model's predicted label for instance $i$.
diff --git a/docs/mllib-evaluation-metrics.md b/docs/mllib-evaluation-metrics.md
index 6924037b941f3..774826c2703f8 100644
--- a/docs/mllib-evaluation-metrics.md
+++ b/docs/mllib-evaluation-metrics.md
@@ -1,20 +1,20 @@
 ---
 layout: global
-title: Evaluation Metrics - MLlib
-displayTitle: <a href="mllib-guide.html">MLlib</a> - Evaluation Metrics
+title: Evaluation Metrics - spark.mllib
+displayTitle: Evaluation Metrics - spark.mllib
 ---
 
 * Table of contents
 {:toc}
 
-Spark's MLlib comes with a number of machine learning algorithms that can be used to learn from and make predictions
+`spark.mllib` comes with a number of machine learning algorithms that can be used to learn from and make predictions
 on data. When these algorithms are applied to build machine learning models, there is a need to evaluate the performance
-of the model on some criteria, which depends on the application and its requirements. Spark's MLlib also provides a
+of the model on some criteria, which depends on the application and its requirements. `spark.mllib` also provides a
 suite of metrics for the purpose of evaluating the performance of machine learning models.
 
 Specific machine learning algorithms fall under broader types of machine learning applications like classification,
 regression, clustering, etc. Each of these types have well established metrics for performance evaluation and those
-metrics that are currently available in Spark's MLlib are detailed in this section.
+metrics that are currently available in `spark.mllib` are detailed in this section.
 
 ## Classification model evaluation
 
diff --git a/docs/mllib-feature-extraction.md b/docs/mllib-feature-extraction.md
index 5bee170c61fe9..7796bac697562 100644
--- a/docs/mllib-feature-extraction.md
+++ b/docs/mllib-feature-extraction.md
@@ -1,7 +1,7 @@
 ---
 layout: global
-title: Feature Extraction and Transformation - MLlib
-displayTitle: <a href="mllib-guide.html">MLlib</a> - Feature Extraction and Transformation
+title: Feature Extraction and Transformation - spark.mllib
+displayTitle: Feature Extraction and Transformation - spark.mllib
 ---
 
 * Table of contents
@@ -31,7 +31,7 @@ The TF-IDF measure is simply the product of TF and IDF:
 TFIDF(t, d, D) = TF(t, d) \cdot IDF(t, D).
 \]`
 There are several variants on the definition of term frequency and document frequency.
-In MLlib, we separate TF and IDF to make them flexible.
+In `spark.mllib`, we separate TF and IDF to make them flexible.
 
 Our implementation of term frequency utilizes the
 [hashing trick](http://en.wikipedia.org/wiki/Feature_hashing).
@@ -44,7 +44,7 @@ To reduce the chance of collision, we can increase the target feature dimension,
 the number of buckets of the hash table.
 The default feature dimension is `$2^{20} = 1,048,576$`.
 
-**Note:** MLlib doesn't provide tools for text segmentation.
+**Note:** `spark.mllib` doesn't provide tools for text segmentation.
 We refer users to the [Stanford NLP Group](http://nlp.stanford.edu/) and 
 [scalanlp/chalk](https://github.com/scalanlp/chalk).
 
@@ -86,7 +86,7 @@ val idf = new IDF().fit(tf)
 val tfidf: RDD[Vector] = idf.transform(tf)
 {% endhighlight %}
 
-MLlib's IDF implementation provides an option for ignoring terms which occur in less than a
+`spark.mllib`'s IDF implementation provides an option for ignoring terms which occur in less than a
 minimum number of documents.  In such cases, the IDF for these terms is set to 0.  This feature
 can be used by passing the `minDocFreq` value to the IDF constructor.
 
@@ -134,7 +134,7 @@ idf = IDF().fit(tf)
 tfidf = idf.transform(tf)
 {% endhighlight %}
 
-MLLib's IDF implementation provides an option for ignoring terms which occur in less than a
+`spark.mllib`'s IDF implementation provides an option for ignoring terms which occur in less than a
 minimum number of documents.  In such cases, the IDF for these terms is set to 0.  This feature
 can be used by passing the `minDocFreq` value to the IDF constructor.
 
diff --git a/docs/mllib-frequent-pattern-mining.md b/docs/mllib-frequent-pattern-mining.md
index fe42896a05d8e..2c8a8f236163f 100644
--- a/docs/mllib-frequent-pattern-mining.md
+++ b/docs/mllib-frequent-pattern-mining.md
@@ -1,7 +1,7 @@
 ---
 layout: global
-title: Frequent Pattern Mining - MLlib
-displayTitle: <a href="mllib-guide.html">MLlib</a> - Frequent Pattern Mining
+title: Frequent Pattern Mining - spark.mllib
+displayTitle: Frequent Pattern Mining - spark.mllib
 ---
 
 Mining frequent items, itemsets, subsequences, or other substructures is usually among the
@@ -9,7 +9,7 @@ first steps to analyze a large-scale dataset, which has been an active research
 data mining for years.
 We refer users to Wikipedia's [association rule learning](http://en.wikipedia.org/wiki/Association_rule_learning)
 for more information.
-MLlib provides a parallel implementation of FP-growth,
+`spark.mllib` provides a parallel implementation of FP-growth,
 a popular algorithm to mining frequent itemsets.
 
 ## FP-growth
@@ -22,13 +22,13 @@ Different from [Apriori-like](http://en.wikipedia.org/wiki/Apriori_algorithm) al
 the second step of FP-growth uses a suffix tree (FP-tree) structure to encode transactions without generating candidate sets
 explicitly, which are usually expensive to generate.
 After the second step, the frequent itemsets can be extracted from the FP-tree.
-In MLlib, we implemented a parallel version of FP-growth called PFP,
+In `spark.mllib`, we implemented a parallel version of FP-growth called PFP,
 as described in [Li et al., PFP: Parallel FP-growth for query recommendation](http://dx.doi.org/10.1145/1454008.1454027).
 PFP distributes the work of growing FP-trees based on the suffices of transactions,
 and hence more scalable than a single-machine implementation.
 We refer users to the papers for more details.
 
-MLlib's FP-growth implementation takes the following (hyper-)parameters:
+`spark.mllib`'s FP-growth implementation takes the following (hyper-)parameters:
 
 * `minSupport`: the minimum support for an itemset to be identified as frequent.
   For example, if an item appears 3 out of 5 transactions, it has a support of 3/5=0.6.
@@ -126,7 +126,7 @@ PrefixSpan Approach](http://dx.doi.org/10.1109%2FTKDE.2004.77). We refer
 the reader to the referenced paper for formalizing the sequential
 pattern mining problem.
 
-MLlib's PrefixSpan implementation takes the following parameters:
+`spark.mllib`'s PrefixSpan implementation takes the following parameters:
 
 * `minSupport`: the minimum support required to be considered a frequent
   sequential pattern.
diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md
index 3bc2b780601c2..7fef6b5c61f99 100644
--- a/docs/mllib-guide.md
+++ b/docs/mllib-guide.md
@@ -66,7 +66,7 @@ We list major functionality from both below, with links to detailed guides.
 
 # spark.ml: high-level APIs for ML pipelines
 
-* [Overview: estimators, transformers and pipelines](ml-intro.html)
+* [Overview: estimators, transformers and pipelines](ml-guide.html)
 * [Extracting, transforming and selecting features](ml-features.html)
 * [Classification and regression](ml-classification-regression.html)
 * [Clustering](ml-clustering.html)
diff --git a/docs/mllib-isotonic-regression.md b/docs/mllib-isotonic-regression.md
index 85f9226b43416..8ede4407d5843 100644
--- a/docs/mllib-isotonic-regression.md
+++ b/docs/mllib-isotonic-regression.md
@@ -1,7 +1,7 @@
 ---
 layout: global
-title: Isotonic regression - MLlib
-displayTitle: <a href="mllib-guide.html">MLlib</a> - Regression
+title: Isotonic regression - spark.mllib
+displayTitle: Isotonic regression - spark.mllib
 ---
 
 ## Isotonic regression
@@ -23,7 +23,7 @@ Essentially isotonic regression is a
 [monotonic function](http://en.wikipedia.org/wiki/Monotonic_function)
 best fitting the original data points.
 
-MLlib supports a
+`spark.mllib` supports a
 [pool adjacent violators algorithm](http://doi.org/10.1198/TECH.2010.10111)
 which uses an approach to
 [parallelizing isotonic regression](http://doi.org/10.1007/978-3-642-99789-1_10).
diff --git a/docs/mllib-linear-methods.md b/docs/mllib-linear-methods.md
index 132f8c354aa9c..20b35612cab95 100644
--- a/docs/mllib-linear-methods.md
+++ b/docs/mllib-linear-methods.md
@@ -1,7 +1,7 @@
 ---
 layout: global
-title: Linear Methods - MLlib
-displayTitle: <a href="mllib-guide.html">MLlib</a> - Linear Methods
+title: Linear Methods - spark.mllib
+displayTitle: Linear Methods - spark.mllib
 ---
 
 * Table of contents
@@ -41,7 +41,7 @@ the objective function is of the form
 Here the vectors `$\x_i\in\R^d$` are the training data examples, for `$1\le i\le n$`, and
 `$y_i\in\R$` are their corresponding labels, which we want to predict.
 We call the method *linear* if $L(\wv; \x, y)$ can be expressed as a function of $\wv^T x$ and $y$.
-Several of MLlib's classification and regression algorithms fall into this category,
+Several of `spark.mllib`'s classification and regression algorithms fall into this category,
 and are discussed here.
 
 The objective function `$f$` has two parts:
@@ -55,7 +55,7 @@ training error) and minimizing model complexity (i.e., to avoid overfitting).
 ### Loss functions
 
 The following table summarizes the loss functions and their gradients or sub-gradients for the
-methods MLlib supports:
+methods `spark.mllib` supports:
 
 <table class="table">
   <thead>
@@ -83,7 +83,7 @@ methods MLlib supports:
 The purpose of the
 [regularizer](http://en.wikipedia.org/wiki/Regularization_(mathematics)) is to
 encourage simple models and avoid overfitting.  We support the following
-regularizers in MLlib:
+regularizers in `spark.mllib`:
 
 <table class="table">
   <thead>
@@ -115,7 +115,10 @@ especially when the number of training examples is small.
 
 ### Optimization
 
-Under the hood, linear methods use convex optimization methods to optimize the objective functions.  MLlib uses two methods, SGD and L-BFGS, described in the [optimization section](mllib-optimization.html).  Currently, most algorithm APIs support Stochastic Gradient Descent (SGD), and a few support L-BFGS. Refer to [this optimization section](mllib-optimization.html#Choosing-an-Optimization-Method) for guidelines on choosing between optimization methods.
+Under the hood, linear methods use convex optimization methods to optimize the objective functions.
+`spark.mllib` uses two methods, SGD and L-BFGS, described in the [optimization section](mllib-optimization.html).
+Currently, most algorithm APIs support Stochastic Gradient Descent (SGD), and a few support L-BFGS.
+Refer to [this optimization section](mllib-optimization.html#Choosing-an-Optimization-Method) for guidelines on choosing between optimization methods.
 
 ## Classification
 
@@ -126,16 +129,16 @@ The most common classification type is
 categories, usually named positive and negative.
 If there are more than two categories, it is called
 [multiclass classification](http://en.wikipedia.org/wiki/Multiclass_classification).
-MLlib supports two linear methods for classification: linear Support Vector Machines (SVMs)
+`spark.mllib` supports two linear methods for classification: linear Support Vector Machines (SVMs)
 and logistic regression.
 Linear SVMs supports only binary classification, while logistic regression supports both binary and
 multiclass classification problems.
-For both methods, MLlib supports L1 and L2 regularized variants.
+For both methods, `spark.mllib` supports L1 and L2 regularized variants.
 The training data set is represented by an RDD of [LabeledPoint](mllib-data-types.html) in MLlib,
 where labels are class indices starting from zero: $0, 1, 2, \ldots$.
 Note that, in the mathematical formulation in this guide, a binary label $y$ is denoted as either
 $+1$ (positive) or $-1$ (negative), which is convenient for the formulation.
-*However*, the negative label is represented by $0$ in MLlib instead of $-1$, to be consistent with
+*However*, the negative label is represented by $0$ in `spark.mllib` instead of $-1$, to be consistent with
 multiclass labeling.
 
 ### Linear Support Vector Machines (SVMs)
@@ -207,7 +210,7 @@ val sameModel = SVMModel.load(sc, "myModelPath")
 The `SVMWithSGD.train()` method by default performs L2 regularization with the
 regularization parameter set to 1.0. If we want to configure this algorithm, we
 can customize `SVMWithSGD` further by creating a new object directly and
-calling setter methods. All other MLlib algorithms support customization in
+calling setter methods. All other `spark.mllib` algorithms support customization in
 this way as well. For example, the following code produces an L1 regularized
 variant of SVMs with regularization parameter set to 0.1, and runs the training
 algorithm for 200 iterations.
@@ -293,7 +296,7 @@ public class SVMClassifier {
 The `SVMWithSGD.train()` method by default performs L2 regularization with the
 regularization parameter set to 1.0. If we want to configure this algorithm, we
 can customize `SVMWithSGD` further by creating a new object directly and
-calling setter methods. All other MLlib algorithms support customization in
+calling setter methods. All other `spark.mllib` algorithms support customization in
 this way as well. For example, the following code produces an L1 regularized
 variant of SVMs with regularization parameter set to 0.1, and runs the training
 algorithm for 200 iterations.
@@ -375,7 +378,7 @@ Binary logistic regression can be generalized into
 train and predict multiclass classification problems.
 For example, for $K$ possible outcomes, one of the outcomes can be chosen as a "pivot", and the
 other $K - 1$ outcomes can be separately regressed against the pivot outcome.
-In MLlib, the first class $0$ is chosen as the "pivot" class.
+In `spark.mllib`, the first class $0$ is chosen as the "pivot" class.
 See Section 4.4 of
 [The Elements of Statistical Learning](http://statweb.stanford.edu/~tibs/ElemStatLearn/) for
 references.
@@ -726,7 +729,7 @@ a dependency.
 ###Streaming linear regression
 
 When data arrive in a streaming fashion, it is useful to fit regression models online,
-updating the parameters of the model as new data arrives. MLlib currently supports
+updating the parameters of the model as new data arrives. `spark.mllib` currently supports
 streaming linear regression using ordinary least squares. The fitting is similar
 to that performed offline, except fitting occurs on each batch of data, so that
 the model continually updates to reflect the data from the stream.
@@ -852,7 +855,7 @@ will get better!
 
 # Implementation (developer)
 
-Behind the scene, MLlib implements a simple distributed version of stochastic gradient descent
+Behind the scenes, `spark.mllib` implements a simple distributed version of stochastic gradient descent
 (SGD), building on the underlying gradient descent primitive (as described in the <a
 href="mllib-optimization.html">optimization</a> section).  All provided algorithms take as input a
 regularization parameter (`regParam`) along with various parameters associated with stochastic
diff --git a/docs/mllib-migration-guides.md b/docs/mllib-migration-guides.md
index 774b85d1f773a..73e4fddf67fc0 100644
--- a/docs/mllib-migration-guides.md
+++ b/docs/mllib-migration-guides.md
@@ -1,7 +1,7 @@
 ---
 layout: global
-title: Old Migration Guides - MLlib
-displayTitle: <a href="mllib-guide.html">MLlib</a> - Old Migration Guides
+title: Old Migration Guides - spark.mllib
+displayTitle: Old Migration Guides - spark.mllib
 description: MLlib migration guides from before Spark SPARK_VERSION_SHORT
 ---
 
diff --git a/docs/mllib-naive-bayes.md b/docs/mllib-naive-bayes.md
index 60ac6c7e5bb1a..d0d594af6a4ad 100644
--- a/docs/mllib-naive-bayes.md
+++ b/docs/mllib-naive-bayes.md
@@ -1,7 +1,7 @@
 ---
 layout: global
-title: Naive Bayes - MLlib
-displayTitle: <a href="mllib-guide.html">MLlib</a> - Naive Bayes
+title: Naive Bayes - spark.mllib
+displayTitle: Naive Bayes - spark.mllib
 ---
 
 [Naive Bayes](http://en.wikipedia.org/wiki/Naive_Bayes_classifier) is a simple
@@ -12,7 +12,7 @@ distribution of each feature given label, and then it applies Bayes' theorem to
 compute the conditional probability distribution of label given an observation
 and use it for prediction.
 
-MLlib supports [multinomial naive
+`spark.mllib` supports [multinomial naive
 Bayes](http://en.wikipedia.org/wiki/Naive_Bayes_classifier#Multinomial_naive_Bayes)
 and [Bernoulli naive Bayes](http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html).
 These models are typically used for [document classification](http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html).
diff --git a/docs/mllib-optimization.md b/docs/mllib-optimization.md
index ad7bcd9bfd407..f90b66f8e2c44 100644
--- a/docs/mllib-optimization.md
+++ b/docs/mllib-optimization.md
@@ -1,7 +1,7 @@
 ---
 layout: global
-title: Optimization - MLlib
-displayTitle: <a href="mllib-guide.html">MLlib</a> - Optimization
+title: Optimization - spark.mllib
+displayTitle: Optimization - spark.mllib
 ---
 
 * Table of contents
@@ -87,7 +87,7 @@ in the `$t$`-th iteration, with the input parameter `$s=$ stepSize`. Note that s
 step-size for SGD methods can often be delicate in practice and is a topic of active research.
 
 **Gradients.**
-A table of (sub)gradients of the machine learning methods implemented in MLlib, is available in
+A table of (sub)gradients of the machine learning methods implemented in `spark.mllib` is available in
 the <a href="mllib-classification-regression.html">classification and regression</a> section.
 
 
@@ -140,7 +140,7 @@ other first-order optimization.
 
 ### Choosing an Optimization Method
 
-[Linear methods](mllib-linear-methods.html) use optimization internally, and some linear methods in MLlib support both SGD and L-BFGS.
+[Linear methods](mllib-linear-methods.html) use optimization internally, and some linear methods in `spark.mllib` support both SGD and L-BFGS.
 Different optimization methods can have different convergence guarantees depending on the properties of the objective function, and we cannot cover the literature here.
 In general, when L-BFGS is available, we recommend using it instead of SGD since L-BFGS tends to converge faster (in fewer iterations).
 
diff --git a/docs/mllib-pmml-model-export.md b/docs/mllib-pmml-model-export.md
index 615287125c032..b532ad907dfc5 100644
--- a/docs/mllib-pmml-model-export.md
+++ b/docs/mllib-pmml-model-export.md
@@ -1,21 +1,21 @@
 ---
 layout: global
-title: PMML model export - MLlib
-displayTitle: <a href="mllib-guide.html">MLlib</a> - PMML model export
+title: PMML model export - spark.mllib
+displayTitle: PMML model export - spark.mllib
 ---
 
 * Table of contents
 {:toc}
 
-## MLlib supported models
+## `spark.mllib` supported models
 
-MLlib supports model export to Predictive Model Markup Language ([PMML](http://en.wikipedia.org/wiki/Predictive_Model_Markup_Language)).
+`spark.mllib` supports model export to Predictive Model Markup Language ([PMML](http://en.wikipedia.org/wiki/Predictive_Model_Markup_Language)).
 
-The table below outlines the MLlib models that can be exported to PMML and their equivalent PMML model.
+The table below outlines the `spark.mllib` models that can be exported to PMML and their equivalent PMML model.
 
 <table class="table">
   <thead>
-    <tr><th>MLlib model</th><th>PMML model</th></tr>
+    <tr><th><code>spark.mllib</code> model</th><th>PMML model</th></tr>
   </thead>
   <tbody>
     <tr>
diff --git a/docs/mllib-statistics.md b/docs/mllib-statistics.md
index de209f68e19ca..652d215fa8653 100644
--- a/docs/mllib-statistics.md
+++ b/docs/mllib-statistics.md
@@ -1,7 +1,7 @@
 ---
 layout: global
-title: Basic Statistics - MLlib
-displayTitle: <a href="mllib-guide.html">MLlib</a> - Basic Statistics 
+title: Basic Statistics - spark.mllib
+displayTitle: Basic Statistics - spark.mllib
 ---
 
 * Table of contents
@@ -112,7 +112,7 @@ print(summary.numNonzeros())
 
 ## Correlations
 
-Calculating the correlation between two series of data is a common operation in Statistics. In MLlib
+Calculating the correlation between two series of data is a common operation in Statistics. In `spark.mllib`
 we provide the flexibility to calculate pairwise correlations among many series. The supported 
 correlation methods are currently Pearson's and Spearman's correlation.
  
@@ -209,7 +209,7 @@ print(Statistics.corr(data, method="pearson"))
 
 ## Stratified sampling
 
-Unlike the other statistics functions, which reside in MLlib, stratified sampling methods,
+Unlike the other statistics functions, which reside in `spark.mllib`, stratified sampling methods,
 `sampleByKey` and `sampleByKeyExact`, can be performed on RDD's of key-value pairs. For stratified
 sampling, the keys can be thought of as a label and the value as a specific attribute. For example 
 the key can be man or woman, or document ids, and the respective values can be the list of ages 
@@ -294,12 +294,12 @@ approxSample = data.sampleByKey(False, fractions);
 ## Hypothesis testing
 
 Hypothesis testing is a powerful tool in statistics to determine whether a result is statistically 
-significant, whether this result occurred by chance or not. MLlib currently supports Pearson's 
+significant, whether this result occurred by chance or not. `spark.mllib` currently supports Pearson's 
 chi-squared ( $\chi^2$) tests for goodness of fit and independence. The input data types determine
 whether the goodness of fit or the independence test is conducted. The goodness of fit test requires 
 an input type of `Vector`, whereas the independence test requires a `Matrix` as input.
 
-MLlib also supports the input type `RDD[LabeledPoint]` to enable feature selection via chi-squared 
+`spark.mllib` also supports the input type `RDD[LabeledPoint]` to enable feature selection via chi-squared 
 independence tests.
 
 <div class="codetabs">
@@ -438,7 +438,7 @@ for i, result in enumerate(featureTestResults):
 
 </div>
 
-Additionally, MLlib provides a 1-sample, 2-sided implementation of the Kolmogorov-Smirnov (KS) test
+Additionally, `spark.mllib` provides a 1-sample, 2-sided implementation of the Kolmogorov-Smirnov (KS) test
 for equality of probability distributions. By providing the name of a theoretical distribution
 (currently solely supported for the normal distribution) and its parameters, or a function to 
 calculate the cumulative distribution according to a given theoretical distribution, the user can
@@ -522,7 +522,7 @@ print(testResult) # summary of the test including the p-value, test statistic,
 </div>
 
 ### Streaming Significance Testing
-MLlib provides online implementations of some tests to support use cases
+`spark.mllib` provides online implementations of some tests to support use cases
 like A/B testing. These tests may be performed on a Spark Streaming
 `DStream[(Boolean,Double)]` where the first element of each tuple
 indicates control group (`false`) or treatment group (`true`) and the
@@ -550,7 +550,7 @@ provides streaming hypothesis testing.
 ## Random data generation
 
 Random data generation is useful for randomized algorithms, prototyping, and performance testing.
-MLlib supports generating random RDDs with i.i.d. values drawn from a given distribution:
+`spark.mllib` supports generating random RDDs with i.i.d. values drawn from a given distribution:
 uniform, standard normal, or Poisson.
 
 <div class="codetabs">
diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 9f87accd30f40..3f9a831eddc88 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -1233,6 +1233,13 @@ infer the data types of the partitioning columns. For these use cases, the autom
 can be configured by `spark.sql.sources.partitionColumnTypeInference.enabled`, which is default to
 `true`. When type inference is disabled, string type will be used for the partitioning columns.
 
+Starting from Spark 1.6.0, partition discovery only finds partitions under the given paths
+by default. For the above example, if users pass `path/to/table/gender=male` to either 
+`SQLContext.read.parquet` or `SQLContext.read.load`, `gender` will not be considered as a
+partitioning column. If users need to specify the base path that partition discovery
+should start with, they can set `basePath` in the data source options. For example,
+when `path/to/table/gender=male` is the path of the data and
+users set `basePath` to `path/to/table/`, `gender` will be a partitioning column.
 
 ### Schema Merging
 
diff --git a/docs/streaming-kafka-integration.md b/docs/streaming-kafka-integration.md
index b00351b2fbcc0..5be73c42560f5 100644
--- a/docs/streaming-kafka-integration.md
+++ b/docs/streaming-kafka-integration.md
@@ -74,7 +74,7 @@ Next, we discuss how to use this approach in your streaming application.
 	[Maven repository](http://search.maven.org/#search|ga|1|a%3A%22spark-streaming-kafka-assembly_2.10%22%20AND%20v%3A%22{{site.SPARK_VERSION_SHORT}}%22) and add it to `spark-submit` with `--jars`.
 
 ## Approach 2: Direct Approach (No Receivers)
-This new receiver-less "direct" approach has been introduced in Spark 1.3 to ensure stronger end-to-end guarantees. Instead of using receivers to receive data, this approach periodically queries Kafka for the latest offsets in each topic+partition, and accordingly defines the offset ranges to process in each batch. When the jobs to process the data are launched, Kafka's simple consumer API is used to read the defined ranges of offsets from Kafka (similar to read files from a file system). Note that this is an experimental feature introduced in Spark 1.3 for the Scala and Java API. Spark 1.4 added a Python API, but it is not yet at full feature parity.
+This new receiver-less "direct" approach has been introduced in Spark 1.3 to ensure stronger end-to-end guarantees. Instead of using receivers to receive data, this approach periodically queries Kafka for the latest offsets in each topic+partition, and accordingly defines the offset ranges to process in each batch. When the jobs to process the data are launched, Kafka's simple consumer API is used to read the defined ranges of offsets from Kafka (similar to reading files from a file system). Note that this is an experimental feature introduced in Spark 1.3 for the Scala and Java API, and in Spark 1.4 for the Python API.
 
 This approach has the following advantages over the receiver-based approach (i.e. Approach 1).
 
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java
new file mode 100644
index 0000000000000..9698cac504371
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.Binarizer;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+// $example off$
+
+/** Runs a Binarizer (threshold 0.5) over a small DataFrame and prints the binarized column. */
+public class JavaBinarizerExample {
+  public static void main(String[] args) {
+    SparkConf conf = new SparkConf().setAppName("JavaBinarizerExample");
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+    SQLContext jsql = new SQLContext(jsc);
+    // Row values are doubles so that they match the DoubleType fields of the schema below.
+    // $example on$
+    JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+      RowFactory.create(0.0, 0.1),
+      RowFactory.create(1.0, 0.8),
+      RowFactory.create(2.0, 0.2)
+    ));
+    StructType schema = new StructType(new StructField[]{
+      new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
+      new StructField("feature", DataTypes.DoubleType, false, Metadata.empty())
+    });
+    DataFrame continuousDataFrame = jsql.createDataFrame(jrdd, schema);
+    Binarizer binarizer = new Binarizer()
+      .setInputCol("feature")
+      .setOutputCol("binarized_feature")
+      .setThreshold(0.5);
+    DataFrame binarizedDataFrame = binarizer.transform(continuousDataFrame);
+    DataFrame binarizedFeatures = binarizedDataFrame.select("binarized_feature");
+    for (Row r : binarizedFeatures.collect()) {
+      System.out.println(r.getDouble(0));
+    }
+    // $example off$
+    jsc.stop();
+  }
+}
\ No newline at end of file
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java
new file mode 100644
index 0000000000000..8ad369cc93e8a
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.Bucketizer;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+// $example off$
+
+/** Buckets a one-column DataFrame of doubles with a Bucketizer and shows the result. */
+public class JavaBucketizerExample {
+  public static void main(String[] args) {
+    JavaSparkContext sc =
+      new JavaSparkContext(new SparkConf().setAppName("JavaBucketizerExample"));
+    SQLContext sqlCtx = new SQLContext(sc);
+
+    // $example on$
+    // Single non-nullable double column that the Bucketizer reads from.
+    StructField[] columns = {
+      new StructField("features", DataTypes.DoubleType, false, Metadata.empty())
+    };
+    JavaRDD<Row> rows = sc.parallelize(Arrays.asList(
+      RowFactory.create(-0.5),
+      RowFactory.create(-0.3),
+      RowFactory.create(0.0),
+      RowFactory.create(0.2)
+    ));
+    DataFrame input = sqlCtx.createDataFrame(rows, new StructType(columns));
+
+    // Split points handed to the Bucketizer; the two outermost ones are infinite.
+    double[] boundaries = {Double.NEGATIVE_INFINITY, -0.5, 0.0, 0.5, Double.POSITIVE_INFINITY};
+    Bucketizer bucketizer = new Bucketizer();
+    bucketizer.setInputCol("features");
+    bucketizer.setOutputCol("bucketedFeatures");
+    bucketizer.setSplits(boundaries);
+    // Map each input value to its bucket index and print the resulting DataFrame.
+    bucketizer.transform(input).show();
+    // $example off$
+    sc.stop();
+  }
+}
+
+
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java
new file mode 100644
index 0000000000000..35c0d534a45e9
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.DCT;
+import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+// $example off$
+
+/** Applies a forward (non-inverse) DCT to three dense vectors and shows the output column. */
+public class JavaDCTExample {
+  public static void main(String[] args) {
+    JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("JavaDCTExample"));
+    SQLContext sqlCtx = new SQLContext(sc);
+
+    // $example on$
+    // One non-nullable vector column named "features".
+    StructField featuresField =
+      new StructField("features", new VectorUDT(), false, Metadata.empty());
+    StructType vectorSchema = new StructType(new StructField[]{featuresField});
+    JavaRDD<Row> vectors = sc.parallelize(Arrays.asList(
+      RowFactory.create(Vectors.dense(0.0, 1.0, -2.0, 3.0)),
+      RowFactory.create(Vectors.dense(-1.0, 2.0, 4.0, -7.0)),
+      RowFactory.create(Vectors.dense(14.0, -2.0, -5.0, 1.0))
+    ));
+    DataFrame input = sqlCtx.createDataFrame(vectors, vectorSchema);
+
+    DCT forwardDct = new DCT()
+      .setInputCol("features").setOutputCol("featuresDCT").setInverse(false);
+    // Show at most three rows of the transformed column.
+    forwardDct.transform(input).select("featuresDCT").show(3);
+    // $example off$
+    sc.stop();
+  }
+}
+
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaElementwiseProductExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaElementwiseProductExample.java
new file mode 100644
index 0000000000000..2898accec61b0
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaElementwiseProductExample.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.ElementwiseProduct;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+// $example off$
+
+/** Runs an ElementwiseProduct transformer with a fixed scaling vector over two rows. */
+public class JavaElementwiseProductExample {
+  public static void main(String[] args) {
+    JavaSparkContext sc =
+      new JavaSparkContext(new SparkConf().setAppName("JavaElementwiseProductExample"));
+    SQLContext sqlCtx = new SQLContext(sc);
+
+    // $example on$
+    // Schema: a string id plus a vector column, both non-nullable.
+    StructType rowSchema = DataTypes.createStructType(Arrays.asList(
+      DataTypes.createStructField("id", DataTypes.StringType, false),
+      DataTypes.createStructField("vector", new VectorUDT(), false)
+    ));
+
+    // Dense input vectors are used here; sparse vectors work as well.
+    JavaRDD<Row> labelledVectors = sc.parallelize(Arrays.asList(
+      RowFactory.create("a", Vectors.dense(1.0, 2.0, 3.0)),
+      RowFactory.create("b", Vectors.dense(4.0, 5.0, 6.0))
+    ));
+    DataFrame input = sqlCtx.createDataFrame(labelledVectors, rowSchema);
+
+    // Scaling vector handed to the transformer.
+    Vector scalingVec = Vectors.dense(0.0, 1.0, 2.0);
+    ElementwiseProduct scaler = new ElementwiseProduct();
+    scaler.setScalingVec(scalingVec);
+    scaler.setInputCol("vector");
+    scaler.setOutputCol("transformedVector");
+
+    // Transform the whole DataFrame at once to create the new column, then print it.
+    scaler.transform(input).show();
+    // $example off$
+    sc.stop();
+  }
+}
\ No newline at end of file
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java
new file mode 100644
index 0000000000000..2d50ba7faa1a1
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import org.apache.spark.ml.feature.MinMaxScaler;
+import org.apache.spark.ml.feature.MinMaxScalerModel;
+import org.apache.spark.sql.DataFrame;
+// $example off$
+
+/** Fits a MinMaxScaler on the sample libsvm data and shows the rescaled DataFrame. */
+public class JavaMinMaxScalerExample {
+  public static void main(String[] args) {
+    SparkConf conf = new SparkConf().setAppName("JavaMinMaxScalerExample");
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+    SQLContext jsql = new SQLContext(jsc);
+
+    // $example on$
+    DataFrame dataFrame = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
+    MinMaxScaler scaler = new MinMaxScaler()
+      .setInputCol("features")
+      .setOutputCol("scaledFeatures");
+
+    // Compute summary statistics and generate MinMaxScalerModel
+    MinMaxScalerModel scalerModel = scaler.fit(dataFrame);
+    // Rescale each feature to range [min, max].
+    DataFrame scaledData = scalerModel.transform(dataFrame);
+    scaledData.show();
+    // $example off$
+    jsc.stop();
+  }
+}
\ No newline at end of file
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java
new file mode 100644
index 0000000000000..8fd75ed8b5f4e
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.NGram;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+// $example off$
+
+public class JavaNGramExample {
+  public static void main(String[] args) {
+    SparkConf conf = new SparkConf().setAppName("JavaNGramExample");
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+    SQLContext sqlContext = new SQLContext(jsc);
+
+    // $example on$
+    JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+      RowFactory.create(0.0, Arrays.asList("Hi", "I", "heard", "about", "Spark")),
+      RowFactory.create(1.0, Arrays.asList("I", "wish", "Java", "could", "use", "case", "classes")),
+      RowFactory.create(2.0, Arrays.asList("Logistic", "regression", "models", "are", "neat"))
+    ));
+
+    StructType schema = new StructType(new StructField[]{
+      new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
+      new StructField(
+        "words", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty())
+    });
+
+    DataFrame wordDataFrame = sqlContext.createDataFrame(jrdd, schema);
+
+    NGram ngramTransformer = new NGram().setInputCol("words").setOutputCol("ngrams");
+
+    DataFrame ngramDataFrame = ngramTransformer.transform(wordDataFrame);
+
+    for (Row r : ngramDataFrame.select("ngrams", "label").take(3)) {
+      java.util.List<String> ngrams = r.getList(0);
+      for (String ngram : ngrams) System.out.print(ngram + " --- ");
+      System.out.println();
+    }
+    // $example off$
+    jsc.stop();
+  }
+}
\ No newline at end of file
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaNormalizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaNormalizerExample.java
new file mode 100644
index 0000000000000..ed3f6163c0558
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaNormalizerExample.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import org.apache.spark.ml.feature.Normalizer;
+import org.apache.spark.sql.DataFrame;
+// $example off$
+
+/** Normalizes libsvm feature vectors with Normalizer, first with p = 1, then p = infinity. */
+public class JavaNormalizerExample {
+  public static void main(String[] args) {
+    JavaSparkContext sc =
+      new JavaSparkContext(new SparkConf().setAppName("JavaNormalizerExample"));
+    SQLContext sqlCtx = new SQLContext(sc);
+
+    // $example on$
+    String inputPath = "data/mllib/sample_libsvm_data.txt";
+    DataFrame samples = sqlCtx.read().format("libsvm").load(inputPath);
+
+    // First pass: the normalizer itself is configured with p = 1 ($L^1$ norm).
+    Normalizer normalizer = new Normalizer();
+    normalizer.setInputCol("features");
+    normalizer.setOutputCol("normFeatures");
+    normalizer.setP(1.0);
+    normalizer.transform(samples).show();
+
+    // Second pass: p = positive infinity ($L^\infty$ norm) is passed as an extra parameter.
+    DataFrame lInfNormData = normalizer.transform(
+      samples, normalizer.p().w(Double.POSITIVE_INFINITY));
+    lInfNormData.show();
+    // $example off$
+    sc.stop();
+  }
+}
\ No newline at end of file
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java
new file mode 100644
index 0000000000000..bc509607084b1
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.OneHotEncoder;
+import org.apache.spark.ml.feature.StringIndexer;
+import org.apache.spark.ml.feature.StringIndexerModel;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+// $example off$
+
+/** Indexes a string category column, one-hot encodes the index, and shows id and vector. */
+public class JavaOneHotEncoderExample {
+  public static void main(String[] args) {
+    SparkConf conf = new SparkConf().setAppName("JavaOneHotEncoderExample");
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+    SQLContext sqlContext = new SQLContext(jsc);
+
+    // $example on$
+    JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+      RowFactory.create(0, "a"),
+      RowFactory.create(1, "b"),
+      RowFactory.create(2, "c"),
+      RowFactory.create(3, "a"),
+      RowFactory.create(4, "a"),
+      RowFactory.create(5, "c")
+    ));
+    // The ids above are ints, so "id" is declared as IntegerType to match the row values.
+    StructType schema = new StructType(new StructField[]{
+      new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
+      new StructField("category", DataTypes.StringType, false, Metadata.empty())
+    });
+    DataFrame df = sqlContext.createDataFrame(jrdd, schema);
+
+    StringIndexerModel indexer = new StringIndexer()
+      .setInputCol("category")
+      .setOutputCol("categoryIndex")
+      .fit(df);
+    DataFrame indexed = indexer.transform(df);
+
+    OneHotEncoder encoder = new OneHotEncoder()
+      .setInputCol("categoryIndex")
+      .setOutputCol("categoryVec");
+    DataFrame encoded = encoder.transform(indexed);
+    encoded.select("id", "categoryVec").show();
+    // $example off$
+    jsc.stop();
+  }
+}
+
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java
new file mode 100644
index 0000000000000..8282fab084f36
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.PCA;
+import org.apache.spark.ml.feature.PCAModel;
+import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+// $example off$
+
+/** Fits a PCA model with k = 3 on three vectors of size 5 and shows the output column. */
+public class JavaPCAExample {
+  public static void main(String[] args) {
+    JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("JavaPCAExample"));
+    SQLContext sqlCtx = new SQLContext(sc);
+
+    // $example on$
+    // One sparse and two dense vectors, all of size 5.
+    JavaRDD<Row> vectors = sc.parallelize(Arrays.asList(
+      RowFactory.create(Vectors.sparse(5, new int[]{1, 3}, new double[]{1.0, 7.0})),
+      RowFactory.create(Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0)),
+      RowFactory.create(Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0))
+    ));
+
+    StructField featuresField =
+      new StructField("features", new VectorUDT(), false, Metadata.empty());
+    StructType vectorSchema = new StructType(new StructField[]{featuresField});
+    DataFrame input = sqlCtx.createDataFrame(vectors, vectorSchema);
+
+    // Configure the estimator first, then fit it on the input to obtain the model.
+    PCA estimator = new PCA()
+      .setInputCol("features").setOutputCol("pcaFeatures").setK(3);
+    PCAModel model = estimator.fit(input);
+
+    // Keep only the model's output column and print it.
+    DataFrame projected = model.transform(input).select("pcaFeatures");
+    projected.show();
+    // $example off$
+    sc.stop();
+  }
+}
+
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java
new file mode 100644
index 0000000000000..668f71e64056b
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.PolynomialExpansion;
+import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+// $example off$
+
+/** Expands three 2-dimensional vectors to degree 3 with PolynomialExpansion and prints them. */
+public class JavaPolynomialExpansionExample {
+  public static void main(String[] args) {
+    JavaSparkContext sc =
+      new JavaSparkContext(new SparkConf().setAppName("JavaPolynomialExpansionExample"));
+    SQLContext sqlCtx = new SQLContext(sc);
+
+    // $example on$
+    // Input: a single non-nullable vector column named "features".
+    StructField featuresField =
+      new StructField("features", new VectorUDT(), false, Metadata.empty());
+    JavaRDD<Row> vectors = sc.parallelize(Arrays.asList(
+      RowFactory.create(Vectors.dense(-2.0, 2.3)),
+      RowFactory.create(Vectors.dense(0.0, 0.0)),
+      RowFactory.create(Vectors.dense(0.6, -1.1))
+    ));
+    DataFrame input =
+      sqlCtx.createDataFrame(vectors, new StructType(new StructField[]{featuresField}));
+
+    PolynomialExpansion expander = new PolynomialExpansion();
+    expander.setInputCol("features");
+    expander.setOutputCol("polyFeatures");
+    expander.setDegree(3);
+    DataFrame expanded = expander.transform(input);
+
+    // Print the expanded vector of each of the (at most) three rows taken.
+    for (Row expandedRow : expanded.select("polyFeatures").take(3)) {
+      System.out.println(expandedRow.get(0));
+    }
+    // $example off$
+    sc.stop();
+  }
+}
\ No newline at end of file
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java
new file mode 100644
index 0000000000000..1e1062b541ad9
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.RFormula;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+import static org.apache.spark.sql.types.DataTypes.*;
+// $example off$
+
+public class JavaRFormulaExample {
+  public static void main(String[] args) {
+    SparkConf conf = new SparkConf().setAppName("JavaRFormulaExample");
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+    SQLContext sqlContext = new SQLContext(jsc);
+
+    // $example on$
+    StructType schema = createStructType(new StructField[]{
+      createStructField("id", IntegerType, false),
+      createStructField("country", StringType, false),
+      createStructField("hour", IntegerType, false),
+      createStructField("clicked", DoubleType, false)
+    });
+
+    JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(
+      RowFactory.create(7, "US", 18, 1.0),
+      RowFactory.create(8, "CA", 12, 0.0),
+      RowFactory.create(9, "NZ", 15, 0.0)
+    ));
+
+    DataFrame dataset = sqlContext.createDataFrame(rdd, schema);
+    RFormula formula = new RFormula()
+      .setFormula("clicked ~ country + hour")
+      .setFeaturesCol("features")
+      .setLabelCol("label");
+    DataFrame output = formula.fit(dataset).transform(dataset);
+    output.select("features", "label").show();
+    // $example off$
+    jsc.stop();
+  }
+}
+
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaStandardScalerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaStandardScalerExample.java
new file mode 100644
index 0000000000000..da4756643f3c4
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaStandardScalerExample.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import org.apache.spark.ml.feature.StandardScaler;
+import org.apache.spark.ml.feature.StandardScalerModel;
+import org.apache.spark.sql.DataFrame;
+// $example off$
+
+/** Fits a StandardScaler (std on, mean off) on libsvm data and shows the scaled result. */
+public class JavaStandardScalerExample {
+  public static void main(String[] args) {
+    JavaSparkContext sc =
+      new JavaSparkContext(new SparkConf().setAppName("JavaStandardScalerExample"));
+    SQLContext sqlCtx = new SQLContext(sc);
+
+    // $example on$
+    String inputPath = "data/mllib/sample_libsvm_data.txt";
+    DataFrame samples = sqlCtx.read().format("libsvm").load(inputPath);
+
+    // Scaling by standard deviation is switched on; mean-centering is switched off.
+    StandardScaler scaler = new StandardScaler()
+      .setInputCol("features").setOutputCol("scaledFeatures")
+      .setWithStd(true).setWithMean(false);
+
+    // Fitting the StandardScaler computes the summary statistics and yields the model.
+    StandardScalerModel fitted = scaler.fit(samples);
+    // Normalize each feature to have unit standard deviation, then print the result.
+    DataFrame scaled = fitted.transform(samples);
+    scaled.show();
+    // $example off$
+    sc.stop();
+  }
+}
\ No newline at end of file
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java
new file mode 100644
index 0000000000000..b6b201c6b68d2
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.StopWordsRemover;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+// $example off$
+
+/** Example: run StopWordsRemover over an array-of-strings column and show the result. */
+public class JavaStopWordsRemoverExample {
+  public static void main(String[] args) {
+    SparkConf conf = new SparkConf().setAppName("JavaStopWordsRemoverExample");
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+    SQLContext jsql = new SQLContext(jsc);
+
+    // $example on$
+    StopWordsRemover remover = new StopWordsRemover()
+      .setInputCol("raw")
+      .setOutputCol("filtered");
+
+    JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(
+      RowFactory.create(Arrays.asList("I", "saw", "the", "red", "balloon")),
+      RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb"))
+    ));
+
+    StructType schema = new StructType(new StructField[]{
+      new StructField(
+        "raw", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty())
+    });
+
+    DataFrame dataset = jsql.createDataFrame(rdd, schema);
+    remover.transform(dataset).show();
+    // $example off$
+    jsc.stop();
+  }
+}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java
new file mode 100644
index 0000000000000..05d12c1e702f1
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.StringIndexer;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+import static org.apache.spark.sql.types.DataTypes.*;
+// $example off$
+
+/** Example: fit a StringIndexer on a string column and show the indexed DataFrame. */
+public class JavaStringIndexerExample {
+  public static void main(String[] args) {
+    SparkConf conf = new SparkConf().setAppName("JavaStringIndexerExample");
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+    SQLContext sqlContext = new SQLContext(jsc);
+
+    // $example on$
+    JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+      RowFactory.create(0, "a"),
+      RowFactory.create(1, "b"),
+      RowFactory.create(2, "c"),
+      RowFactory.create(3, "a"),
+      RowFactory.create(4, "a"),
+      RowFactory.create(5, "c")
+    ));
+    StructType schema = new StructType(new StructField[]{
+      createStructField("id", IntegerType, false),
+      createStructField("category", StringType, false)
+    });
+    DataFrame df = sqlContext.createDataFrame(jrdd, schema);
+    StringIndexer indexer = new StringIndexer()
+      .setInputCol("category").setOutputCol("categoryIndex");
+    DataFrame indexed = indexer.fit(df).transform(df);
+    indexed.show();
+    // $example off$
+    jsc.stop();
+  }
+}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java
new file mode 100644
index 0000000000000..617dc3f66e3bf
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.RegexTokenizer;
+import org.apache.spark.ml.feature.Tokenizer;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+// $example off$
+
+/** Example: split sentences into words with Tokenizer and with RegexTokenizer. */
+public class JavaTokenizerExample {
+  public static void main(String[] args) {
+    SparkConf conf = new SparkConf().setAppName("JavaTokenizerExample");
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+    SQLContext sqlContext = new SQLContext(jsc);
+
+    // $example on$
+    JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+      RowFactory.create(0, "Hi I heard about Spark"),
+      RowFactory.create(1, "I wish Java could use case classes"),
+      RowFactory.create(2, "Logistic,regression,models,are,neat")
+    ));
+
+    StructType schema = new StructType(new StructField[]{
+      new StructField("label", DataTypes.IntegerType, false, Metadata.empty()),
+      new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
+    });
+    DataFrame sentenceDataFrame = sqlContext.createDataFrame(jrdd, schema);
+
+    Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
+    DataFrame wordsDataFrame = tokenizer.transform(sentenceDataFrame);
+    for (Row r : wordsDataFrame.select("words", "label").take(3)) {
+      java.util.List<String> words = r.getList(0);
+      for (String word : words) System.out.print(word + " ");
+      System.out.println();
+    }
+
+    RegexTokenizer regexTokenizer = new RegexTokenizer()
+      .setInputCol("sentence")
+      .setOutputCol("words")
+      .setPattern("\\W");  // alternatively .setPattern("\\w+").setGaps(false);
+    regexTokenizer.transform(sentenceDataFrame).select("sentence", "words").show();
+    // $example off$
+    jsc.stop();
+  }
+}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java
new file mode 100644
index 0000000000000..7e230b5897c1e
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.VectorAssembler;
+import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.*;
+
+import static org.apache.spark.sql.types.DataTypes.*;
+// $example off$
+
+/** Example: combine numeric columns and a vector column into one "features" vector. */
+public class JavaVectorAssemblerExample {
+  public static void main(String[] args) {
+    JavaSparkContext jsc =
+      new JavaSparkContext(new SparkConf().setAppName("JavaVectorAssemblerExample"));
+    SQLContext sqlContext = new SQLContext(jsc);
+
+    // $example on$
+    Row sample = RowFactory.create(0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0);
+    StructType rowSchema = createStructType(new StructField[]{
+      createStructField("id", IntegerType, false),
+      createStructField("hour", IntegerType, false),
+      createStructField("mobile", DoubleType, false),
+      createStructField("userFeatures", new VectorUDT(), false),
+      createStructField("clicked", DoubleType, false)
+    });
+    DataFrame input =
+      sqlContext.createDataFrame(jsc.parallelize(Arrays.asList(sample)), rowSchema);
+
+    VectorAssembler featureAssembler = new VectorAssembler()
+      .setOutputCol("features")
+      .setInputCols(new String[]{"hour", "mobile", "userFeatures"});
+
+    System.out.println(featureAssembler.transform(input).select("features", "clicked").first());
+    // $example off$
+    jsc.stop();
+  }
+}
+
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorIndexerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorIndexerExample.java
new file mode 100644
index 0000000000000..545758e31d972
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorIndexerExample.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Map;
+
+import org.apache.spark.ml.feature.VectorIndexer;
+import org.apache.spark.ml.feature.VectorIndexerModel;
+import org.apache.spark.sql.DataFrame;
+// $example off$
+
+/** Example: fit a VectorIndexer, list the features it chose, and show the indexed data. */
+public class JavaVectorIndexerExample {
+  public static void main(String[] args) {
+    SparkConf conf = new SparkConf().setAppName("JavaVectorIndexerExample");
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+    SQLContext jsql = new SQLContext(jsc);
+
+    // $example on$
+    DataFrame data = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
+
+    VectorIndexer indexer = new VectorIndexer()
+      .setInputCol("features")
+      .setOutputCol("indexed")
+      .setMaxCategories(10);
+    VectorIndexerModel indexerModel = indexer.fit(data);
+
+    Map<Integer, Map<Double, Integer>> categoryMaps = indexerModel.javaCategoryMaps();
+    System.out.print("Chose " + categoryMaps.size() + " categorical features:");
+    for (Integer feature : categoryMaps.keySet()) {
+      System.out.print(" " + feature);
+    }
+    System.out.println();
+
+    // Create new column "indexed" with categorical values transformed to indices
+    DataFrame indexedData = indexerModel.transform(data);
+    indexedData.show();
+    // $example off$
+    jsc.stop();
+  }
+}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java
new file mode 100644
index 0000000000000..4d5cb04ff5e2b
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import com.google.common.collect.Lists;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.attribute.Attribute;
+import org.apache.spark.ml.attribute.AttributeGroup;
+import org.apache.spark.ml.attribute.NumericAttribute;
+import org.apache.spark.ml.feature.VectorSlicer;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.*;
+// $example off$
+
+/** Example: select entries of a named-attribute vector column with VectorSlicer. */
+public class JavaVectorSlicerExample {
+  public static void main(String[] args) {
+    SparkConf sparkConf = new SparkConf().setAppName("JavaVectorSlicerExample");
+    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
+    SQLContext sqlContext = new SQLContext(jsc);
+
+    // $example on$
+    AttributeGroup featureGroup = new AttributeGroup("userFeatures", new Attribute[]{
+      NumericAttribute.defaultAttr().withName("f1"),
+      NumericAttribute.defaultAttr().withName("f2"),
+      NumericAttribute.defaultAttr().withName("f3")
+    });
+    StructType schema = new StructType().add(featureGroup.toStructField());
+
+    JavaRDD<Row> rows = jsc.parallelize(Lists.newArrayList(
+      RowFactory.create(Vectors.sparse(3, new int[]{0, 1}, new double[]{-2.0, 2.3})),
+      RowFactory.create(Vectors.dense(-2.0, 2.3, 0.0))
+    ));
+    DataFrame input = sqlContext.createDataFrame(rows, schema);
+
+    // or: slicer.setIndices(new int[]{1, 2}), or slicer.setNames(new String[]{"f2", "f3"})
+    VectorSlicer slicer = new VectorSlicer()
+      .setInputCol("userFeatures")
+      .setOutputCol("features")
+      .setIndices(new int[]{1})
+      .setNames(new String[]{"f3"});
+
+    DataFrame sliced = slicer.transform(input);
+    System.out.println(sliced.select("userFeatures", "features").first());
+    // $example off$
+    jsc.stop();
+  }
+}
+
diff --git a/examples/src/main/java/org/apache/spark/examples/streaming/JavaStatefulNetworkWordCount.java b/examples/src/main/java/org/apache/spark/examples/streaming/JavaStatefulNetworkWordCount.java
index c400e4237abe3..14997c64d505e 100644
--- a/examples/src/main/java/org/apache/spark/examples/streaming/JavaStatefulNetworkWordCount.java
+++ b/examples/src/main/java/org/apache/spark/examples/streaming/JavaStatefulNetworkWordCount.java
@@ -65,7 +65,7 @@ public static void main(String[] args) {
     JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(1));
     ssc.checkpoint(".");
 
-    // Initial RDD input to trackStateByKey
+    // Initial state RDD input to mapWithState
     @SuppressWarnings("unchecked")
     List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<String, Integer>("hello", 1),
             new Tuple2<String, Integer>("world", 1));
@@ -90,21 +90,21 @@ public Tuple2<String, Integer> call(String s) {
         });
 
     // Update the cumulative count function
-    final Function4<Time, String, Optional<Integer>, State<Integer>, Optional<Tuple2<String, Integer>>> trackStateFunc =
-        new Function4<Time, String, Optional<Integer>, State<Integer>, Optional<Tuple2<String, Integer>>>() {
+    final Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>> mappingFunc =
+        new Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>>() {
 
           @Override
-          public Optional<Tuple2<String, Integer>> call(Time time, String word, Optional<Integer> one, State<Integer> state) {
+          public Tuple2<String, Integer> call(String word, Optional<Integer> one, State<Integer> state) {
             int sum = one.or(0) + (state.exists() ? state.get() : 0);
             Tuple2<String, Integer> output = new Tuple2<String, Integer>(word, sum);
             state.update(sum);
-            return Optional.of(output);
+            return output;
           }
         };
 
-    // This will give a Dstream made of state (which is the cumulative count of the words)
-    JavaTrackStateDStream<String, Integer, Integer, Tuple2<String, Integer>> stateDstream =
-        wordsDstream.trackStateByKey(StateSpec.function(trackStateFunc).initialState(initialRDD));
+    // DStream of cumulative counts that get updated in every batch
+    JavaMapWithStateDStream<String, Integer, Integer, Tuple2<String, Integer>> stateDstream =
+        wordsDstream.mapWithState(StateSpec.function(mappingFunc).initialState(initialRDD));
 
     stateDstream.print();
     ssc.start();
diff --git a/examples/src/main/python/ml/binarizer_example.py b/examples/src/main/python/ml/binarizer_example.py
new file mode 100644
index 0000000000000..317cfa638a5a9
--- /dev/null
+++ b/examples/src/main/python/ml/binarizer_example.py
@@ -0,0 +1,43 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import Binarizer
+# $example off$
+
+if __name__ == "__main__":
+    sc = SparkContext(appName="BinarizerExample")
+    sqlContext = SQLContext(sc)
+
+    # $example on$
+    continuousDataFrame = sqlContext.createDataFrame([
+        (0, 0.1),
+        (1, 0.8),
+        (2, 0.2)
+    ], ["label", "feature"])
+    binarizer = Binarizer(threshold=0.5, inputCol="feature", outputCol="binarized_feature")
+    binarizedDataFrame = binarizer.transform(continuousDataFrame)
+    binarizedFeatures = binarizedDataFrame.select("binarized_feature")
+    for binarized_feature, in binarizedFeatures.collect():
+        print(binarized_feature)
+    # $example off$
+
+    sc.stop()
diff --git a/examples/src/main/python/ml/bucketizer_example.py b/examples/src/main/python/ml/bucketizer_example.py
new file mode 100644
index 0000000000000..4304255f350db
--- /dev/null
+++ b/examples/src/main/python/ml/bucketizer_example.py
@@ -0,0 +1,43 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import Bucketizer
+# $example off$
+
+if __name__ == "__main__":
+    sc = SparkContext(appName="BucketizerExample")
+    sqlContext = SQLContext(sc)
+
+    # $example on$
+    splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]
+
+    data = [(-0.5,), (-0.3,), (0.0,), (0.2,)]
+    dataFrame = sqlContext.createDataFrame(data, ["features"])
+
+    bucketizer = Bucketizer(splits=splits, inputCol="features", outputCol="bucketedFeatures")
+
+    # Transform original data into its bucket index.
+    bucketedData = bucketizer.transform(dataFrame)
+    bucketedData.show()
+    # $example off$
+
+    sc.stop()
diff --git a/examples/src/main/python/ml/elementwise_product_example.py b/examples/src/main/python/ml/elementwise_product_example.py
new file mode 100644
index 0000000000000..c85cb0d89543c
--- /dev/null
+++ b/examples/src/main/python/ml/elementwise_product_example.py
@@ -0,0 +1,39 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import ElementwiseProduct
+from pyspark.mllib.linalg import Vectors
+# $example off$
+
+if __name__ == "__main__":
+    sc = SparkContext(appName="ElementwiseProductExample")
+    sqlContext = SQLContext(sc)
+
+    # $example on$
+    data = [(Vectors.dense([1.0, 2.0, 3.0]),), (Vectors.dense([4.0, 5.0, 6.0]),)]
+    df = sqlContext.createDataFrame(data, ["vector"])
+    transformer = ElementwiseProduct(scalingVec=Vectors.dense([0.0, 1.0, 2.0]),
+                                     inputCol="vector", outputCol="transformedVector")
+    transformer.transform(df).show()
+    # $example off$
+
+    sc.stop()
diff --git a/examples/src/main/python/ml/n_gram_example.py b/examples/src/main/python/ml/n_gram_example.py
new file mode 100644
index 0000000000000..f2d85f53e7219
--- /dev/null
+++ b/examples/src/main/python/ml/n_gram_example.py
@@ -0,0 +1,42 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import NGram
+# $example off$
+
+if __name__ == "__main__":
+    sc = SparkContext(appName="NGramExample")
+    sqlContext = SQLContext(sc)
+
+    # $example on$
+    wordDataFrame = sqlContext.createDataFrame([
+        (0, ["Hi", "I", "heard", "about", "Spark"]),
+        (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
+        (2, ["Logistic", "regression", "models", "are", "neat"])
+    ], ["label", "words"])
+    ngram = NGram(inputCol="words", outputCol="ngrams")
+    ngramDataFrame = ngram.transform(wordDataFrame)
+    for ngrams_label in ngramDataFrame.select("ngrams", "label").take(3):
+        print(ngrams_label)
+    # $example off$
+
+    sc.stop()
diff --git a/examples/src/main/python/ml/normalizer_example.py b/examples/src/main/python/ml/normalizer_example.py
new file mode 100644
index 0000000000000..d490221474c24
--- /dev/null
+++ b/examples/src/main/python/ml/normalizer_example.py
@@ -0,0 +1,43 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import Normalizer
+# $example off$
+
+if __name__ == "__main__":
+    sc = SparkContext(appName="NormalizerExample")
+    sqlContext = SQLContext(sc)
+
+    # $example on$
+    dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+
+    # Normalize each Vector using $L^1$ norm.
+    normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
+    l1NormData = normalizer.transform(dataFrame)
+    l1NormData.show()
+
+    # Normalize each Vector using $L^\infty$ norm.
+    lInfNormData = normalizer.transform(dataFrame, {normalizer.p: float("inf")})
+    lInfNormData.show()
+    # $example off$
+
+    sc.stop()
diff --git a/examples/src/main/python/ml/onehot_encoder_example.py b/examples/src/main/python/ml/onehot_encoder_example.py
new file mode 100644
index 0000000000000..0f94c26638d35
--- /dev/null
+++ b/examples/src/main/python/ml/onehot_encoder_example.py
@@ -0,0 +1,48 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import OneHotEncoder, StringIndexer
+# $example off$
+
+if __name__ == "__main__":
+    sc = SparkContext(appName="OneHotEncoderExample")
+    sqlContext = SQLContext(sc)
+
+    # $example on$
+    df = sqlContext.createDataFrame([
+        (0, "a"),
+        (1, "b"),
+        (2, "c"),
+        (3, "a"),
+        (4, "a"),
+        (5, "c")
+    ], ["id", "category"])
+
+    stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
+    model = stringIndexer.fit(df)
+    indexed = model.transform(df)
+    encoder = OneHotEncoder(dropLast=False, inputCol="categoryIndex", outputCol="categoryVec")
+    encoded = encoder.transform(indexed)
+    encoded.select("id", "categoryVec").show()
+    # $example off$
+
+    sc.stop()
diff --git a/examples/src/main/python/ml/pca_example.py b/examples/src/main/python/ml/pca_example.py
new file mode 100644
index 0000000000000..a17181f1b8a51
--- /dev/null
+++ b/examples/src/main/python/ml/pca_example.py
@@ -0,0 +1,42 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import PCA
+from pyspark.mllib.linalg import Vectors
+# $example off$
+
+if __name__ == "__main__":
+    sc = SparkContext(appName="PCAExample")
+    sqlContext = SQLContext(sc)
+
+    # $example on$
+    data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
+            (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
+            (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]
+    df = sqlContext.createDataFrame(data, ["features"])
+    pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
+    model = pca.fit(df)
+    result = model.transform(df).select("pcaFeatures")
+    result.show(truncate=False)
+    # $example off$
+
+    sc.stop()
diff --git a/examples/src/main/python/ml/polynomial_expansion_example.py b/examples/src/main/python/ml/polynomial_expansion_example.py
new file mode 100644
index 0000000000000..3d4fafd1a42e9
--- /dev/null
+++ b/examples/src/main/python/ml/polynomial_expansion_example.py
@@ -0,0 +1,43 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import PolynomialExpansion
+from pyspark.mllib.linalg import Vectors
+# $example off$
+
+if __name__ == "__main__":
+    sc = SparkContext(appName="PolynomialExpansionExample")
+    sqlContext = SQLContext(sc)
+
+    # $example on$
+    df = sqlContext\
+        .createDataFrame([(Vectors.dense([-2.0, 2.3]), ),
+                          (Vectors.dense([0.0, 0.0]), ),
+                          (Vectors.dense([0.6, -1.1]), )],
+                         ["features"])
+    px = PolynomialExpansion(degree=2, inputCol="features", outputCol="polyFeatures")
+    polyDF = px.transform(df)
+    for expanded in polyDF.select("polyFeatures").take(3):
+        print(expanded)
+    # $example off$
+
+    sc.stop()
diff --git a/examples/src/main/python/ml/rformula_example.py b/examples/src/main/python/ml/rformula_example.py
new file mode 100644
index 0000000000000..b544a14700762
--- /dev/null
+++ b/examples/src/main/python/ml/rformula_example.py
@@ -0,0 +1,44 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import RFormula
+# $example off$
+
+if __name__ == "__main__":
+    sc = SparkContext(appName="RFormulaExample")
+    sqlContext = SQLContext(sc)
+
+    # $example on$
+    dataset = sqlContext.createDataFrame(
+        [(7, "US", 18, 1.0),
+         (8, "CA", 12, 0.0),
+         (9, "NZ", 15, 0.0)],
+        ["id", "country", "hour", "clicked"])
+    formula = RFormula(
+        formula="clicked ~ country + hour",
+        featuresCol="features",
+        labelCol="label")
+    output = formula.fit(dataset).transform(dataset)
+    output.select("features", "label").show()
+    # $example off$
+
+    sc.stop()
diff --git a/examples/src/main/python/ml/standard_scaler_example.py b/examples/src/main/python/ml/standard_scaler_example.py
new file mode 100644
index 0000000000000..ae7aa85005bcd
--- /dev/null
+++ b/examples/src/main/python/ml/standard_scaler_example.py
@@ -0,0 +1,43 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import StandardScaler
+# $example off$
+
+if __name__ == "__main__":
+    sc = SparkContext(appName="StandardScalerExample")
+    sqlContext = SQLContext(sc)
+
+    # $example on$
+    dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+    scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
+                            withStd=True, withMean=False)
+
+    # Compute summary statistics by fitting the StandardScaler
+    scalerModel = scaler.fit(dataFrame)
+
+    # Normalize each feature to have unit standard deviation.
+    scaledData = scalerModel.transform(dataFrame)
+    scaledData.show()
+    # $example off$
+
+    sc.stop()
diff --git a/examples/src/main/python/ml/stopwords_remover_example.py b/examples/src/main/python/ml/stopwords_remover_example.py
new file mode 100644
index 0000000000000..01f94af8ca752
--- /dev/null
+++ b/examples/src/main/python/ml/stopwords_remover_example.py
@@ -0,0 +1,40 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import StopWordsRemover
+# $example off$
+
+if __name__ == "__main__":
+    sc = SparkContext(appName="StopWordsRemoverExample")
+    sqlContext = SQLContext(sc)
+
+    # $example on$
+    sentenceData = sqlContext.createDataFrame([
+        (0, ["I", "saw", "the", "red", "baloon"]),
+        (1, ["Mary", "had", "a", "little", "lamb"])
+    ], ["label", "raw"])
+
+    remover = StopWordsRemover(inputCol="raw", outputCol="filtered")
+    remover.transform(sentenceData).show(truncate=False)
+    # $example off$
+
+    sc.stop()
diff --git a/examples/src/main/python/ml/string_indexer_example.py b/examples/src/main/python/ml/string_indexer_example.py
new file mode 100644
index 0000000000000..58a8cb5d56b73
--- /dev/null
+++ b/examples/src/main/python/ml/string_indexer_example.py
@@ -0,0 +1,39 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import StringIndexer
+# $example off$
+
+if __name__ == "__main__":
+    sc = SparkContext(appName="StringIndexerExample")
+    sqlContext = SQLContext(sc)
+
+    # $example on$
+    df = sqlContext.createDataFrame(
+        [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
+        ["id", "category"])
+    indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
+    indexed = indexer.fit(df).transform(df)
+    indexed.show()
+    # $example off$
+
+    sc.stop()
diff --git a/examples/src/main/python/ml/tokenizer_example.py b/examples/src/main/python/ml/tokenizer_example.py
new file mode 100644
index 0000000000000..ce9b225be5357
--- /dev/null
+++ b/examples/src/main/python/ml/tokenizer_example.py
@@ -0,0 +1,44 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import Tokenizer, RegexTokenizer
+# $example off$
+
+if __name__ == "__main__":
+    sc = SparkContext(appName="TokenizerExample")
+    sqlContext = SQLContext(sc)
+
+    # $example on$
+    sentenceDataFrame = sqlContext.createDataFrame([
+        (0, "Hi I heard about Spark"),
+        (1, "I wish Java could use case classes"),
+        (2, "Logistic,regression,models,are,neat")
+    ], ["label", "sentence"])
+    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
+    wordsDataFrame = tokenizer.transform(sentenceDataFrame)
+    for words_label in wordsDataFrame.select("words", "label").take(3):
+        print(words_label)
+    regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
+    # alternatively, pattern="\\w+", gaps(False)
+    # $example off$
+
+    sc.stop()
diff --git a/examples/src/main/python/ml/vector_assembler_example.py b/examples/src/main/python/ml/vector_assembler_example.py
new file mode 100644
index 0000000000000..04f64839f188d
--- /dev/null
+++ b/examples/src/main/python/ml/vector_assembler_example.py
@@ -0,0 +1,42 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.mllib.linalg import Vectors
+from pyspark.ml.feature import VectorAssembler
+# $example off$
+
+if __name__ == "__main__":
+    sc = SparkContext(appName="VectorAssemblerExample")
+    sqlContext = SQLContext(sc)
+
+    # $example on$
+    dataset = sqlContext.createDataFrame(
+        [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)],
+        ["id", "hour", "mobile", "userFeatures", "clicked"])
+    assembler = VectorAssembler(
+        inputCols=["hour", "mobile", "userFeatures"],
+        outputCol="features")
+    output = assembler.transform(dataset)
+    print(output.select("features", "clicked").first())
+    # $example off$
+
+    sc.stop()
diff --git a/examples/src/main/python/ml/vector_indexer_example.py b/examples/src/main/python/ml/vector_indexer_example.py
new file mode 100644
index 0000000000000..146f41c1dd903
--- /dev/null
+++ b/examples/src/main/python/ml/vector_indexer_example.py
@@ -0,0 +1,40 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import VectorIndexer
+# $example off$
+
+if __name__ == "__main__":
+    sc = SparkContext(appName="VectorIndexerExample")
+    sqlContext = SQLContext(sc)
+
+    # $example on$
+    data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+    indexer = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=10)
+    indexerModel = indexer.fit(data)
+
+    # Create new column "indexed" with categorical values transformed to indices
+    indexedData = indexerModel.transform(data)
+    indexedData.show()
+    # $example off$
+
+    sc.stop()
diff --git a/examples/src/main/r/dataframe.R b/examples/src/main/r/dataframe.R
index 53b817144f6ac..62f60e57eebe6 100644
--- a/examples/src/main/r/dataframe.R
+++ b/examples/src/main/r/dataframe.R
@@ -35,7 +35,7 @@ printSchema(df)
 
 # Create a DataFrame from a JSON file
 path <- file.path(Sys.getenv("SPARK_HOME"), "examples/src/main/resources/people.json")
-peopleDF <- jsonFile(sqlContext, path)
+peopleDF <- read.json(sqlContext, path)
 printSchema(peopleDF)
 
 # Register this DataFrame as a table.
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala
new file mode 100644
index 0000000000000..e724aa587294b
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.Binarizer
+// $example off$
+import org.apache.spark.sql.{DataFrame, SQLContext}
+import org.apache.spark.{SparkConf, SparkContext}
+
+/** Thresholds the "feature" column at 0.5 and prints the binarized values. */
+object BinarizerExample {
+  def main(args: Array[String]): Unit = {
+    val sc = new SparkContext(new SparkConf().setAppName("BinarizerExample"))
+    val sqlContext = new SQLContext(sc)
+    // $example on$
+    val labeledValues = Array((0, 0.1), (1, 0.8), (2, 0.2))
+    val input: DataFrame = sqlContext.createDataFrame(labeledValues).toDF("label", "feature")
+
+    val thresholder: Binarizer = new Binarizer()
+      .setThreshold(0.5)
+      .setInputCol("feature")
+      .setOutputCol("binarized_feature")
+
+    // Keep only the output column and print each collected row.
+    val binarized: DataFrame = thresholder.transform(input).select("binarized_feature")
+    binarized.collect().foreach(println)
+    // $example off$
+    sc.stop()
+  }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala
new file mode 100644
index 0000000000000..7c75e3d72b47b
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.Bucketizer
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+/** Buckets a column of doubles using fixed split points and shows the result. */
+object BucketizerExample {
+  def main(args: Array[String]): Unit = {
+    val sc = new SparkContext(new SparkConf().setAppName("BucketizerExample"))
+    val sqlContext = new SQLContext(sc)
+
+    // $example on$
+    // Split points; the infinite end points leave the outer buckets unbounded.
+    val boundaries = Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity)
+    val values = Array(-0.5, -0.3, 0.0, 0.2)
+
+    // Wrap every value in a Tuple1 so it becomes a one-column row.
+    val rows = values.map(v => Tuple1(v))
+    val input = sqlContext.createDataFrame(rows).toDF("features")
+
+    val bucketer = new Bucketizer()
+      .setSplits(boundaries)
+      .setInputCol("features")
+      .setOutputCol("bucketedFeatures")
+    bucketer.transform(input).show()
+    // $example off$
+    sc.stop()
+  }
+}
+// scalastyle:on println
+
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala
new file mode 100644
index 0000000000000..314c2c28a2a10
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.DCT
+import org.apache.spark.mllib.linalg.Vectors
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+/** Applies a forward DCT to three dense vectors and shows the transformed column. */
+object DCTExample {
+  def main(args: Array[String]): Unit = {
+    val sc = new SparkContext(new SparkConf().setAppName("DCTExample"))
+    val sqlContext = new SQLContext(sc)
+
+    // $example on$
+    val signals = Seq(
+      Vectors.dense(0.0, 1.0, -2.0, 3.0),
+      Vectors.dense(-1.0, 2.0, 4.0, -7.0),
+      Vectors.dense(14.0, -2.0, -5.0, 1.0))
+
+    // Wrap every vector in a Tuple1 so it becomes a one-column row.
+    val rows = signals.map(v => Tuple1(v))
+    val input = sqlContext.createDataFrame(rows).toDF("features")
+    // setInverse(false) selects the forward transform.
+    val transformer = new DCT()
+      .setInverse(false)
+      .setInputCol("features")
+      .setOutputCol("featuresDCT")
+    transformer.transform(input).select("featuresDCT").show(3)
+    // $example off$
+    sc.stop()
+  }
+}
+// scalastyle:on println
+
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/ElementWiseProductExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/ElementWiseProductExample.scala
new file mode 100644
index 0000000000000..872de51dc75df
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/ElementWiseProductExample.scala
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.ElementwiseProduct
+import org.apache.spark.mllib.linalg.Vectors
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+/** Multiplies each input vector element-wise by a fixed scaling vector. */
+object ElementwiseProductExample {
+  def main(args: Array[String]): Unit = {
+    val sc = new SparkContext(new SparkConf().setAppName("ElementwiseProductExample"))
+    val sqlContext = new SQLContext(sc)
+
+    // $example on$
+    // Two dense sample vectors keyed by id; sparse vectors are accepted as well.
+    val samples = Seq(
+      ("a", Vectors.dense(1.0, 2.0, 3.0)),
+      ("b", Vectors.dense(4.0, 5.0, 6.0)))
+    val input = sqlContext.createDataFrame(samples).toDF("id", "vector")
+
+    // Scaling vector applied to every row of the "vector" column.
+    val weights = Vectors.dense(0.0, 1.0, 2.0)
+    val scaler = new ElementwiseProduct()
+      .setInputCol("vector")
+      .setOutputCol("transformedVector")
+      .setScalingVec(weights)
+    scaler.transform(input).show()
+    // $example off$
+    sc.stop()
+  }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala
new file mode 100644
index 0000000000000..fb7f28c9886bb
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.MinMaxScaler
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+/** Fits a MinMaxScaler on the sample libsvm data and shows the rescaled features. */
+object MinMaxScalerExample {
+  def main(args: Array[String]): Unit = {
+    val sc = new SparkContext(new SparkConf().setAppName("MinMaxScalerExample"))
+    val sqlContext = new SQLContext(sc)
+
+    // $example on$
+    val samples = sqlContext.read
+      .format("libsvm")
+      .load("data/mllib/sample_libsvm_data.txt")
+
+    val minMax = new MinMaxScaler()
+      .setOutputCol("scaledFeatures")
+      .setInputCol("features")
+
+    // Fitting gathers the summary statistics and yields a MinMaxScalerModel;
+    // the model then rescales each feature to the range [min, max].
+    val minMaxModel = minMax.fit(samples)
+    minMaxModel.transform(samples).show()
+    // $example off$
+    sc.stop()
+  }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala
new file mode 100644
index 0000000000000..8a85f71b56f3d
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.NGram
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+/** Builds n-grams from tokenized sentences and prints them as lists. */
+object NGramExample {
+  def main(args: Array[String]): Unit = {
+    val sc = new SparkContext(new SparkConf().setAppName("NGramExample"))
+    val sqlContext = new SQLContext(sc)
+    // $example on$
+    val wordDataFrame = sqlContext.createDataFrame(Seq(
+      (0, Array("Hi", "I", "heard", "about", "Spark")),
+      (1, Array("I", "wish", "Java", "could", "use", "case", "classes")),
+      (2, Array("Logistic", "regression", "models", "are", "neat"))
+    )).toDF("label", "words")
+
+    val ngram = new NGram().setInputCol("words").setOutputCol("ngrams")
+    val ngramDataFrame = ngram.transform(wordDataFrame)
+    // Array columns come back from a Row as Seq, so read "ngrams" as Seq[String].
+    ngramDataFrame.take(3).map(_.getAs[Seq[String]]("ngrams").toList).foreach(println)
+    // $example off$
+    sc.stop()
+  }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala
new file mode 100644
index 0000000000000..1990b55e8c5e8
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.Normalizer
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+/** Normalizes the sample feature vectors with the L^1 norm and then the L^inf norm. */
+object NormalizerExample {
+  def main(args: Array[String]): Unit = {
+    val sc = new SparkContext(new SparkConf().setAppName("NormalizerExample"))
+    val sqlContext = new SQLContext(sc)
+
+    // $example on$
+    val samples = sqlContext.read
+      .format("libsvm")
+      .load("data/mllib/sample_libsvm_data.txt")
+
+    // Normalize each Vector using $L^1$ norm (p = 1.0).
+    val l1Normalizer = new Normalizer()
+      .setP(1.0)
+      .setInputCol("features")
+      .setOutputCol("normFeatures")
+    l1Normalizer.transform(samples).show()
+
+    // Normalize each Vector using $L^\infty$ norm by passing p as an extra parameter.
+    val infinityNorm = l1Normalizer.p -> Double.PositiveInfinity
+    l1Normalizer.transform(samples, infinityNorm).show()
+    // $example off$
+    sc.stop()
+  }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala
new file mode 100644
index 0000000000000..66602e2118506
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer}
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object OneHotEncoderExample { // Demo: index a string column, then one-hot encode the indices.
+  def main(args: Array[String]): Unit = {
+    val conf = new SparkConf().setAppName("OneHotEncoderExample")
+    val sc = new SparkContext(conf)
+    val sqlContext = new SQLContext(sc)
+
+    // $example on$
+    val df = sqlContext.createDataFrame(Seq(
+      (0, "a"),
+      (1, "b"),
+      (2, "c"),
+      (3, "a"),
+      (4, "a"),
+      (5, "c")
+    )).toDF("id", "category")
+
+    val indexer = new StringIndexer()
+      .setInputCol("category")
+      .setOutputCol("categoryIndex")
+      .fit(df) // fitting yields the model that is applied on the next line
+    val indexed = indexer.transform(df)
+
+    val encoder = new OneHotEncoder()
+      .setInputCol("categoryIndex") // the index column produced by the StringIndexer model
+      .setOutputCol("categoryVec")
+    val encoded = encoder.transform(indexed)
+    encoded.select("id", "categoryVec").show()
+    // $example off$
+    sc.stop()
+  }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala
new file mode 100644
index 0000000000000..4c806f71a32c3
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.PCA
+import org.apache.spark.mllib.linalg.Vectors
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object PCAExample { // Demo: reduce 5-dimensional vectors to 3 principal components.
+  def main(args: Array[String]): Unit = {
+    val conf = new SparkConf().setAppName("PCAExample")
+    val sc = new SparkContext(conf)
+    val sqlContext = new SQLContext(sc)
+
+    // $example on$
+    val data = Array(
+      Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
+      Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
+      Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
+    )
+    val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
+    val pca = new PCA()
+      .setInputCol("features")
+      .setOutputCol("pcaFeatures")
+      .setK(3) // number of principal components to keep
+      .fit(df)
+    val pcaDF = pca.transform(df)
+    val result = pcaDF.select("pcaFeatures")
+    result.show()
+    // $example off$
+    sc.stop()
+  }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala
new file mode 100644
index 0000000000000..39fb79af35766
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.PolynomialExpansion
+import org.apache.spark.mllib.linalg.Vectors
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object PolynomialExpansionExample { // Demo: expand vectors into degree-3 polynomial features.
+  def main(args: Array[String]): Unit = {
+    val conf = new SparkConf().setAppName("PolynomialExpansionExample")
+    val sc = new SparkContext(conf)
+    val sqlContext = new SQLContext(sc)
+
+    // $example on$
+    val data = Array(
+      Vectors.dense(-2.0, 2.3),
+      Vectors.dense(0.0, 0.0),
+      Vectors.dense(0.6, -1.1)
+    )
+    val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
+    val polynomialExpansion = new PolynomialExpansion()
+      .setInputCol("features")
+      .setOutputCol("polyFeatures")
+      .setDegree(3)
+    val polyDF = polynomialExpansion.transform(df)
+    polyDF.select("polyFeatures").take(3).foreach(println) // the data has three rows
+    // $example off$
+    sc.stop()
+  }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/RFormulaExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/RFormulaExample.scala
new file mode 100644
index 0000000000000..286866edea502
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/RFormulaExample.scala
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.RFormula
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object RFormulaExample { // Demo: derive features and label columns from an R model formula.
+  def main(args: Array[String]): Unit = {
+    val conf = new SparkConf().setAppName("RFormulaExample")
+    val sc = new SparkContext(conf)
+    val sqlContext = new SQLContext(sc)
+
+    // $example on$
+    val dataset = sqlContext.createDataFrame(Seq(
+      (7, "US", 18, 1.0),
+      (8, "CA", 12, 0.0),
+      (9, "NZ", 15, 0.0)
+    )).toDF("id", "country", "hour", "clicked")
+    val formula = new RFormula()
+      .setFormula("clicked ~ country + hour") // label ~ terms used as features
+      .setFeaturesCol("features")
+      .setLabelCol("label")
+    val output = formula.fit(dataset).transform(dataset)
+    output.select("features", "label").show()
+    // $example off$
+    sc.stop()
+  }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/StandardScalerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/StandardScalerExample.scala
new file mode 100644
index 0000000000000..e0a41e383a7ea
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/StandardScalerExample.scala
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.StandardScaler
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object StandardScalerExample { // Demo: scale features to unit standard deviation.
+  def main(args: Array[String]): Unit = {
+    val conf = new SparkConf().setAppName("StandardScalerExample")
+    val sc = new SparkContext(conf)
+    val sqlContext = new SQLContext(sc)
+
+    // $example on$
+    val dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+
+    val scaler = new StandardScaler()
+      .setInputCol("features")
+      .setOutputCol("scaledFeatures")
+      .setWithStd(true) // scale to unit standard deviation
+      .setWithMean(false) // leave the mean as is (no centering)
+
+    // Compute summary statistics by fitting the StandardScaler.
+    val scalerModel = scaler.fit(dataFrame)
+
+    // Normalize each feature to have unit standard deviation.
+    val scaledData = scalerModel.transform(dataFrame)
+    scaledData.show()
+    // $example off$
+    sc.stop()
+  }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala
new file mode 100644
index 0000000000000..655ffce08d3ab
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.StopWordsRemover
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object StopWordsRemoverExample { // Demo: drop stop words from tokenized sentences.
+  def main(args: Array[String]): Unit = {
+    val conf = new SparkConf().setAppName("StopWordsRemoverExample")
+    val sc = new SparkContext(conf)
+    val sqlContext = new SQLContext(sc)
+
+    // $example on$
+    val remover = new StopWordsRemover()
+      .setInputCol("raw")
+      .setOutputCol("filtered")
+
+    val dataSet = sqlContext.createDataFrame(Seq(
+      (0, Seq("I", "saw", "the", "red", "balloon")),
+      (1, Seq("Mary", "had", "a", "little", "lamb"))
+    )).toDF("id", "raw")
+
+    remover.transform(dataSet).show() // prints id, raw and the new filtered column
+    // $example off$
+    sc.stop()
+  }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala
new file mode 100644
index 0000000000000..9fa494cd2473b
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.StringIndexer
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object StringIndexerExample { // Demo: encode a string column as numeric category indices.
+  def main(args: Array[String]): Unit = {
+    val conf = new SparkConf().setAppName("StringIndexerExample")
+    val sc = new SparkContext(conf)
+    val sqlContext = new SQLContext(sc)
+
+    // $example on$
+    val df = sqlContext.createDataFrame(
+      Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c"))
+    ).toDF("id", "category")
+
+    val indexer = new StringIndexer()
+      .setInputCol("category")
+      .setOutputCol("categoryIndex")
+
+    val indexed = indexer.fit(df).transform(df) // fit on df, then index the same df
+    indexed.show()
+    // $example off$
+    sc.stop()
+  }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala
new file mode 100644
index 0000000000000..01e0d1388a2f4
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.{RegexTokenizer, Tokenizer}
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object TokenizerExample { // Demo: split sentences into words with both tokenizers.
+  def main(args: Array[String]): Unit = {
+    val conf = new SparkConf().setAppName("TokenizerExample")
+    val sc = new SparkContext(conf)
+    val sqlContext = new SQLContext(sc)
+
+    // $example on$
+    val sentenceDataFrame = sqlContext.createDataFrame(Seq(
+      (0, "Hi I heard about Spark"),
+      (1, "I wish Java could use case classes"),
+      (2, "Logistic,regression,models,are,neat") // comma-separated: exercises the \W pattern
+    )).toDF("label", "sentence")
+
+    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
+    val regexTokenizer = new RegexTokenizer()
+      .setInputCol("sentence")
+      .setOutputCol("words")
+      .setPattern("\\W") // alternatively .setPattern("\\w+").setGaps(false)
+
+    val tokenized = tokenizer.transform(sentenceDataFrame)
+    tokenized.select("words", "label").take(3).foreach(println)
+    val regexTokenized = regexTokenizer.transform(sentenceDataFrame)
+    regexTokenized.select("words", "label").take(3).foreach(println)
+    // $example off$
+    sc.stop()
+  }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala
new file mode 100644
index 0000000000000..d527924419f81
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.VectorAssembler
+import org.apache.spark.mllib.linalg.Vectors
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object VectorAssemblerExample { // Demo: merge several columns into one feature vector.
+  def main(args: Array[String]): Unit = {
+    val conf = new SparkConf().setAppName("VectorAssemblerExample")
+    val sc = new SparkContext(conf)
+    val sqlContext = new SQLContext(sc)
+
+    // $example on$
+    val dataset = sqlContext.createDataFrame(
+      Seq((0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0))
+    ).toDF("id", "hour", "mobile", "userFeatures", "clicked")
+
+    val assembler = new VectorAssembler()
+      .setInputCols(Array("hour", "mobile", "userFeatures")) // two scalars and a vector
+      .setOutputCol("features")
+
+    val output = assembler.transform(dataset)
+    println(output.select("features", "clicked").first())
+    // $example off$
+    sc.stop()
+  }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/VectorIndexerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/VectorIndexerExample.scala
new file mode 100644
index 0000000000000..685891c164e70
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/VectorIndexerExample.scala
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.VectorIndexer
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object VectorIndexerExample { // Demo: find and index categorical features in vectors.
+  def main(args: Array[String]): Unit = {
+    val conf = new SparkConf().setAppName("VectorIndexerExample")
+    val sc = new SparkContext(conf)
+    val sqlContext = new SQLContext(sc)
+
+    // $example on$
+    val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+
+    val indexer = new VectorIndexer()
+      .setInputCol("features")
+      .setOutputCol("indexed")
+      .setMaxCategories(10) // features with <= 10 distinct values count as categorical
+
+    val indexerModel = indexer.fit(data)
+
+    val categoricalFeatures: Set[Int] = indexerModel.categoryMaps.keys.toSet
+    println(s"Chose ${categoricalFeatures.size} categorical features: " +
+      categoricalFeatures.mkString(", "))
+
+    // Create new column "indexed" with categorical values transformed to indices
+    val indexedData = indexerModel.transform(data)
+    indexedData.show()
+    // $example off$
+    sc.stop()
+  }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala
new file mode 100644
index 0000000000000..04f19829eff87
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
+import org.apache.spark.ml.feature.VectorSlicer
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.types.StructType
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object VectorSlicerExample { // Demo: slice a feature vector by index and by attribute name.
+  def main(args: Array[String]): Unit = {
+    val conf = new SparkConf().setAppName("VectorSlicerExample")
+    val sc = new SparkContext(conf)
+    val sqlContext = new SQLContext(sc)
+
+    // $example on$
+    val data = Array(Row(Vectors.dense(-2.0, 2.3, 0.0)))
+
+    val defaultAttr = NumericAttribute.defaultAttr
+    val attrs = Array("f1", "f2", "f3").map(defaultAttr.withName)
+    val attrGroup = new AttributeGroup("userFeatures", attrs.asInstanceOf[Array[Attribute]])
+
+    val dataRDD = sc.parallelize(data)
+    val dataset = sqlContext.createDataFrame(dataRDD, StructType(Array(attrGroup.toStructField())))
+
+    val slicer = new VectorSlicer().setInputCol("userFeatures").setOutputCol("features")
+
+    slicer.setIndices(Array(1)).setNames(Array("f3")) // keep index 1 and the feature named f3
+    // or slicer.setIndices(Array(1, 2)), or slicer.setNames(Array("f2", "f3"))
+
+    val output = slicer.transform(dataset)
+    println(output.select("userFeatures", "features").first())
+    // $example off$
+    sc.stop()
+  }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/StatefulNetworkWordCount.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/StatefulNetworkWordCount.scala
index a4f847f118b2c..2dce1820d9734 100644
--- a/examples/src/main/scala/org/apache/spark/examples/streaming/StatefulNetworkWordCount.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/streaming/StatefulNetworkWordCount.scala
@@ -49,7 +49,7 @@ object StatefulNetworkWordCount {
     val ssc = new StreamingContext(sparkConf, Seconds(1))
     ssc.checkpoint(".")
 
-    // Initial RDD input to trackStateByKey
+    // Initial state RDD for mapWithState operation
     val initialRDD = ssc.sparkContext.parallelize(List(("hello", 1), ("world", 1)))
 
     // Create a ReceiverInputDStream on target ip:port and count the
@@ -58,17 +58,17 @@ object StatefulNetworkWordCount {
     val words = lines.flatMap(_.split(" "))
     val wordDstream = words.map(x => (x, 1))
 
-    // Update the cumulative count using updateStateByKey
+    // Update the cumulative count using mapWithState
     // This will give a DStream made of state (which is the cumulative count of the words)
-    val trackStateFunc = (batchTime: Time, word: String, one: Option[Int], state: State[Int]) => {
+    val mappingFunc = (word: String, one: Option[Int], state: State[Int]) => {
       val sum = one.getOrElse(0) + state.getOption.getOrElse(0)
       val output = (word, sum)
       state.update(sum)
-      Some(output)
+      output
     }
 
-    val stateDstream = wordDstream.trackStateByKey(
-      StateSpec.function(trackStateFunc).initialState(initialRDD))
+    val stateDstream = wordDstream.mapWithState(
+      StateSpec.function(mappingFunc).initialState(initialRDD))
     stateDstream.print()
     ssc.start()
     ssc.awaitTermination()
diff --git a/extras/java8-tests/src/test/java/org/apache/spark/streaming/Java8APISuite.java b/extras/java8-tests/src/test/java/org/apache/spark/streaming/Java8APISuite.java
index 4eee97bc89613..89e0c7fdf7eec 100644
--- a/extras/java8-tests/src/test/java/org/apache/spark/streaming/Java8APISuite.java
+++ b/extras/java8-tests/src/test/java/org/apache/spark/streaming/Java8APISuite.java
@@ -32,12 +32,10 @@
 import org.apache.spark.HashPartitioner;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.function.Function2;
-import org.apache.spark.api.java.function.Function4;
 import org.apache.spark.api.java.function.PairFunction;
 import org.apache.spark.streaming.api.java.JavaDStream;
 import org.apache.spark.streaming.api.java.JavaPairDStream;
-import org.apache.spark.streaming.api.java.JavaTrackStateDStream;
+import org.apache.spark.streaming.api.java.JavaMapWithStateDStream;
 
 /**
  * Most of these tests replicate org.apache.spark.streaming.JavaAPISuite using java 8
@@ -863,12 +861,12 @@ public void testFlatMapValues() {
   /**
    * This test is only for testing the APIs. It's not necessary to run it.
    */
-  public void testTrackStateByAPI() {
+  public void testMapWithStateAPI() {
     JavaPairRDD<String, Boolean> initialRDD = null;
     JavaPairDStream<String, Integer> wordsDstream = null;
 
-    JavaTrackStateDStream<String, Integer, Boolean, Double> stateDstream =
-        wordsDstream.trackStateByKey(
+    JavaMapWithStateDStream<String, Integer, Boolean, Double> stateDstream =
+        wordsDstream.mapWithState(
             StateSpec.<String, Integer, Boolean, Double> function((time, key, value, state) -> {
               // Use all State's methods here
               state.exists();
@@ -884,9 +882,9 @@ StateSpec.<String, Integer, Boolean, Double> function((time, key, value, state)
 
     JavaPairDStream<String, Boolean> emittedRecords = stateDstream.stateSnapshots();
 
-    JavaTrackStateDStream<String, Integer, Boolean, Double> stateDstream2 =
-        wordsDstream.trackStateByKey(
-            StateSpec.<String, Integer, Boolean, Double>function((value, state) -> {
+    JavaMapWithStateDStream<String, Integer, Boolean, Double> stateDstream2 =
+        wordsDstream.mapWithState(
+            StateSpec.<String, Integer, Boolean, Double>function((key, value, state) -> {
               state.exists();
               state.get();
               state.isTimingOut();
@@ -898,6 +896,6 @@ StateSpec.<String, Integer, Boolean, Double> function((time, key, value, state)
                 .partitioner(new HashPartitioner(10))
                 .timeout(Durations.seconds(10)));
 
-    JavaPairDStream<String, Boolean> emittedRecords2 = stateDstream2.stateSnapshots();
+    JavaPairDStream<String, Boolean> mappedDStream = stateDstream2.stateSnapshots();
   }
 }
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/package-info.java b/mllib/src/main/scala/org/apache/spark/ml/feature/package-info.java
index c22d2e0cd2d90..7a35f2d448f9d 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/package-info.java
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/package-info.java
@@ -23,7 +23,7 @@
  * features into more suitable forms for model fitting.
  * Most feature transformers are implemented as {@link org.apache.spark.ml.Transformer}s, which
  * transforms one {@link org.apache.spark.sql.DataFrame} into another, e.g.,
- * {@link org.apache.spark.feature.HashingTF}.
+ * {@link org.apache.spark.ml.feature.HashingTF}.
  * Some feature transformers are implemented as {@link org.apache.spark.ml.Estimator}}s, because the
  * transformation requires some aggregated information of the dataset, e.g., document
  * frequencies in {@link org.apache.spark.ml.feature.IDF}.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 1db91666f21ab..5e5850963edc9 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -529,7 +529,7 @@ class LinearRegressionSummary private[regression] (
     val predictionCol: String,
     val labelCol: String,
     val model: LinearRegressionModel,
-    val diagInvAtWA: Array[Double]) extends Serializable {
+    private val diagInvAtWA: Array[Double]) extends Serializable {
 
   @transient private val metrics = new RegressionMetrics(
     predictions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala
index 82adfa6ffd596..54bf5102cc565 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala
@@ -407,7 +407,7 @@ private object BisectingKMeans extends Serializable {
  */
 @Since("1.6.0")
 @Experimental
-class ClusteringTreeNode private[clustering] (
+private[clustering] class ClusteringTreeNode private[clustering] (
     val index: Int,
     val size: Long,
     private val centerWithNorm: VectorWithNorm,
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala
index f942e5613ffaf..9ccf96b9395b7 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala
@@ -32,8 +32,8 @@ import org.apache.spark.rdd.RDD
  */
 @Since("1.6.0")
 @Experimental
-class BisectingKMeansModel @Since("1.6.0") (
-    @Since("1.6.0") val root: ClusteringTreeNode
+class BisectingKMeansModel private[clustering] (
+    private[clustering] val root: ClusteringTreeNode
   ) extends Serializable with Logging {
 
   /**
diff --git a/pom.xml b/pom.xml
index da50fcbb57eb8..4a86c65925168 100644
--- a/pom.xml
+++ b/pom.xml
@@ -830,8 +830,36 @@
             <artifactId>guava</artifactId>
             <groupId>com.google.guava</groupId>
           </exclusion>
+          <exclusion>
+            <groupId>org.apache.httpcomponents</groupId>
+            <artifactId>httpclient</artifactId>
+          </exclusion>
+          <exclusion>
+            <groupId>org.apache.httpcomponents</groupId>
+            <artifactId>httpcore</artifactId>
+          </exclusion>
+          <exclusion>
+            <groupId>commons-logging</groupId>
+            <artifactId>httpclient</artifactId>
+          </exclusion>
+          <exclusion>
+            <groupId>commons-logging</groupId>
+            <artifactId>commons-logging</artifactId>
+          </exclusion>
         </exclusions>
       </dependency>
+      <dependency>
+        <groupId>mysql</groupId>
+        <artifactId>mysql-connector-java</artifactId>
+        <version>5.1.34</version>
+        <scope>test</scope>
+      </dependency>
+      <dependency>
+        <groupId>org.postgresql</groupId>
+        <artifactId>postgresql</artifactId>
+        <version>9.3-1102-jdbc41</version>
+        <scope>test</scope>
+      </dependency>
       <dependency>
         <groupId>org.apache.curator</groupId>
         <artifactId>curator-recipes</artifactId>
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 746bb55e14f22..78ab475eb466b 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -213,7 +213,7 @@ def explain(self, extended=False):
 
         >>> df.explain()
         == Physical Plan ==
-        Scan PhysicalRDD[age#0,name#1]
+        Scan ExistingRDD[age#0,name#1]
 
         >>> df.explain(True)
         == Parsed Logical Plan ==
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala
index 0a9e74c96829c..dffb41aea0e62 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala
@@ -1030,21 +1030,27 @@ case class ScalaUDF(
     // such as IntegerType, its javaType is `int` and the returned type of user-defined
     // function is Object. Trying to convert an Object to `int` will cause casting exception.
     val evalCode = evals.map(_.code).mkString
-    val funcArguments = converterTerms.zip(evals).map {
-      case (converter, eval) => s"$converter.apply(${eval.value})"
-    }.mkString(",")
-    val callFunc = s"${ctx.boxedType(ctx.javaType(dataType))} $resultTerm = " +
-      s"(${ctx.boxedType(ctx.javaType(dataType))})${catalystConverterTerm}" +
-        s".apply($funcTerm.apply($funcArguments));"
+    val (converters, funcArguments) = converterTerms.zipWithIndex.map { case (converter, i) =>
+      val eval = evals(i)
+      val argTerm = ctx.freshName("arg")
+      val convert = s"Object $argTerm = ${eval.isNull} ? null : $converter.apply(${eval.value});"
+      (convert, argTerm)
+    }.unzip
 
-    evalCode + s"""
-      ${ctx.javaType(dataType)} ${ev.value} = ${ctx.defaultValue(dataType)};
-      Boolean ${ev.isNull};
+    val callFunc = s"${ctx.boxedType(dataType)} $resultTerm = " +
+      s"(${ctx.boxedType(dataType)})${catalystConverterTerm}" +
+        s".apply($funcTerm.apply(${funcArguments.mkString(", ")}));"
 
+    s"""
+      $evalCode
+      ${converters.mkString("\n")}
       $callFunc
 
-      ${ev.value} = $resultTerm;
-      ${ev.isNull} = $resultTerm == null;
+      boolean ${ev.isNull} = $resultTerm == null;
+      ${ctx.javaType(dataType)} ${ev.value} = ${ctx.defaultValue(dataType)};
+      if (!${ev.isNull}) {
+        ${ev.value} = $resultTerm;
+      }
     """
   }
 
diff --git a/sql/core/pom.xml b/sql/core/pom.xml
index 3b7a9fea054d5..2397a3f440730 100644
--- a/sql/core/pom.xml
+++ b/sql/core/pom.xml
@@ -91,13 +91,11 @@
     <dependency>
       <groupId>mysql</groupId>
       <artifactId>mysql-connector-java</artifactId>
-      <version>5.1.34</version>
       <scope>test</scope>
     </dependency>
     <dependency>
       <groupId>org.postgresql</groupId>
       <artifactId>postgresql</artifactId>
-      <version>9.3-1102-jdbc41</version>
       <scope>test</scope>
     </dependency>
     <dependency>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
index d641fcac1c8ac..297ef2299cb36 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
@@ -84,7 +84,7 @@ class TypedColumn[-T, U](
  *   col("`a.column.with.dots`") // Escape `.` in column names.
  *   $"columnName"               // Scala short hand for a named column.
  *   expr("a + 1")               // A column that is constructed from a parsed SQL Expression.
- *   lit("1")                    // A column that produces a literal (constant) value.
+ *   lit("abc")                  // A column that produces a literal (constant) value.
  * }}}
  *
  * [[Column]] objects can be composed to form complex expressions:
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index eb8700369275e..1acfe8419961e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -1413,6 +1413,19 @@ class DataFrame private[sql](
    */
   def first(): Row = head()
 
+  /**
+   * Concise syntax for chaining custom transformations.
+   * {{{
+   *   def featurize(ds: DataFrame) = ...
+   *
+   *   df
+   *     .transform(featurize)
+   *     .transform(...)
+   * }}}
+   * @since 1.6.0
+   */
+  def transform[U](t: DataFrame => DataFrame): DataFrame = t(this)
+
   /**
    * Returns a new RDD by applying a function to all rows of this DataFrame.
    * @group rdd
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala
index 623348f6768a4..b8a43025882e5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala
@@ -97,22 +97,31 @@ private[sql] case class LogicalRDD(
 private[sql] case class PhysicalRDD(
     output: Seq[Attribute],
     rdd: RDD[InternalRow],
-    extraInformation: String,
+    override val nodeName: String,
+    override val metadata: Map[String, String] = Map.empty,
     override val outputsUnsafeRows: Boolean = false)
   extends LeafNode {
 
   protected override def doExecute(): RDD[InternalRow] = rdd
 
-  override def simpleString: String = "Scan " + extraInformation + output.mkString("[", ",", "]")
+  override def simpleString: String = {
+    val metadataEntries = for ((key, value) <- metadata.toSeq.sorted) yield s"$key: $value"
+    s"Scan $nodeName${output.mkString("[", ",", "]")}${metadataEntries.mkString(" ", ", ", "")}"
+  }
 }
 
 private[sql] object PhysicalRDD {
+  // Metadata keys
+  val INPUT_PATHS = "InputPaths"
+  val PUSHED_FILTERS = "PushedFilters"
+
   def createFromDataSource(
       output: Seq[Attribute],
       rdd: RDD[InternalRow],
       relation: BaseRelation,
-      extraInformation: String = ""): PhysicalRDD = {
-    PhysicalRDD(output, rdd, relation.toString + extraInformation,
-      relation.isInstanceOf[HadoopFsRelation])
+      metadata: Map[String, String] = Map.empty): PhysicalRDD = {
+    // All HadoopFsRelations output UnsafeRows
+    val outputUnsafeRows = relation.isInstanceOf[HadoopFsRelation]
+    PhysicalRDD(output, rdd, relation.toString, metadata, outputUnsafeRows)
   }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
index 2bbe411a61707..583e5d93f5fdc 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
@@ -87,6 +87,11 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ
     super.withNewChildren(newChildren).withLogicalPlan(logicalPlan)
   }
 
+  /**
+   * Return all metadata that describes more details of this SparkPlan.
+   */
+  private[sql] def metadata: Map[String, String] = Map.empty
+
   /**
    * Return all metrics containing metrics of this SparkPlan.
    */
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
index f67c951bc0663..25e98c0bdd431 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
@@ -363,7 +363,7 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
           expressions, nPartitions.getOrElse(numPartitions)), planLater(child)) :: Nil
       case e @ EvaluatePython(udf, child, _) =>
         BatchPythonEvaluation(udf, e.output, planLater(child)) :: Nil
-      case LogicalRDD(output, rdd) => PhysicalRDD(output, rdd, "PhysicalRDD") :: Nil
+      case LogicalRDD(output, rdd) => PhysicalRDD(output, rdd, "ExistingRDD") :: Nil
       case BroadcastHint(child) => apply(child)
       case _ => Nil
     }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala
index 20359c1e540e2..c0d00104e8bfd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala
@@ -332,11 +332,6 @@ private[sql] case class ScalaUDAF(
   override def withNewInputAggBufferOffset(newInputAggBufferOffset: Int): ImperativeAggregate =
     copy(inputAggBufferOffset = newInputAggBufferOffset)
 
-  require(
-    children.length == udaf.inputSchema.length,
-    s"$udaf only accepts ${udaf.inputSchema.length} arguments, " +
-      s"but ${children.length} are provided.")
-
   override def nullable: Boolean = true
 
   override def dataType: DataType = udaf.dataType
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
index 544d5eccec037..8a15a51d825ef 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
@@ -17,6 +17,8 @@
 
 package org.apache.spark.sql.execution.datasources
 
+import scala.collection.mutable.ArrayBuffer
+
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.rdd.{MapPartitionsRDD, RDD, UnionRDD}
 import org.apache.spark.sql.catalyst.CatalystTypeConverters.convertToScala
@@ -25,6 +27,7 @@ import org.apache.spark.sql.catalyst.planning.PhysicalOperation
 import org.apache.spark.sql.catalyst.plans.logical
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow, expressions}
+import org.apache.spark.sql.execution.PhysicalRDD.{INPUT_PATHS, PUSHED_FILTERS}
 import org.apache.spark.sql.execution.SparkPlan
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types.{StringType, StructType}
@@ -315,7 +318,20 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
     // `Filter`s or cannot be handled by `relation`.
     val filterCondition = unhandledPredicates.reduceLeftOption(expressions.And)
 
-    val pushedFiltersString = pushedFilters.mkString(" PushedFilter: [", ",", "] ")
+    val metadata: Map[String, String] = {
+      val pairs = ArrayBuffer.empty[(String, String)]
+
+      if (pushedFilters.nonEmpty) {
+        pairs += (PUSHED_FILTERS -> pushedFilters.mkString("[", ", ", "]"))
+      }
+
+      relation.relation match {
+        case r: HadoopFsRelation => pairs += INPUT_PATHS -> r.paths.mkString(", ")
+        case _ =>
+      }
+
+      pairs.toMap
+    }
 
     if (projects.map(_.toAttribute) == projects &&
         projectSet.size == projects.size &&
@@ -334,7 +350,7 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
       val scan = execution.PhysicalRDD.createFromDataSource(
         projects.map(_.toAttribute),
         scanBuilder(requestedColumns, candidatePredicates, pushedFilters),
-        relation.relation, pushedFiltersString)
+        relation.relation, metadata)
       filterCondition.map(execution.Filter(_, scan)).getOrElse(scan)
     } else {
       // Don't request columns that are only referenced by pushed filters.
@@ -344,7 +360,7 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
       val scan = execution.PhysicalRDD.createFromDataSource(
         requestedColumns,
         scanBuilder(requestedColumns, candidatePredicates, pushedFilters),
-        relation.relation, pushedFiltersString)
+        relation.relation, metadata)
       execution.Project(
         projects, filterCondition.map(execution.Filter(_, scan)).getOrElse(scan))
     }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala
index fdd745f48e973..4f1685a280ca6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala
@@ -146,6 +146,12 @@ private[sql] class ParquetRelation(
     meta
   }
 
+  override def toString: String = {
+    parameters.get(ParquetRelation.METASTORE_TABLE_NAME).map { tableName =>
+      s"${getClass.getSimpleName}: $tableName"
+    }.getOrElse(super.toString)
+  }
+
   override def equals(other: Any): Boolean = other match {
     case that: ParquetRelation =>
       val schemaEquality = if (shouldMergeSchemas) {
@@ -521,6 +527,10 @@ private[sql] object ParquetRelation extends Logging {
   // internally.
   private[sql] val METASTORE_SCHEMA = "metastoreSchema"
 
+  // If a ParquetRelation is converted from a Hive metastore table, this option is set to the
+  // original Hive table name.
+  private[sql] val METASTORE_TABLE_NAME = "metastoreTableName"
+
   /**
    * If parquet's block size (row group size) setting is larger than the min split size,
    * we use parquet's block size setting as the min split size. Otherwise, we will create
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala
index aebfea5832402..8c7099ab5a34d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala
@@ -334,7 +334,11 @@ private[joins] final class UnsafeHashedRelation(
     // so that tests compile:
     val taskMemoryManager = new TaskMemoryManager(
       new StaticMemoryManager(
-        new SparkConf().set("spark.unsafe.offHeap", "false"), Long.MaxValue, Long.MaxValue, 1), 0)
+        new SparkConf().set("spark.memory.offHeap.enabled", "false"),
+        Long.MaxValue,
+        Long.MaxValue,
+        1),
+      0)
 
     val pageSizeBytes = Option(SparkEnv.get).map(_.memoryManager.pageSizeBytes)
       .getOrElse(new SparkConf().getSizeAsBytes("spark.buffer.pageSize", "16m"))
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SparkPlanGraph.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SparkPlanGraph.scala
index f1fce5478a3fe..7a8002c912a63 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SparkPlanGraph.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SparkPlanGraph.scala
@@ -66,7 +66,7 @@ private[sql] object SparkPlanGraph {
         metric.param.asInstanceOf[SQLMetricParam[SQLMetricValue[Any], Any]])
     }
     val node = SparkPlanGraphNode(
-      nodeIdGenerator.getAndIncrement(), plan.nodeName, plan.simpleString, metrics)
+      nodeIdGenerator.getAndIncrement(), plan.nodeName, plan.simpleString, plan.metadata, metrics)
     nodes += node
     val childrenNodes = plan.children.map(
       child => buildSparkPlanGraphNode(child, nodeIdGenerator, nodes, edges))
@@ -85,26 +85,33 @@ private[sql] object SparkPlanGraph {
  * @param metrics metrics that this SparkPlan node will track
  */
 private[ui] case class SparkPlanGraphNode(
-    id: Long, name: String, desc: String, metrics: Seq[SQLPlanMetric]) {
+    id: Long,
+    name: String,
+    desc: String,
+    metadata: Map[String, String],
+    metrics: Seq[SQLPlanMetric]) {
 
   def makeDotNode(metricsValue: Map[Long, String]): String = {
-    val values = {
-      for (metric <- metrics;
-           value <- metricsValue.get(metric.accumulatorId)) yield {
-        metric.name + ": " + value
-      }
+    val builder = new mutable.StringBuilder(name)
+
+    val values = for {
+      metric <- metrics
+      value <- metricsValue.get(metric.accumulatorId)
+    } yield {
+      metric.name + ": " + value
+    }
+
+    if (values.nonEmpty) {
+      // If there are metrics, display each entry in a separate line. We should use an escaped
+      // "\n" here to follow the dot syntax.
+      //
+      // Note: whitespace between two "\n"s is to create an empty line between the name of
+      // SparkPlan and metrics. If removing it, it won't display the empty line in UI.
+      builder ++= "\\n \\n"
+      builder ++= values.mkString("\\n")
     }
-    val label = if (values.isEmpty) {
-        name
-      } else {
-        // If there are metrics, display all metrics in a separate line. We should use an escaped
-        // "\n" here to follow the dot syntax.
-        //
-        // Note: whitespace between two "\n"s is to create an empty line between the name of
-        // SparkPlan and metrics. If removing it, it won't display the empty line in UI.
-        name + "\\n \\n" + values.mkString("\\n")
-      }
-    s"""  $id [label="$label"];"""
+
+    s"""  $id [label="${builder.toString()}"];"""
   }
 }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
index 9ace25dc7d21b..fc8ce6901dfca 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
@@ -422,7 +422,7 @@ abstract class HadoopFsRelation private[sql](
     parameters: Map[String, String])
   extends BaseRelation with FileRelation with Logging {
 
-  override def toString: String = getClass.getSimpleName + paths.mkString("[", ",", "]")
+  override def toString: String = getClass.getSimpleName
 
   def this() = this(None, Map.empty[String, String])
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index 76e9648aa7533..1763eb53114b3 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -1131,14 +1131,19 @@ class DataFrameSuite extends QueryTest with SharedSQLContext {
   }
 
   test("SPARK-11725: correctly handle null inputs for ScalaUDF") {
-    val df = Seq(
+    val df = sparkContext.parallelize(Seq(
       new java.lang.Integer(22) -> "John",
-      null.asInstanceOf[java.lang.Integer] -> "Lucy").toDF("age", "name")
+      null.asInstanceOf[java.lang.Integer] -> "Lucy")).toDF("age", "name")
 
+    // passing null into the UDF that could handle it
     val boxedUDF = udf[java.lang.Integer, java.lang.Integer] {
-      (i: java.lang.Integer) => if (i == null) null else i * 2
+      (i: java.lang.Integer) => if (i == null) -10 else null
     }
-    checkAnswer(df.select(boxedUDF($"age")), Row(44) :: Row(null) :: Nil)
+    checkAnswer(df.select(boxedUDF($"age")), Row(null) :: Row(-10) :: Nil)
+
+    sqlContext.udf.register("boxedUDF",
+      (i: java.lang.Integer) => (if (i == null) -10 else null): java.lang.Integer)
+    checkAnswer(sql("select boxedUDF(null), boxedUDF(-1)"), Row(-10, null) :: Nil)
 
     val primitiveUDF = udf((i: Int) => i * 2)
     checkAnswer(df.select(primitiveUDF($"age")), Row(44) :: Row(null) :: Nil)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala
index a4626259b2823..2fb439f50117a 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala
@@ -169,7 +169,7 @@ class PlannerSuite extends SharedSQLContext {
 
       withTempTable("testPushed") {
         val exp = sql("select * from testPushed where key = 15").queryExecution.executedPlan
-        assert(exp.toString.contains("PushedFilter: [EqualTo(key,15)]"))
+        assert(exp.toString.contains("PushedFilters: [EqualTo(key,15)]"))
       }
     }
   }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala
index 7ceaee38d131b..5a8406789ab81 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala
@@ -61,7 +61,7 @@ class UnsafeFixedWidthAggregationMapSuite
     }
 
     test(name) {
-      val conf = new SparkConf().set("spark.unsafe.offHeap", "false")
+      val conf = new SparkConf().set("spark.memory.offHeap.enabled", "false")
       memoryManager = new TestMemoryManager(conf)
       taskMemoryManager = new TaskMemoryManager(memoryManager, 0)
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeKVExternalSorterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeKVExternalSorterSuite.scala
index 7b80963ec8708..29027a664b4b4 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeKVExternalSorterSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeKVExternalSorterSuite.scala
@@ -109,7 +109,7 @@ class UnsafeKVExternalSorterSuite extends SparkFunSuite with SharedSQLContext {
       pageSize: Long,
       spill: Boolean): Unit = {
     val memoryManager =
-      new TestMemoryManager(new SparkConf().set("spark.unsafe.offHeap", "false"))
+      new TestMemoryManager(new SparkConf().set("spark.memory.offHeap.enabled", "false"))
     val taskMemMgr = new TaskMemoryManager(memoryManager, 0)
     TaskContext.setTaskContext(new TaskContextImpl(
       stageId = 0,
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index 10c315e45e917..e9af4e12d62ae 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -421,7 +421,12 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive
     // evil case insensitivity issue, which is reconciled within `ParquetRelation`.
     val parquetOptions = Map(
       ParquetRelation.METASTORE_SCHEMA -> metastoreSchema.json,
-      ParquetRelation.MERGE_SCHEMA -> mergeSchema.toString)
+      ParquetRelation.MERGE_SCHEMA -> mergeSchema.toString,
+      ParquetRelation.METASTORE_TABLE_NAME -> TableIdentifier(
+        metastoreRelation.tableName,
+        Some(metastoreRelation.databaseName)
+      ).unquotedString
+    )
     val tableIdentifier =
       QualifiedTableName(metastoreRelation.databaseName, metastoreRelation.tableName)
 
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala
index 39c0a2a0de045..064c0004b801e 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala
@@ -66,6 +66,33 @@ class ScalaAggregateFunction(schema: StructType) extends UserDefinedAggregateFun
   }
 }
 
+class ScalaAggregateFunctionWithoutInputSchema extends UserDefinedAggregateFunction {
+
+  def inputSchema: StructType = StructType(Nil)
+
+  def bufferSchema: StructType = StructType(StructField("value", LongType) :: Nil)
+
+  def dataType: DataType = LongType
+
+  def deterministic: Boolean = true
+
+  def initialize(buffer: MutableAggregationBuffer): Unit = {
+    buffer.update(0, 0L)
+  }
+
+  def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
+    buffer.update(0, input.getAs[Seq[Row]](0).map(_.getAs[Int]("v")).sum + buffer.getLong(0))
+  }
+
+  def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
+    buffer1.update(0, buffer1.getLong(0) + buffer2.getLong(0))
+  }
+
+  def evaluate(buffer: Row): Any = {
+    buffer.getLong(0)
+  }
+}
+
 class LongProductSum extends UserDefinedAggregateFunction {
   def inputSchema: StructType = new StructType()
     .add("a", LongType)
@@ -858,6 +885,43 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te
       )
     }
   }
+
+  test("udaf without specifying inputSchema") {
+    withTempTable("noInputSchemaUDAF") {
+      sqlContext.udf.register("noInputSchema", new ScalaAggregateFunctionWithoutInputSchema)
+
+      val data =
+        Row(1, Seq(Row(1), Row(2), Row(3))) ::
+          Row(1, Seq(Row(4), Row(5), Row(6))) ::
+          Row(2, Seq(Row(-10))) :: Nil
+      val schema =
+        StructType(
+          StructField("key", IntegerType) ::
+            StructField("myArray",
+              ArrayType(StructType(StructField("v", IntegerType) :: Nil))) :: Nil)
+      sqlContext.createDataFrame(
+        sparkContext.parallelize(data, 2),
+        schema)
+        .registerTempTable("noInputSchemaUDAF")
+
+      checkAnswer(
+        sqlContext.sql(
+          """
+            |SELECT key, noInputSchema(myArray)
+            |FROM noInputSchemaUDAF
+            |GROUP BY key
+          """.stripMargin),
+        Row(1, 21) :: Row(2, -10) :: Nil)
+
+      checkAnswer(
+        sqlContext.sql(
+          """
+            |SELECT noInputSchema(myArray)
+            |FROM noInputSchemaUDAF
+          """.stripMargin),
+        Row(11) :: Nil)
+    }
+  }
 }
 
 
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/State.scala b/streaming/src/main/scala/org/apache/spark/streaming/State.scala
index 604e64fc61630..b47bdda2c2137 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/State.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/State.scala
@@ -23,14 +23,14 @@ import org.apache.spark.annotation.Experimental
 
 /**
  * :: Experimental ::
- * Abstract class for getting and updating the tracked state in the `trackStateByKey` operation of
- * a [[org.apache.spark.streaming.dstream.PairDStreamFunctions pair DStream]] (Scala) or a
- * [[org.apache.spark.streaming.api.java.JavaPairDStream JavaPairDStream]] (Java).
+ * Abstract class for getting and updating the state in mapping function used in the `mapWithState`
+ * operation of a [[org.apache.spark.streaming.dstream.PairDStreamFunctions pair DStream]] (Scala)
+ * or a [[org.apache.spark.streaming.api.java.JavaPairDStream JavaPairDStream]] (Java).
  *
  * Scala example of using `State`:
  * {{{
- *    // A tracking function that maintains an integer state and return a String
- *    def trackStateFunc(data: Option[Int], state: State[Int]): Option[String] = {
+ *    // A mapping function that maintains an integer state and returns a String
+ *    def mappingFunction(key: String, value: Option[Int], state: State[Int]): Option[String] = {
  *      // Check if state exists
  *      if (state.exists) {
  *        val existingState = state.get  // Get the existing state
@@ -52,12 +52,12 @@ import org.apache.spark.annotation.Experimental
  *
  * Java example of using `State`:
  * {{{
- *    // A tracking function that maintains an integer state and return a String
- *   Function2<Optional<Integer>, State<Integer>, Optional<String>> trackStateFunc =
- *       new Function2<Optional<Integer>, State<Integer>, Optional<String>>() {
+ *    // A mapping function that maintains an integer state and returns a String
+ *    Function3<String, Optional<Integer>, State<Integer>, String> mappingFunction =
+ *       new Function3<String, Optional<Integer>, State<Integer>, String>() {
  *
  *         @Override
- *         public Optional<String> call(Optional<Integer> one, State<Integer> state) {
+ *         public String call(String key, Optional<Integer> value, State<Integer> state) {
  *           if (state.exists()) {
  *             int existingState = state.get(); // Get the existing state
  *             boolean shouldRemove = ...; // Decide whether to remove the state
@@ -75,6 +75,8 @@ import org.apache.spark.annotation.Experimental
  *         }
  *       };
  * }}}
+ *
+ * @tparam S Class of the state
  */
 @Experimental
 sealed abstract class State[S] {
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StateSpec.scala b/streaming/src/main/scala/org/apache/spark/streaming/StateSpec.scala
index bea5b9df20b53..9f6f95223f619 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/StateSpec.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/StateSpec.scala
@@ -20,7 +20,7 @@ package org.apache.spark.streaming
 import com.google.common.base.Optional
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.api.java.{JavaPairRDD, JavaUtils}
-import org.apache.spark.api.java.function.{Function2 => JFunction2, Function4 => JFunction4}
+import org.apache.spark.api.java.function.{Function3 => JFunction3, Function4 => JFunction4}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.util.ClosureCleaner
 import org.apache.spark.{HashPartitioner, Partitioner}
@@ -28,7 +28,7 @@ import org.apache.spark.{HashPartitioner, Partitioner}
 /**
  * :: Experimental ::
  * Abstract class representing all the specifications of the DStream transformation
- * `trackStateByKey` operation of a
+ * `mapWithState` operation of a
  * [[org.apache.spark.streaming.dstream.PairDStreamFunctions pair DStream]] (Scala) or a
  * [[org.apache.spark.streaming.api.java.JavaPairDStream JavaPairDStream]] (Java).
  * Use the [[org.apache.spark.streaming.StateSpec StateSpec.apply()]] or
@@ -37,50 +37,63 @@ import org.apache.spark.{HashPartitioner, Partitioner}
  *
  * Example in Scala:
  * {{{
- *    def trackingFunction(data: Option[ValueType], wrappedState: State[StateType]): EmittedType = {
- *      ...
+ *    // A mapping function that maintains an integer state and returns a String
+ *    def mappingFunction(key: String, value: Option[Int], state: State[Int]): Option[String] = {
+ *      // Use state.exists(), state.get(), state.update() and state.remove()
+ *      // to manage state, and return the necessary string
  *    }
  *
- *    val spec = StateSpec.function(trackingFunction).numPartitions(10)
+ *    val spec = StateSpec.function(mappingFunction).numPartitions(10)
  *
- *    val emittedRecordDStream = keyValueDStream.trackStateByKey[StateType, EmittedDataType](spec)
+ *    val mapWithStateDStream = keyValueDStream.mapWithState[StateType, MappedType](spec)
  * }}}
  *
  * Example in Java:
  * {{{
- *    StateSpec<KeyType, ValueType, StateType, EmittedDataType> spec =
- *      StateSpec.<KeyType, ValueType, StateType, EmittedDataType>function(trackingFunction)
- *                    .numPartition(10);
+ *   // A mapping function that maintains an integer state and returns a string
+ *   Function3<String, Optional<Integer>, State<Integer>, String> mappingFunction =
+ *       new Function3<String, Optional<Integer>, State<Integer>, String>() {
+ *           @Override
+ *           public String call(String key, Optional<Integer> value, State<Integer> state) {
+ *               // Use state.exists(), state.get(), state.update() and state.remove()
+ *               // to manage state, and return the necessary string
+ *           }
+ *       };
  *
- *    JavaTrackStateDStream<KeyType, ValueType, StateType, EmittedType> emittedRecordDStream =
- *      javaPairDStream.<StateType, EmittedDataType>trackStateByKey(spec);
+ *    JavaMapWithStateDStream<String, Integer, Integer, String> mapWithStateDStream =
+ *        keyValueDStream.mapWithState(StateSpec.function(mappingFunction));
  * }}}
+ *
+ * @tparam KeyType    Class of the state key
+ * @tparam ValueType  Class of the state value
+ * @tparam StateType  Class of the state data
+ * @tparam MappedType Class of the mapped elements
  */
 @Experimental
-sealed abstract class StateSpec[KeyType, ValueType, StateType, EmittedType] extends Serializable {
+sealed abstract class StateSpec[KeyType, ValueType, StateType, MappedType] extends Serializable {
 
-  /** Set the RDD containing the initial states that will be used by `trackStateByKey` */
+  /** Set the RDD containing the initial states that will be used by `mapWithState` */
   def initialState(rdd: RDD[(KeyType, StateType)]): this.type
 
-  /** Set the RDD containing the initial states that will be used by `trackStateByKey` */
+  /** Set the RDD containing the initial states that will be used by `mapWithState` */
   def initialState(javaPairRDD: JavaPairRDD[KeyType, StateType]): this.type
 
   /**
-   * Set the number of partitions by which the state RDDs generated by `trackStateByKey`
+   * Set the number of partitions by which the state RDDs generated by `mapWithState`
    * will be partitioned. Hash partitioning will be used.
    */
   def numPartitions(numPartitions: Int): this.type
 
   /**
-   * Set the partitioner by which the state RDDs generated by `trackStateByKey` will be
+   * Set the partitioner by which the state RDDs generated by `mapWithState` will
    * be partitioned.
    */
   def partitioner(partitioner: Partitioner): this.type
 
   /**
    * Set the duration after which the state of an idle key will be removed. A key and its state is
-   * considered idle if it has not received any data for at least the given duration. The state
-   * tracking function will be called one final time on the idle states that are going to be
+   * considered idle if it has not received any data for at least the given duration. The
+   * mapping function will be called one final time on the idle states that are going to be
    * removed; [[org.apache.spark.streaming.State State.isTimingOut()]] set
    * to `true` in that call.
    */
@@ -91,115 +104,124 @@ sealed abstract class StateSpec[KeyType, ValueType, StateType, EmittedType] exte
 /**
  * :: Experimental ::
  * Builder object for creating instances of [[org.apache.spark.streaming.StateSpec StateSpec]]
- * that is used for specifying the parameters of the DStream transformation `trackStateByKey`
+ * that is used for specifying the parameters of the DStream transformation `mapWithState`
  * that is used for specifying the parameters of the DStream transformation
- * `trackStateByKey` operation of a
+ * `mapWithState` operation of a
  * [[org.apache.spark.streaming.dstream.PairDStreamFunctions pair DStream]] (Scala) or a
  * [[org.apache.spark.streaming.api.java.JavaPairDStream JavaPairDStream]] (Java).
  *
  * Example in Scala:
  * {{{
- *    def trackingFunction(data: Option[ValueType], wrappedState: State[StateType]): EmittedType = {
- *      ...
+ *    // A mapping function that maintains an integer state and returns a String
+ *    def mappingFunction(key: String, value: Option[Int], state: State[Int]): Option[String] = {
+ *      // Use state.exists(), state.get(), state.update() and state.remove()
+ *      // to manage state, and return the necessary string
  *    }
  *
- *    val emittedRecordDStream = keyValueDStream.trackStateByKey[StateType, EmittedDataType](
- *        StateSpec.function(trackingFunction).numPartitions(10))
+ *    val spec = StateSpec.function(mappingFunction).numPartitions(10)
+ *
+ *    val mapWithStateDStream = keyValueDStream.mapWithState[StateType, MappedType](spec)
  * }}}
  *
  * Example in Java:
  * {{{
- *    StateSpec<KeyType, ValueType, StateType, EmittedDataType> spec =
- *      StateSpec.<KeyType, ValueType, StateType, EmittedDataType>function(trackingFunction)
- *                    .numPartition(10);
+ *   // A mapping function that maintains an integer state and returns a string
+ *   Function3<String, Optional<Integer>, State<Integer>, String> mappingFunction =
+ *       new Function3<String, Optional<Integer>, State<Integer>, String>() {
+ *           @Override
+ *           public String call(String key, Optional<Integer> value, State<Integer> state) {
+ *               // Use state.exists(), state.get(), state.update() and state.remove()
+ *               // to manage state, and return the necessary string
+ *           }
+ *       };
  *
- *    JavaTrackStateDStream<KeyType, ValueType, StateType, EmittedType> emittedRecordDStream =
- *      javaPairDStream.<StateType, EmittedDataType>trackStateByKey(spec);
- * }}}
+ *    JavaMapWithStateDStream<String, Integer, Integer, String> mapWithStateDStream =
+ *        keyValueDStream.mapWithState(StateSpec.function(mappingFunction));
+ * }}}
  */
 @Experimental
 object StateSpec {
   /**
    * Create a [[org.apache.spark.streaming.StateSpec StateSpec]] for setting all the specifications
-   * of the `trackStateByKey` operation on a
+   * of the `mapWithState` operation on a
    * [[org.apache.spark.streaming.dstream.PairDStreamFunctions pair DStream]].
    *
-   * @param trackingFunction The function applied on every data item to manage the associated state
-   *                         and generate the emitted data
+   * @param mappingFunction The function applied on every data item to manage the associated state
+   *                         and generate the mapped data
    * @tparam KeyType      Class of the keys
    * @tparam ValueType    Class of the values
    * @tparam StateType    Class of the states data
-   * @tparam EmittedType  Class of the emitted data
+   * @tparam MappedType   Class of the mapped data
    */
-  def function[KeyType, ValueType, StateType, EmittedType](
-      trackingFunction: (Time, KeyType, Option[ValueType], State[StateType]) => Option[EmittedType]
-    ): StateSpec[KeyType, ValueType, StateType, EmittedType] = {
-    ClosureCleaner.clean(trackingFunction, checkSerializable = true)
-    new StateSpecImpl(trackingFunction)
+  def function[KeyType, ValueType, StateType, MappedType](
+      mappingFunction: (Time, KeyType, Option[ValueType], State[StateType]) => Option[MappedType]
+    ): StateSpec[KeyType, ValueType, StateType, MappedType] = {
+    ClosureCleaner.clean(mappingFunction, checkSerializable = true)
+    new StateSpecImpl(mappingFunction)
   }
 
   /**
    * Create a [[org.apache.spark.streaming.StateSpec StateSpec]] for setting all the specifications
-   * of the `trackStateByKey` operation on a
+   * of the `mapWithState` operation on a
    * [[org.apache.spark.streaming.dstream.PairDStreamFunctions pair DStream]].
    *
-   * @param trackingFunction The function applied on every data item to manage the associated state
-   *                         and generate the emitted data
+   * @param mappingFunction The function applied on every data item to manage the associated state
+   *                         and generate the mapped data
    * @tparam ValueType    Class of the values
    * @tparam StateType    Class of the states data
-   * @tparam EmittedType  Class of the emitted data
+   * @tparam MappedType   Class of the mapped data
    */
-  def function[KeyType, ValueType, StateType, EmittedType](
-      trackingFunction: (Option[ValueType], State[StateType]) => EmittedType
-    ): StateSpec[KeyType, ValueType, StateType, EmittedType] = {
-    ClosureCleaner.clean(trackingFunction, checkSerializable = true)
+  def function[KeyType, ValueType, StateType, MappedType](
+      mappingFunction: (KeyType, Option[ValueType], State[StateType]) => MappedType
+    ): StateSpec[KeyType, ValueType, StateType, MappedType] = {
+    ClosureCleaner.clean(mappingFunction, checkSerializable = true)
     val wrappedFunction =
-      (time: Time, key: Any, value: Option[ValueType], state: State[StateType]) => {
-        Some(trackingFunction(value, state))
+      (time: Time, key: KeyType, value: Option[ValueType], state: State[StateType]) => {
+        Some(mappingFunction(key, value, state))
       }
     new StateSpecImpl(wrappedFunction)
   }
 
   /**
    * Create a [[org.apache.spark.streaming.StateSpec StateSpec]] for setting all
-   * the specifications of the `trackStateByKey` operation on a
+   * the specifications of the `mapWithState` operation on a
    * [[org.apache.spark.streaming.api.java.JavaPairDStream JavaPairDStream]].
    *
-   * @param javaTrackingFunction The function applied on every data item to manage the associated
-   *                             state and generate the emitted data
+   * @param mappingFunction The function applied on every data item to manage the associated
+   *                        state and generate the mapped data
    * @tparam KeyType      Class of the keys
    * @tparam ValueType    Class of the values
    * @tparam StateType    Class of the states data
-   * @tparam EmittedType  Class of the emitted data
+   * @tparam MappedType   Class of the mapped data
    */
-  def function[KeyType, ValueType, StateType, EmittedType](javaTrackingFunction:
-      JFunction4[Time, KeyType, Optional[ValueType], State[StateType], Optional[EmittedType]]):
-    StateSpec[KeyType, ValueType, StateType, EmittedType] = {
-    val trackingFunc = (time: Time, k: KeyType, v: Option[ValueType], s: State[StateType]) => {
-      val t = javaTrackingFunction.call(time, k, JavaUtils.optionToOptional(v), s)
+  def function[KeyType, ValueType, StateType, MappedType](mappingFunction:
+      JFunction4[Time, KeyType, Optional[ValueType], State[StateType], Optional[MappedType]]):
+    StateSpec[KeyType, ValueType, StateType, MappedType] = {
+    val wrappedFunc = (time: Time, k: KeyType, v: Option[ValueType], s: State[StateType]) => {
+      val t = mappingFunction.call(time, k, JavaUtils.optionToOptional(v), s)
       Option(t.orNull)
     }
-    StateSpec.function(trackingFunc)
+    StateSpec.function(wrappedFunc)
   }
 
   /**
    * Create a [[org.apache.spark.streaming.StateSpec StateSpec]] for setting all the specifications
-   * of the `trackStateByKey` operation on a
+   * of the `mapWithState` operation on a
    * [[org.apache.spark.streaming.api.java.JavaPairDStream JavaPairDStream]].
    *
-   * @param javaTrackingFunction The function applied on every data item to manage the associated
-   *                             state and generate the emitted data
+   * @param mappingFunction The function applied on every data item to manage the associated
+   *                        state and generate the mapped data
    * @tparam ValueType    Class of the values
    * @tparam StateType    Class of the states data
-   * @tparam EmittedType  Class of the emitted data
+   * @tparam MappedType   Class of the mapped data
    */
-  def function[KeyType, ValueType, StateType, EmittedType](
-      javaTrackingFunction: JFunction2[Optional[ValueType], State[StateType], EmittedType]):
-    StateSpec[KeyType, ValueType, StateType, EmittedType] = {
-    val trackingFunc = (v: Option[ValueType], s: State[StateType]) => {
-      javaTrackingFunction.call(Optional.fromNullable(v.get), s)
+  def function[KeyType, ValueType, StateType, MappedType](
+      mappingFunction: JFunction3[KeyType, Optional[ValueType], State[StateType], MappedType]):
+    StateSpec[KeyType, ValueType, StateType, MappedType] = {
+    val wrappedFunc = (k: KeyType, v: Option[ValueType], s: State[StateType]) => {
+      mappingFunction.call(k, JavaUtils.optionToOptional(v), s) // None-safe: v.get throws on None
+    }
+    StateSpec.function(wrappedFunc)
   }
 }
 
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
index 6fb8ad38abcec..53324e79a04e6 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
@@ -887,12 +887,13 @@ object StreamingContext extends Logging {
   }
 
   private[streaming] def rddToFileName[T](prefix: String, suffix: String, time: Time): String = {
-    if (prefix == null) {
-      time.milliseconds.toString
-    } else if (suffix == null || suffix.length ==0) {
-      prefix + "-" + time.milliseconds
-    } else {
-      prefix + "-" + time.milliseconds + "." + suffix
+    var result = time.milliseconds.toString
+    if (prefix != null && prefix.length > 0) {
+      result = s"$prefix-$result"
+    }
+    if (suffix != null && suffix.length > 0) {
+      result = s"$result.$suffix"
     }
+    result
   }
 }
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaTrackStateDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaMapWithStateDStream.scala
similarity index 66%
rename from streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaTrackStateDStream.scala
rename to streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaMapWithStateDStream.scala
index f459930d0660b..16c0d6fff8229 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaTrackStateDStream.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaMapWithStateDStream.scala
@@ -19,23 +19,23 @@ package org.apache.spark.streaming.api.java
 
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.api.java.JavaSparkContext
-import org.apache.spark.streaming.dstream.TrackStateDStream
+import org.apache.spark.streaming.dstream.MapWithStateDStream
 
 /**
  * :: Experimental ::
- * [[JavaDStream]] representing the stream of records emitted by the tracking function in the
- * `trackStateByKey` operation on a [[JavaPairDStream]]. Additionally, it also gives access to the
+ * DStream representing the stream of data generated by `mapWithState` operation on a
+ * [[JavaPairDStream]]. Additionally, it also gives access to the
  * stream of state snapshots, that is, the state data of all keys after a batch has updated them.
  *
- * @tparam KeyType Class of the state key
- * @tparam ValueType Class of the state value
- * @tparam StateType Class of the state
- * @tparam EmittedType Class of the emitted records
+ * @tparam KeyType Class of the keys
+ * @tparam ValueType Class of the values
+ * @tparam StateType Class of the state data
+ * @tparam MappedType Class of the mapped data
  */
 @Experimental
-class JavaTrackStateDStream[KeyType, ValueType, StateType, EmittedType](
-    dstream: TrackStateDStream[KeyType, ValueType, StateType, EmittedType])
-  extends JavaDStream[EmittedType](dstream)(JavaSparkContext.fakeClassTag) {
+class JavaMapWithStateDStream[KeyType, ValueType, StateType, MappedType] private[streaming](
+    dstream: MapWithStateDStream[KeyType, ValueType, StateType, MappedType])
+  extends JavaDStream[MappedType](dstream)(JavaSparkContext.fakeClassTag) {
 
   def stateSnapshots(): JavaPairDStream[KeyType, StateType] =
     new JavaPairDStream(dstream.stateSnapshots())(
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala
index 70e32b383e458..42ddd63f0f06c 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala
@@ -430,42 +430,36 @@ class JavaPairDStream[K, V](val dstream: DStream[(K, V)])(
 
   /**
    * :: Experimental ::
-   * Return a new [[JavaDStream]] of data generated by combining the key-value data in `this` stream
-   * with a continuously updated per-key state. The user-provided state tracking function is
-   * applied on each keyed data item along with its corresponding state. The function can choose to
-   * update/remove the state and return a transformed data, which forms the
-   * [[JavaTrackStateDStream]].
+   * Return a [[JavaMapWithStateDStream]] by applying a function to every key-value element of
+   * `this` stream, while maintaining some state data for each unique key. The mapping function
+   * and other specification (e.g. partitioners, timeouts, initial state data, etc.) of this
+   * transformation can be specified using the [[StateSpec]] class. The state data is accessible
+   * as a parameter of type [[State]] in the mapping function.
    *
-   * The specifications of this transformation is made through the
-   * [[org.apache.spark.streaming.StateSpec StateSpec]] class. Besides the tracking function, there
-   * are a number of optional parameters - initial state data, number of partitions, timeouts, etc.
-   * See the [[org.apache.spark.streaming.StateSpec StateSpec]] for more details.
-   *
-   * Example of using `trackStateByKey`:
+   * Example of using `mapWithState`:
    * {{{
-   *   // A tracking function that maintains an integer state and return a String
-   *   Function2<Optional<Integer>, State<Integer>, Optional<String>> trackStateFunc =
-   *       new Function2<Optional<Integer>, State<Integer>, Optional<String>>() {
-   *
-   *         @Override
-   *         public Optional<String> call(Optional<Integer> one, State<Integer> state) {
-   *           // Check if state exists, accordingly update/remove state and return transformed data
-   *         }
+   *   // A mapping function that maintains an integer state and returns a string
+   *   Function3<String, Optional<Integer>, State<Integer>, String> mappingFunction =
+   *       new Function3<String, Optional<Integer>, State<Integer>, String>() {
+   *           @Override
+   *           public String call(String key, Optional<Integer> value, State<Integer> state) {
+   *               // Use state.exists(), state.get(), state.update() and state.remove()
+   *               // to manage state, and return the necessary string
+   *           }
    *       };
    *
-   *    JavaTrackStateDStream<Integer, Integer, Integer, String> trackStateDStream =
-   *        keyValueDStream.<Integer, String>trackStateByKey(
-   *                 StateSpec.function(trackStateFunc).numPartitions(10));
-   * }}}
+   *    JavaMapWithStateDStream<String, Integer, Integer, String> mapWithStateDStream =
+   *        keyValueDStream.mapWithState(StateSpec.function(mappingFunction));
+   * }}}
    *
    * @param spec          Specification of this transformation
-   * @tparam StateType    Class type of the state
-   * @tparam EmittedType  Class type of the tranformed data return by the tracking function
+   * @tparam StateType    Class type of the state data
+   * @tparam MappedType   Class type of the mapped data
    */
   @Experimental
-  def trackStateByKey[StateType, EmittedType](spec: StateSpec[K, V, StateType, EmittedType]):
-    JavaTrackStateDStream[K, V, StateType, EmittedType] = {
-    new JavaTrackStateDStream(dstream.trackStateByKey(spec)(
+  def mapWithState[StateType, MappedType](spec: StateSpec[K, V, StateType, MappedType]):
+    JavaMapWithStateDStream[K, V, StateType, MappedType] = {
+    new JavaMapWithStateDStream(dstream.mapWithState(spec)(
       JavaSparkContext.fakeClassTag,
       JavaSparkContext.fakeClassTag))
   }
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/TrackStateDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/MapWithStateDStream.scala
similarity index 72%
rename from streaming/src/main/scala/org/apache/spark/streaming/dstream/TrackStateDStream.scala
rename to streaming/src/main/scala/org/apache/spark/streaming/dstream/MapWithStateDStream.scala
index ea6213420e7ab..706465d4e25d7 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/TrackStateDStream.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/MapWithStateDStream.scala
@@ -24,53 +24,52 @@ import org.apache.spark.annotation.Experimental
 import org.apache.spark.rdd.{EmptyRDD, RDD}
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.streaming._
-import org.apache.spark.streaming.rdd.{TrackStateRDD, TrackStateRDDRecord}
-import org.apache.spark.streaming.dstream.InternalTrackStateDStream._
+import org.apache.spark.streaming.rdd.{MapWithStateRDD, MapWithStateRDDRecord}
+import org.apache.spark.streaming.dstream.InternalMapWithStateDStream._
 
 /**
  * :: Experimental ::
- * DStream representing the stream of records emitted by the tracking function in the
- * `trackStateByKey` operation on a
+ * DStream representing the stream of data generated by `mapWithState` operation on a
  * [[org.apache.spark.streaming.dstream.PairDStreamFunctions pair DStream]].
  * Additionally, it also gives access to the stream of state snapshots, that is, the state data of
  * all keys after a batch has updated them.
  *
- * @tparam KeyType Class of the state key
- * @tparam ValueType Class of the state value
+ * @tparam KeyType Class of the key
+ * @tparam ValueType Class of the value
  * @tparam StateType Class of the state data
- * @tparam EmittedType Class of the emitted records
+ * @tparam MappedType Class of the mapped data
  */
 @Experimental
-sealed abstract class TrackStateDStream[KeyType, ValueType, StateType, EmittedType: ClassTag](
-    ssc: StreamingContext) extends DStream[EmittedType](ssc) {
+sealed abstract class MapWithStateDStream[KeyType, ValueType, StateType, MappedType: ClassTag](
+    ssc: StreamingContext) extends DStream[MappedType](ssc) {
 
   /** Return a pair DStream where each RDD is the snapshot of the state of all the keys. */
   def stateSnapshots(): DStream[(KeyType, StateType)]
 }
 
-/** Internal implementation of the [[TrackStateDStream]] */
-private[streaming] class TrackStateDStreamImpl[
-    KeyType: ClassTag, ValueType: ClassTag, StateType: ClassTag, EmittedType: ClassTag](
+/** Internal implementation of the [[MapWithStateDStream]] */
+private[streaming] class MapWithStateDStreamImpl[
+    KeyType: ClassTag, ValueType: ClassTag, StateType: ClassTag, MappedType: ClassTag](
     dataStream: DStream[(KeyType, ValueType)],
-    spec: StateSpecImpl[KeyType, ValueType, StateType, EmittedType])
-  extends TrackStateDStream[KeyType, ValueType, StateType, EmittedType](dataStream.context) {
+    spec: StateSpecImpl[KeyType, ValueType, StateType, MappedType])
+  extends MapWithStateDStream[KeyType, ValueType, StateType, MappedType](dataStream.context) {
 
   private val internalStream =
-    new InternalTrackStateDStream[KeyType, ValueType, StateType, EmittedType](dataStream, spec)
+    new InternalMapWithStateDStream[KeyType, ValueType, StateType, MappedType](dataStream, spec)
 
   override def slideDuration: Duration = internalStream.slideDuration
 
   override def dependencies: List[DStream[_]] = List(internalStream)
 
-  override def compute(validTime: Time): Option[RDD[EmittedType]] = {
-    internalStream.getOrCompute(validTime).map { _.flatMap[EmittedType] { _.emittedRecords } }
+  override def compute(validTime: Time): Option[RDD[MappedType]] = {
+    internalStream.getOrCompute(validTime).map { _.flatMap[MappedType] { _.mappedData } }
   }
 
   /**
    * Forward the checkpoint interval to the internal DStream that computes the state maps. This
    * to make sure that this DStream does not get checkpointed, only the internal stream.
    */
-  override def checkpoint(checkpointInterval: Duration): DStream[EmittedType] = {
+  override def checkpoint(checkpointInterval: Duration): DStream[MappedType] = {
     internalStream.checkpoint(checkpointInterval)
     this
   }
@@ -87,32 +86,32 @@ private[streaming] class TrackStateDStreamImpl[
 
   def stateClass: Class[_] = implicitly[ClassTag[StateType]].runtimeClass
 
-  def emittedClass: Class[_] = implicitly[ClassTag[EmittedType]].runtimeClass
+  def mappedClass: Class[_] = implicitly[ClassTag[MappedType]].runtimeClass
 }
 
 /**
  * A DStream that allows per-key state to be maintains, and arbitrary records to be generated
- * based on updates to the state. This is the main DStream that implements the `trackStateByKey`
+ * based on updates to the state. This is the main DStream that implements the `mapWithState`
  * operation on DStreams.
  *
  * @param parent Parent (key, value) stream that is the source
- * @param spec Specifications of the trackStateByKey operation
+ * @param spec Specifications of the mapWithState operation
  * @tparam K   Key type
  * @tparam V   Value type
  * @tparam S   Type of the state maintained
- * @tparam E   Type of the emitted data
+ * @tparam E   Type of the mapped data
  */
 private[streaming]
-class InternalTrackStateDStream[K: ClassTag, V: ClassTag, S: ClassTag, E: ClassTag](
+class InternalMapWithStateDStream[K: ClassTag, V: ClassTag, S: ClassTag, E: ClassTag](
     parent: DStream[(K, V)], spec: StateSpecImpl[K, V, S, E])
-  extends DStream[TrackStateRDDRecord[K, S, E]](parent.context) {
+  extends DStream[MapWithStateRDDRecord[K, S, E]](parent.context) {
 
   persist(StorageLevel.MEMORY_ONLY)
 
   private val partitioner = spec.getPartitioner().getOrElse(
     new HashPartitioner(ssc.sc.defaultParallelism))
 
-  private val trackingFunction = spec.getFunction()
+  private val mappingFunction = spec.getFunction()
 
   override def slideDuration: Duration = parent.slideDuration
 
@@ -130,7 +129,7 @@ class InternalTrackStateDStream[K: ClassTag, V: ClassTag, S: ClassTag, E: ClassT
   }
 
   /** Method that generates a RDD for the given time */
-  override def compute(validTime: Time): Option[RDD[TrackStateRDDRecord[K, S, E]]] = {
+  override def compute(validTime: Time): Option[RDD[MapWithStateRDDRecord[K, S, E]]] = {
     // Get the previous state or create a new empty state RDD
     val prevStateRDD = getOrCompute(validTime - slideDuration) match {
       case Some(rdd) =>
@@ -138,13 +137,13 @@ class InternalTrackStateDStream[K: ClassTag, V: ClassTag, S: ClassTag, E: ClassT
           // If the RDD is not partitioned the right way, let us repartition it using the
           // partition index as the key. This is to ensure that state RDD is always partitioned
           // before creating another state RDD using it
-          TrackStateRDD.createFromRDD[K, V, S, E](
+          MapWithStateRDD.createFromRDD[K, V, S, E](
             rdd.flatMap { _.stateMap.getAll() }, partitioner, validTime)
         } else {
           rdd
         }
       case None =>
-        TrackStateRDD.createFromPairRDD[K, V, S, E](
+        MapWithStateRDD.createFromPairRDD[K, V, S, E](
           spec.getInitialStateRDD().getOrElse(new EmptyRDD[(K, S)](ssc.sparkContext)),
           partitioner,
           validTime
@@ -161,11 +160,11 @@ class InternalTrackStateDStream[K: ClassTag, V: ClassTag, S: ClassTag, E: ClassT
     val timeoutThresholdTime = spec.getTimeoutInterval().map { interval =>
       (validTime - interval).milliseconds
     }
-    Some(new TrackStateRDD(
-      prevStateRDD, partitionedDataRDD, trackingFunction, validTime, timeoutThresholdTime))
+    Some(new MapWithStateRDD(
+      prevStateRDD, partitionedDataRDD, mappingFunction, validTime, timeoutThresholdTime))
   }
 }
 
-private[streaming] object InternalTrackStateDStream {
+private[streaming] object InternalMapWithStateDStream {
   private val DEFAULT_CHECKPOINT_DURATION_MULTIPLIER = 10
 }
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala
index 2762309134eb1..a64a1fe93f40d 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala
@@ -352,39 +352,36 @@ class PairDStreamFunctions[K, V](self: DStream[(K, V)])
 
   /**
    * :: Experimental ::
-   * Return a new DStream of data generated by combining the key-value data in `this` stream
-   * with a continuously updated per-key state. The user-provided state tracking function is
-   * applied on each keyed data item along with its corresponding state. The function can choose to
-   * update/remove the state and return a transformed data, which forms the
-   * [[org.apache.spark.streaming.dstream.TrackStateDStream]].
+   * Return a [[MapWithStateDStream]] by applying a function to every key-value element of
+   * `this` stream, while maintaining some state data for each unique key. The mapping function
+   * and other specification (e.g. partitioners, timeouts, initial state data, etc.) of this
+   * transformation can be specified using the [[StateSpec]] class. The state data is accessible
+   * as a parameter of type [[State]] in the mapping function.
    *
-   * The specifications of this transformation is made through the
-   * [[org.apache.spark.streaming.StateSpec StateSpec]] class. Besides the tracking function, there
-   * are a number of optional parameters - initial state data, number of partitions, timeouts, etc.
-   * See the [[org.apache.spark.streaming.StateSpec StateSpec spec docs]] for more details.
-   *
-   * Example of using `trackStateByKey`:
+   * Example of using `mapWithState`:
    * {{{
-   *    def trackingFunction(data: Option[Int], wrappedState: State[Int]): String = {
-   *      // Check if state exists, accordingly update/remove state and return transformed data
+   *    // A mapping function that maintains an integer state and returns a String
+   *    def mappingFunction(key: String, value: Option[Int], state: State[Int]): Option[String] = {
+   *      // Use state.exists(), state.get(), state.update() and state.remove()
+   *      // to manage state, and return the necessary string
    *    }
    *
-   *    val spec = StateSpec.function(trackingFunction).numPartitions(10)
+   *    val spec = StateSpec.function(mappingFunction).numPartitions(10)
    *
-   *    val trackStateDStream = keyValueDStream.trackStateByKey[Int, String](spec)
+   *    val mapWithStateDStream = keyValueDStream.mapWithState[StateType, MappedType](spec)
    * }}}
    *
    * @param spec          Specification of this transformation
-   * @tparam StateType    Class type of the state
-   * @tparam EmittedType  Class type of the tranformed data return by the tracking function
+   * @tparam StateType    Class type of the state data
+   * @tparam MappedType   Class type of the mapped data
    */
   @Experimental
-  def trackStateByKey[StateType: ClassTag, EmittedType: ClassTag](
-      spec: StateSpec[K, V, StateType, EmittedType]
-    ): TrackStateDStream[K, V, StateType, EmittedType] = {
-    new TrackStateDStreamImpl[K, V, StateType, EmittedType](
+  def mapWithState[StateType: ClassTag, MappedType: ClassTag](
+      spec: StateSpec[K, V, StateType, MappedType]
+    ): MapWithStateDStream[K, V, StateType, MappedType] = {
+    new MapWithStateDStreamImpl[K, V, StateType, MappedType](
       self,
-      spec.asInstanceOf[StateSpecImpl[K, V, StateType, EmittedType]]
+      spec.asInstanceOf[StateSpecImpl[K, V, StateType, MappedType]]
     )
   }
 
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/rdd/TrackStateRDD.scala b/streaming/src/main/scala/org/apache/spark/streaming/rdd/MapWithStateRDD.scala
similarity index 64%
rename from streaming/src/main/scala/org/apache/spark/streaming/rdd/TrackStateRDD.scala
rename to streaming/src/main/scala/org/apache/spark/streaming/rdd/MapWithStateRDD.scala
index 30aafcf1460e3..ed95171f73ee1 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/rdd/TrackStateRDD.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/rdd/MapWithStateRDD.scala
@@ -29,60 +29,60 @@ import org.apache.spark.util.Utils
 import org.apache.spark._
 
 /**
- * Record storing the keyed-state [[TrackStateRDD]]. Each record contains a [[StateMap]] and a
- * sequence of records returned by the tracking function of `trackStateByKey`.
+ * Record storing the keyed-state [[MapWithStateRDD]]. Each record contains a [[StateMap]] and a
+ * sequence of records returned by the mapping function of `mapWithState`.
  */
-private[streaming] case class TrackStateRDDRecord[K, S, E](
-    var stateMap: StateMap[K, S], var emittedRecords: Seq[E])
+private[streaming] case class MapWithStateRDDRecord[K, S, E](
+    var stateMap: StateMap[K, S], var mappedData: Seq[E])
 
-private[streaming] object TrackStateRDDRecord {
+private[streaming] object MapWithStateRDDRecord {
   def updateRecordWithData[K: ClassTag, V: ClassTag, S: ClassTag, E: ClassTag](
-    prevRecord: Option[TrackStateRDDRecord[K, S, E]],
+    prevRecord: Option[MapWithStateRDDRecord[K, S, E]],
     dataIterator: Iterator[(K, V)],
-    updateFunction: (Time, K, Option[V], State[S]) => Option[E],
+    mappingFunction: (Time, K, Option[V], State[S]) => Option[E],
     batchTime: Time,
     timeoutThresholdTime: Option[Long],
     removeTimedoutData: Boolean
-  ): TrackStateRDDRecord[K, S, E] = {
+  ): MapWithStateRDDRecord[K, S, E] = {
     // Create a new state map by cloning the previous one (if it exists) or by creating an empty one
     val newStateMap = prevRecord.map { _.stateMap.copy() }. getOrElse { new EmptyStateMap[K, S]() }
 
-    val emittedRecords = new ArrayBuffer[E]
+    val mappedData = new ArrayBuffer[E]
     val wrappedState = new StateImpl[S]()
 
-    // Call the tracking function on each record in the data iterator, and accordingly
-    // update the states touched, and collect the data returned by the tracking function
+    // Call the mapping function on each record in the data iterator, and accordingly
+    // update the states touched, and collect the data returned by the mapping function
     dataIterator.foreach { case (key, value) =>
       wrappedState.wrap(newStateMap.get(key))
-      val emittedRecord = updateFunction(batchTime, key, Some(value), wrappedState)
+      val returned = mappingFunction(batchTime, key, Some(value), wrappedState)
       if (wrappedState.isRemoved) {
         newStateMap.remove(key)
       } else if (wrappedState.isUpdated || timeoutThresholdTime.isDefined) {
         newStateMap.put(key, wrappedState.get(), batchTime.milliseconds)
       }
-      emittedRecords ++= emittedRecord
+      mappedData ++= returned
     }
 
-    // Get the timed out state records, call the tracking function on each and collect the
+    // Get the timed out state records, call the mapping function on each and collect the
     // data returned
     if (removeTimedoutData && timeoutThresholdTime.isDefined) {
       newStateMap.getByTime(timeoutThresholdTime.get).foreach { case (key, state, _) =>
         wrappedState.wrapTiminoutState(state)
-        val emittedRecord = updateFunction(batchTime, key, None, wrappedState)
-        emittedRecords ++= emittedRecord
+        val returned = mappingFunction(batchTime, key, None, wrappedState)
+        mappedData ++= returned
         newStateMap.remove(key)
       }
     }
 
-    TrackStateRDDRecord(newStateMap, emittedRecords)
+    MapWithStateRDDRecord(newStateMap, mappedData)
   }
 }
 
 /**
- * Partition of the [[TrackStateRDD]], which depends on corresponding partitions of prev state
+ * Partition of the [[MapWithStateRDD]], which depends on corresponding partitions of prev state
  * RDD, and a partitioned keyed-data RDD
  */
-private[streaming] class TrackStateRDDPartition(
+private[streaming] class MapWithStateRDDPartition(
     idx: Int,
     @transient private var prevStateRDD: RDD[_],
     @transient private var partitionedDataRDD: RDD[_]) extends Partition {
@@ -104,27 +104,28 @@ private[streaming] class TrackStateRDDPartition(
 
 
 /**
- * RDD storing the keyed-state of `trackStateByKey` and corresponding emitted records.
- * Each partition of this RDD has a single record of type [[TrackStateRDDRecord]]. This contains a
- * [[StateMap]] (containing the keyed-states) and the sequence of records returned by the tracking
- * function of  `trackStateByKey`.
- * @param prevStateRDD The previous TrackStateRDD on whose StateMap data `this` RDD will be created
+ * RDD storing the keyed states of `mapWithState` operation and corresponding mapped data.
+ * Each partition of this RDD has a single record of type [[MapWithStateRDDRecord]]. This contains a
+ * [[StateMap]] (containing the keyed-states) and the sequence of records returned by the mapping
+ * function of  `mapWithState`.
+ * @param prevStateRDD The previous MapWithStateRDD on whose StateMap data `this` RDD
+ *                     will be created
  * @param partitionedDataRDD The partitioned data RDD which is used update the previous StateMaps
  *                           in the `prevStateRDD` to create `this` RDD
- * @param trackingFunction The function that will be used to update state and return new data
+ * @param mappingFunction  The function that will be used to update state and return new data
  * @param batchTime        The time of the batch to which this RDD belongs to. Use to update
  * @param timeoutThresholdTime The time to indicate which keys are timeout
  */
-private[streaming] class TrackStateRDD[K: ClassTag, V: ClassTag, S: ClassTag, E: ClassTag](
-    private var prevStateRDD: RDD[TrackStateRDDRecord[K, S, E]],
+private[streaming] class MapWithStateRDD[K: ClassTag, V: ClassTag, S: ClassTag, E: ClassTag](
+    private var prevStateRDD: RDD[MapWithStateRDDRecord[K, S, E]],
     private var partitionedDataRDD: RDD[(K, V)],
-    trackingFunction: (Time, K, Option[V], State[S]) => Option[E],
+    mappingFunction: (Time, K, Option[V], State[S]) => Option[E],
     batchTime: Time,
     timeoutThresholdTime: Option[Long]
-  ) extends RDD[TrackStateRDDRecord[K, S, E]](
+  ) extends RDD[MapWithStateRDDRecord[K, S, E]](
     partitionedDataRDD.sparkContext,
     List(
-      new OneToOneDependency[TrackStateRDDRecord[K, S, E]](prevStateRDD),
+      new OneToOneDependency[MapWithStateRDDRecord[K, S, E]](prevStateRDD),
       new OneToOneDependency(partitionedDataRDD))
   ) {
 
@@ -141,19 +142,19 @@ private[streaming] class TrackStateRDD[K: ClassTag, V: ClassTag, S: ClassTag, E:
   }
 
   override def compute(
-      partition: Partition, context: TaskContext): Iterator[TrackStateRDDRecord[K, S, E]] = {
+      partition: Partition, context: TaskContext): Iterator[MapWithStateRDDRecord[K, S, E]] = {
 
-    val stateRDDPartition = partition.asInstanceOf[TrackStateRDDPartition]
+    val stateRDDPartition = partition.asInstanceOf[MapWithStateRDDPartition]
     val prevStateRDDIterator = prevStateRDD.iterator(
       stateRDDPartition.previousSessionRDDPartition, context)
     val dataIterator = partitionedDataRDD.iterator(
       stateRDDPartition.partitionedDataRDDPartition, context)
 
     val prevRecord = if (prevStateRDDIterator.hasNext) Some(prevStateRDDIterator.next()) else None
-    val newRecord = TrackStateRDDRecord.updateRecordWithData(
+    val newRecord = MapWithStateRDDRecord.updateRecordWithData(
       prevRecord,
       dataIterator,
-      trackingFunction,
+      mappingFunction,
       batchTime,
       timeoutThresholdTime,
       removeTimedoutData = doFullScan // remove timedout data only when full scan is enabled
@@ -163,7 +164,7 @@ private[streaming] class TrackStateRDD[K: ClassTag, V: ClassTag, S: ClassTag, E:
 
   override protected def getPartitions: Array[Partition] = {
     Array.tabulate(prevStateRDD.partitions.length) { i =>
-      new TrackStateRDDPartition(i, prevStateRDD, partitionedDataRDD)}
+      new MapWithStateRDDPartition(i, prevStateRDD, partitionedDataRDD)}
   }
 
   override def clearDependencies(): Unit = {
@@ -177,52 +178,46 @@ private[streaming] class TrackStateRDD[K: ClassTag, V: ClassTag, S: ClassTag, E:
   }
 }
 
-private[streaming] object TrackStateRDD {
+private[streaming] object MapWithStateRDD {
 
   def createFromPairRDD[K: ClassTag, V: ClassTag, S: ClassTag, E: ClassTag](
       pairRDD: RDD[(K, S)],
       partitioner: Partitioner,
-      updateTime: Time): TrackStateRDD[K, V, S, E] = {
+      updateTime: Time): MapWithStateRDD[K, V, S, E] = {
 
-    val rddOfTrackStateRecords = pairRDD.partitionBy(partitioner).mapPartitions ({ iterator =>
+    val stateRDD = pairRDD.partitionBy(partitioner).mapPartitions ({ iterator =>
       val stateMap = StateMap.create[K, S](SparkEnv.get.conf)
       iterator.foreach { case (key, state) => stateMap.put(key, state, updateTime.milliseconds) }
-      Iterator(TrackStateRDDRecord(stateMap, Seq.empty[E]))
+      Iterator(MapWithStateRDDRecord(stateMap, Seq.empty[E]))
     }, preservesPartitioning = true)
 
     val emptyDataRDD = pairRDD.sparkContext.emptyRDD[(K, V)].partitionBy(partitioner)
 
     val noOpFunc = (time: Time, key: K, value: Option[V], state: State[S]) => None
 
-    new TrackStateRDD[K, V, S, E](rddOfTrackStateRecords, emptyDataRDD, noOpFunc, updateTime, None)
+    new MapWithStateRDD[K, V, S, E](
+      stateRDD, emptyDataRDD, noOpFunc, updateTime, None)
   }
 
   def createFromRDD[K: ClassTag, V: ClassTag, S: ClassTag, E: ClassTag](
       rdd: RDD[(K, S, Long)],
       partitioner: Partitioner,
-      updateTime: Time): TrackStateRDD[K, V, S, E] = {
+      updateTime: Time): MapWithStateRDD[K, V, S, E] = {
 
     val pairRDD = rdd.map { x => (x._1, (x._2, x._3)) }
-    val rddOfTrackStateRecords = pairRDD.partitionBy(partitioner).mapPartitions({ iterator =>
+    val stateRDD = pairRDD.partitionBy(partitioner).mapPartitions({ iterator =>
       val stateMap = StateMap.create[K, S](SparkEnv.get.conf)
       iterator.foreach { case (key, (state, updateTime)) =>
         stateMap.put(key, state, updateTime)
       }
-      Iterator(TrackStateRDDRecord(stateMap, Seq.empty[E]))
+      Iterator(MapWithStateRDDRecord(stateMap, Seq.empty[E]))
     }, preservesPartitioning = true)
 
     val emptyDataRDD = pairRDD.sparkContext.emptyRDD[(K, V)].partitionBy(partitioner)
 
     val noOpFunc = (time: Time, key: K, value: Option[V], state: State[S]) => None
 
-    new TrackStateRDD[K, V, S, E](rddOfTrackStateRecords, emptyDataRDD, noOpFunc, updateTime, None)
-  }
-}
-
-private[streaming] class EmittedRecordsRDD[K: ClassTag, V: ClassTag, S: ClassTag, T: ClassTag](
-    parent: TrackStateRDD[K, V, S, T]) extends RDD[T](parent) {
-  override protected def getPartitions: Array[Partition] = parent.partitions
-  override def compute(partition: Partition, context: TaskContext): Iterator[T] = {
-    parent.compute(partition, context).flatMap { _.emittedRecords }
+    new MapWithStateRDD[K, V, S, E](
+      stateRDD, emptyDataRDD, noOpFunc, updateTime, None)
   }
 }
diff --git a/streaming/src/test/java/org/apache/spark/streaming/JavaTrackStateByKeySuite.java b/streaming/src/test/java/org/apache/spark/streaming/JavaMapWithStateSuite.java
similarity index 79%
rename from streaming/src/test/java/org/apache/spark/streaming/JavaTrackStateByKeySuite.java
rename to streaming/src/test/java/org/apache/spark/streaming/JavaMapWithStateSuite.java
index eac4cdd14a683..bc4bc2eb42231 100644
--- a/streaming/src/test/java/org/apache/spark/streaming/JavaTrackStateByKeySuite.java
+++ b/streaming/src/test/java/org/apache/spark/streaming/JavaMapWithStateSuite.java
@@ -37,12 +37,12 @@
 
 import org.apache.spark.HashPartitioner;
 import org.apache.spark.api.java.JavaPairRDD;
-import org.apache.spark.api.java.function.Function2;
+import org.apache.spark.api.java.function.Function3;
 import org.apache.spark.api.java.function.Function4;
 import org.apache.spark.streaming.api.java.JavaPairDStream;
-import org.apache.spark.streaming.api.java.JavaTrackStateDStream;
+import org.apache.spark.streaming.api.java.JavaMapWithStateDStream;
 
-public class JavaTrackStateByKeySuite extends LocalJavaStreamingContext implements Serializable {
+public class JavaMapWithStateSuite extends LocalJavaStreamingContext implements Serializable {
 
   /**
    * This test is only for testing the APIs. It's not necessary to run it.
@@ -52,7 +52,7 @@ public void testAPI() {
     JavaPairDStream<String, Integer> wordsDstream = null;
 
     final Function4<Time, String, Optional<Integer>, State<Boolean>, Optional<Double>>
-        trackStateFunc =
+        mappingFunc =
         new Function4<Time, String, Optional<Integer>, State<Boolean>, Optional<Double>>() {
 
           @Override
@@ -68,21 +68,21 @@ public Optional<Double> call(
           }
         };
 
-    JavaTrackStateDStream<String, Integer, Boolean, Double> stateDstream =
-        wordsDstream.trackStateByKey(
-            StateSpec.function(trackStateFunc)
+    JavaMapWithStateDStream<String, Integer, Boolean, Double> stateDstream =
+        wordsDstream.mapWithState(
+            StateSpec.function(mappingFunc)
                 .initialState(initialRDD)
                 .numPartitions(10)
                 .partitioner(new HashPartitioner(10))
                 .timeout(Durations.seconds(10)));
 
-    JavaPairDStream<String, Boolean> emittedRecords = stateDstream.stateSnapshots();
+    JavaPairDStream<String, Boolean> stateSnapshots = stateDstream.stateSnapshots();
 
-    final Function2<Optional<Integer>, State<Boolean>, Double> trackStateFunc2 =
-        new Function2<Optional<Integer>, State<Boolean>, Double>() {
+    final Function3<String, Optional<Integer>, State<Boolean>, Double> mappingFunc2 =
+        new Function3<String, Optional<Integer>, State<Boolean>, Double>() {
 
           @Override
-          public Double call(Optional<Integer> one, State<Boolean> state) {
+          public Double call(String key, Optional<Integer> one, State<Boolean> state) {
             // Use all State's methods here
             state.exists();
             state.get();
@@ -93,15 +93,15 @@ public Double call(Optional<Integer> one, State<Boolean> state) {
           }
         };
 
-    JavaTrackStateDStream<String, Integer, Boolean, Double> stateDstream2 =
-        wordsDstream.trackStateByKey(
-            StateSpec.<String, Integer, Boolean, Double> function(trackStateFunc2)
+    JavaMapWithStateDStream<String, Integer, Boolean, Double> stateDstream2 =
+        wordsDstream.mapWithState(
+            StateSpec.<String, Integer, Boolean, Double>function(mappingFunc2)
                 .initialState(initialRDD)
                 .numPartitions(10)
                 .partitioner(new HashPartitioner(10))
                 .timeout(Durations.seconds(10)));
 
-    JavaPairDStream<String, Boolean> emittedRecords2 = stateDstream2.stateSnapshots();
+    JavaPairDStream<String, Boolean> stateSnapshots2 = stateDstream2.stateSnapshots();
   }
 
   @Test
@@ -148,11 +148,11 @@ public void testBasicFunction() {
             new Tuple2<String, Integer>("c", 1))
     );
 
-    Function2<Optional<Integer>, State<Integer>, Integer> trackStateFunc =
-        new Function2<Optional<Integer>, State<Integer>, Integer>() {
+    Function3<String, Optional<Integer>, State<Integer>, Integer> mappingFunc =
+        new Function3<String, Optional<Integer>, State<Integer>, Integer>() {
 
           @Override
-          public Integer call(Optional<Integer> value, State<Integer> state) throws Exception {
+          public Integer call(String key, Optional<Integer> value, State<Integer> state) throws Exception {
             int sum = value.or(0) + (state.exists() ? state.get() : 0);
             state.update(sum);
             return sum;
@@ -160,29 +160,29 @@ public Integer call(Optional<Integer> value, State<Integer> state) throws Except
         };
     testOperation(
         inputData,
-        StateSpec.<String, Integer, Integer, Integer>function(trackStateFunc),
+        StateSpec.<String, Integer, Integer, Integer>function(mappingFunc),
         outputData,
         stateData);
   }
 
   private <K, S, T> void testOperation(
       List<List<K>> input,
-      StateSpec<K, Integer, S, T> trackStateSpec,
+      StateSpec<K, Integer, S, T> mapWithStateSpec,
       List<Set<T>> expectedOutputs,
       List<Set<Tuple2<K, S>>> expectedStateSnapshots) {
     int numBatches = expectedOutputs.size();
     JavaDStream<K> inputStream = JavaTestUtils.attachTestInputStream(ssc, input, 2);
-    JavaTrackStateDStream<K, Integer, S, T> trackeStateStream =
+    JavaMapWithStateDStream<K, Integer, S, T> mapWithStateDStream =
         JavaPairDStream.fromJavaDStream(inputStream.map(new Function<K, Tuple2<K, Integer>>() {
           @Override
           public Tuple2<K, Integer> call(K x) throws Exception {
             return new Tuple2<K, Integer>(x, 1);
           }
-        })).trackStateByKey(trackStateSpec);
+        })).mapWithState(mapWithStateSpec);
 
     final List<Set<T>> collectedOutputs =
         Collections.synchronizedList(Lists.<Set<T>>newArrayList());
-    trackeStateStream.foreachRDD(new Function<JavaRDD<T>, Void>() {
+    mapWithStateDStream.foreachRDD(new Function<JavaRDD<T>, Void>() {
       @Override
       public Void call(JavaRDD<T> rdd) throws Exception {
         collectedOutputs.add(Sets.newHashSet(rdd.collect()));
@@ -191,7 +191,7 @@ public Void call(JavaRDD<T> rdd) throws Exception {
     });
     final List<Set<Tuple2<K, S>>> collectedStateSnapshots =
         Collections.synchronizedList(Lists.<Set<Tuple2<K, S>>>newArrayList());
-    trackeStateStream.stateSnapshots().foreachRDD(new Function<JavaPairRDD<K, S>, Void>() {
+    mapWithStateDStream.stateSnapshots().foreachRDD(new Function<JavaPairRDD<K, S>, Void>() {
       @Override
       public Void call(JavaPairRDD<K, S> rdd) throws Exception {
         collectedStateSnapshots.add(Sets.newHashSet(rdd.collect()));
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/TrackStateByKeySuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/MapWithStateSuite.scala
similarity index 77%
rename from streaming/src/test/scala/org/apache/spark/streaming/TrackStateByKeySuite.scala
rename to streaming/src/test/scala/org/apache/spark/streaming/MapWithStateSuite.scala
index 1fc320d31b18b..4b08085e09b1f 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/TrackStateByKeySuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/MapWithStateSuite.scala
@@ -25,11 +25,11 @@ import scala.reflect.ClassTag
 import org.scalatest.PrivateMethodTester._
 import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll}
 
-import org.apache.spark.streaming.dstream.{DStream, InternalTrackStateDStream, TrackStateDStream, TrackStateDStreamImpl}
+import org.apache.spark.streaming.dstream.{DStream, InternalMapWithStateDStream, MapWithStateDStream, MapWithStateDStreamImpl}
 import org.apache.spark.util.{ManualClock, Utils}
 import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite}
 
-class TrackStateByKeySuite extends SparkFunSuite
+class MapWithStateSuite extends SparkFunSuite
   with DStreamCheckpointTester with BeforeAndAfterAll with BeforeAndAfter {
 
   private var sc: SparkContext = null
@@ -49,7 +49,7 @@ class TrackStateByKeySuite extends SparkFunSuite
   }
 
   override def beforeAll(): Unit = {
-    val conf = new SparkConf().setMaster("local").setAppName("TrackStateByKeySuite")
+    val conf = new SparkConf().setMaster("local").setAppName("MapWithStateSuite")
     conf.set("spark.streaming.clock", classOf[ManualClock].getName())
     sc = new SparkContext(conf)
   }
@@ -129,7 +129,7 @@ class TrackStateByKeySuite extends SparkFunSuite
     testState(Some(3), shouldBeTimingOut = true)
   }
 
-  test("trackStateByKey - basic operations with simple API") {
+  test("mapWithState - basic operations with simple API") {
     val inputData =
       Seq(
         Seq(),
@@ -164,17 +164,17 @@ class TrackStateByKeySuite extends SparkFunSuite
       )
 
     // state maintains running count, and updated count is returned
-    val trackStateFunc = (value: Option[Int], state: State[Int]) => {
+    val mappingFunc = (key: String, value: Option[Int], state: State[Int]) => {
       val sum = value.getOrElse(0) + state.getOption.getOrElse(0)
       state.update(sum)
       sum
     }
 
     testOperation[String, Int, Int](
-      inputData, StateSpec.function(trackStateFunc), outputData, stateData)
+      inputData, StateSpec.function(mappingFunc), outputData, stateData)
   }
 
-  test("trackStateByKey - basic operations with advanced API") {
+  test("mapWithState - basic operations with advanced API") {
     val inputData =
       Seq(
         Seq(),
@@ -209,65 +209,65 @@ class TrackStateByKeySuite extends SparkFunSuite
       )
 
     // state maintains running count, key string doubled and returned
-    val trackStateFunc = (batchTime: Time, key: String, value: Option[Int], state: State[Int]) => {
+    val mappingFunc = (batchTime: Time, key: String, value: Option[Int], state: State[Int]) => {
       val sum = value.getOrElse(0) + state.getOption.getOrElse(0)
       state.update(sum)
       Some(key * 2)
     }
 
-    testOperation(inputData, StateSpec.function(trackStateFunc), outputData, stateData)
+    testOperation(inputData, StateSpec.function(mappingFunc), outputData, stateData)
   }
 
-  test("trackStateByKey - type inferencing and class tags") {
+  test("mapWithState - type inferencing and class tags") {
 
-    // Simple track state function with value as Int, state as Double and emitted type as Double
-    val simpleFunc = (value: Option[Int], state: State[Double]) => {
+    // Simple mapping function with value as Int, state as Double and mapped type as Long
+    val simpleFunc = (key: String, value: Option[Int], state: State[Double]) => {
       0L
     }
 
     // Advanced track state function with key as String, value as Int, state as Double and
-    // emitted type as Double
+    // mapped type as Long
     val advancedFunc = (time: Time, key: String, value: Option[Int], state: State[Double]) => {
       Some(0L)
     }
 
-    def testTypes(dstream: TrackStateDStream[_, _, _, _]): Unit = {
-      val dstreamImpl = dstream.asInstanceOf[TrackStateDStreamImpl[_, _, _, _]]
+    def testTypes(dstream: MapWithStateDStream[_, _, _, _]): Unit = {
+      val dstreamImpl = dstream.asInstanceOf[MapWithStateDStreamImpl[_, _, _, _]]
       assert(dstreamImpl.keyClass === classOf[String])
       assert(dstreamImpl.valueClass === classOf[Int])
       assert(dstreamImpl.stateClass === classOf[Double])
-      assert(dstreamImpl.emittedClass === classOf[Long])
+      assert(dstreamImpl.mappedClass === classOf[Long])
     }
     val ssc = new StreamingContext(sc, batchDuration)
     val inputStream = new TestInputStream[(String, Int)](ssc, Seq.empty, numPartitions = 2)
 
-    // Defining StateSpec inline with trackStateByKey and simple function implicitly gets the types
-    val simpleFunctionStateStream1 = inputStream.trackStateByKey(
+    // Defining StateSpec inline with mapWithState and simple function implicitly gets the types
+    val simpleFunctionStateStream1 = inputStream.mapWithState(
       StateSpec.function(simpleFunc).numPartitions(1))
     testTypes(simpleFunctionStateStream1)
 
     // Separately defining StateSpec with simple function requires explicitly specifying types
     val simpleFuncSpec = StateSpec.function[String, Int, Double, Long](simpleFunc)
-    val simpleFunctionStateStream2 = inputStream.trackStateByKey(simpleFuncSpec)
+    val simpleFunctionStateStream2 = inputStream.mapWithState(simpleFuncSpec)
     testTypes(simpleFunctionStateStream2)
 
     // Separately defining StateSpec with advanced function implicitly gets the types
     val advFuncSpec1 = StateSpec.function(advancedFunc)
-    val advFunctionStateStream1 = inputStream.trackStateByKey(advFuncSpec1)
+    val advFunctionStateStream1 = inputStream.mapWithState(advFuncSpec1)
     testTypes(advFunctionStateStream1)
 
-    // Defining StateSpec inline with trackStateByKey and advanced func implicitly gets the types
-    val advFunctionStateStream2 = inputStream.trackStateByKey(
+    // Defining StateSpec inline with mapWithState and advanced func implicitly gets the types
+    val advFunctionStateStream2 = inputStream.mapWithState(
       StateSpec.function(simpleFunc).numPartitions(1))
     testTypes(advFunctionStateStream2)
 
-    // Defining StateSpec inline with trackStateByKey and advanced func implicitly gets the types
+    // Separately defining StateSpec with advanced func and explicitly specified types
     val advFuncSpec2 = StateSpec.function[String, Int, Double, Long](advancedFunc)
-    val advFunctionStateStream3 = inputStream.trackStateByKey[Double, Long](advFuncSpec2)
+    val advFunctionStateStream3 = inputStream.mapWithState[Double, Long](advFuncSpec2)
     testTypes(advFunctionStateStream3)
   }
 
-  test("trackStateByKey - states as emitted records") {
+  test("mapWithState - states as mapped data") {
     val inputData =
       Seq(
         Seq(),
@@ -301,17 +301,17 @@ class TrackStateByKeySuite extends SparkFunSuite
         Seq(("a", 5), ("b", 3), ("c", 1))
       )
 
-    val trackStateFunc = (time: Time, key: String, value: Option[Int], state: State[Int]) => {
+    val mappingFunc = (time: Time, key: String, value: Option[Int], state: State[Int]) => {
       val sum = value.getOrElse(0) + state.getOption.getOrElse(0)
       val output = (key, sum)
       state.update(sum)
       Some(output)
     }
 
-    testOperation(inputData, StateSpec.function(trackStateFunc), outputData, stateData)
+    testOperation(inputData, StateSpec.function(mappingFunc), outputData, stateData)
   }
 
-  test("trackStateByKey - initial states, with nothing emitted") {
+  test("mapWithState - initial states, with nothing returned as from mapping function") {
 
     val initialState = Seq(("a", 5), ("b", 10), ("c", -20), ("d", 0))
 
@@ -339,18 +339,18 @@ class TrackStateByKeySuite extends SparkFunSuite
         Seq(("a", 10), ("b", 13), ("c", -19), ("d", 0))
       )
 
-    val trackStateFunc = (time: Time, key: String, value: Option[Int], state: State[Int]) => {
+    val mappingFunc = (time: Time, key: String, value: Option[Int], state: State[Int]) => {
       val sum = value.getOrElse(0) + state.getOption.getOrElse(0)
       val output = (key, sum)
       state.update(sum)
       None.asInstanceOf[Option[Int]]
     }
 
-    val trackStateSpec = StateSpec.function(trackStateFunc).initialState(sc.makeRDD(initialState))
-    testOperation(inputData, trackStateSpec, outputData, stateData)
+    val mapWithStateSpec = StateSpec.function(mappingFunc).initialState(sc.makeRDD(initialState))
+    testOperation(inputData, mapWithStateSpec, outputData, stateData)
   }
 
-  test("trackStateByKey - state removing") {
+  test("mapWithState - state removing") {
     val inputData =
       Seq(
         Seq(),
@@ -388,7 +388,7 @@ class TrackStateByKeySuite extends SparkFunSuite
         Seq()
       )
 
-    val trackStateFunc = (time: Time, key: String, value: Option[Int], state: State[Int]) => {
+    val mappingFunc = (time: Time, key: String, value: Option[Int], state: State[Int]) => {
       if (state.exists) {
         state.remove()
         Some(key)
@@ -399,10 +399,10 @@ class TrackStateByKeySuite extends SparkFunSuite
     }
 
     testOperation(
-      inputData, StateSpec.function(trackStateFunc).numPartitions(1), outputData, stateData)
+      inputData, StateSpec.function(mappingFunc).numPartitions(1), outputData, stateData)
   }
 
-  test("trackStateByKey - state timing out") {
+  test("mapWithState - state timing out") {
     val inputData =
       Seq(
         Seq("a", "b", "c"),
@@ -413,7 +413,7 @@ class TrackStateByKeySuite extends SparkFunSuite
         Seq("a") // a will not time out
       ) ++ Seq.fill(20)(Seq("a")) // a will continue to stay active
 
-    val trackStateFunc = (time: Time, key: String, value: Option[Int], state: State[Int]) => {
+    val mappingFunc = (time: Time, key: String, value: Option[Int], state: State[Int]) => {
       if (value.isDefined) {
         state.update(1)
       }
@@ -425,9 +425,9 @@ class TrackStateByKeySuite extends SparkFunSuite
     }
 
     val (collectedOutputs, collectedStateSnapshots) = getOperationOutput(
-      inputData, StateSpec.function(trackStateFunc).timeout(Seconds(3)), 20)
+      inputData, StateSpec.function(mappingFunc).timeout(Seconds(3)), 20)
 
-    // b and c should be emitted once each, when they were marked as expired
+    // b and c should be returned once each, when they were marked as expired
     assert(collectedOutputs.flatten.sorted === Seq("b", "c"))
 
     // States for a, b, c should be defined at one point of time
@@ -439,8 +439,8 @@ class TrackStateByKeySuite extends SparkFunSuite
     assert(collectedStateSnapshots.last.toSet === Set(("a", 1)))
   }
 
-  test("trackStateByKey - checkpoint durations") {
-    val privateMethod = PrivateMethod[InternalTrackStateDStream[_, _, _, _]]('internalStream)
+  test("mapWithState - checkpoint durations") {
+    val privateMethod = PrivateMethod[InternalMapWithStateDStream[_, _, _, _]]('internalStream)
 
     def testCheckpointDuration(
         batchDuration: Duration,
@@ -451,18 +451,18 @@ class TrackStateByKeySuite extends SparkFunSuite
 
       try {
         val inputStream = new TestInputStream(ssc, Seq.empty[Seq[Int]], 2).map(_ -> 1)
-        val dummyFunc = (value: Option[Int], state: State[Int]) => 0
-        val trackStateStream = inputStream.trackStateByKey(StateSpec.function(dummyFunc))
-        val internalTrackStateStream = trackStateStream invokePrivate privateMethod()
+        val dummyFunc = (key: Int, value: Option[Int], state: State[Int]) => 0
+        val mapWithStateStream = inputStream.mapWithState(StateSpec.function(dummyFunc))
+        val internalmapWithStateStream = mapWithStateStream invokePrivate privateMethod()
 
         explicitCheckpointDuration.foreach { d =>
-          trackStateStream.checkpoint(d)
+          mapWithStateStream.checkpoint(d)
         }
-        trackStateStream.register()
+        mapWithStateStream.register()
         ssc.checkpoint(checkpointDir.toString)
         ssc.start()  // should initialize all the checkpoint durations
-        assert(trackStateStream.checkpointDuration === null)
-        assert(internalTrackStateStream.checkpointDuration === expectedCheckpointDuration)
+        assert(mapWithStateStream.checkpointDuration === null)
+        assert(internalmapWithStateStream.checkpointDuration === expectedCheckpointDuration)
       } finally {
         ssc.stop(stopSparkContext = false)
       }
@@ -478,7 +478,7 @@ class TrackStateByKeySuite extends SparkFunSuite
   }
 
 
-  test("trackStateByKey - driver failure recovery") {
+  test("mapWithState - driver failure recovery") {
     val inputData =
       Seq(
         Seq(),
@@ -505,16 +505,16 @@ class TrackStateByKeySuite extends SparkFunSuite
 
       val checkpointDuration = batchDuration * (stateData.size / 2)
 
-      val runningCount = (value: Option[Int], state: State[Int]) => {
+      val runningCount = (key: String, value: Option[Int], state: State[Int]) => {
         state.update(state.getOption().getOrElse(0) + value.getOrElse(0))
         state.get()
       }
 
-      val trackStateStream = dstream.map { _ -> 1 }.trackStateByKey(
+      val mapWithStateStream = dstream.map { _ -> 1 }.mapWithState(
         StateSpec.function(runningCount))
       // Set internval make sure there is one RDD checkpointing
-      trackStateStream.checkpoint(checkpointDuration)
-      trackStateStream.stateSnapshots()
+      mapWithStateStream.checkpoint(checkpointDuration)
+      mapWithStateStream.stateSnapshots()
     }
 
     testCheckpointedOperation(inputData, operation, stateData, inputData.size / 2,
@@ -523,28 +523,28 @@ class TrackStateByKeySuite extends SparkFunSuite
 
   private def testOperation[K: ClassTag, S: ClassTag, T: ClassTag](
       input: Seq[Seq[K]],
-      trackStateSpec: StateSpec[K, Int, S, T],
+      mapWithStateSpec: StateSpec[K, Int, S, T],
       expectedOutputs: Seq[Seq[T]],
       expectedStateSnapshots: Seq[Seq[(K, S)]]
     ): Unit = {
     require(expectedOutputs.size == expectedStateSnapshots.size)
 
     val (collectedOutputs, collectedStateSnapshots) =
-      getOperationOutput(input, trackStateSpec, expectedOutputs.size)
+      getOperationOutput(input, mapWithStateSpec, expectedOutputs.size)
     assert(expectedOutputs, collectedOutputs, "outputs")
     assert(expectedStateSnapshots, collectedStateSnapshots, "state snapshots")
   }
 
   private def getOperationOutput[K: ClassTag, S: ClassTag, T: ClassTag](
       input: Seq[Seq[K]],
-      trackStateSpec: StateSpec[K, Int, S, T],
+      mapWithStateSpec: StateSpec[K, Int, S, T],
       numBatches: Int
     ): (Seq[Seq[T]], Seq[Seq[(K, S)]]) = {
 
     // Setup the stream computation
     val ssc = new StreamingContext(sc, Seconds(1))
     val inputStream = new TestInputStream(ssc, input, numPartitions = 2)
-    val trackeStateStream = inputStream.map(x => (x, 1)).trackStateByKey(trackStateSpec)
+    val trackeStateStream = inputStream.map(x => (x, 1)).mapWithState(mapWithStateSpec)
     val collectedOutputs = new ArrayBuffer[Seq[T]] with SynchronizedBuffer[Seq[T]]
     val outputStream = new TestOutputStream(trackeStateStream, collectedOutputs)
     val collectedStateSnapshots = new ArrayBuffer[Seq[(K, S)]] with SynchronizedBuffer[Seq[(K, S)]]
diff --git a/streaming/src/test/scala/org/apache/spark/streaming/rdd/TrackStateRDDSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/rdd/MapWithStateRDDSuite.scala
similarity index 76%
rename from streaming/src/test/scala/org/apache/spark/streaming/rdd/TrackStateRDDSuite.scala
rename to streaming/src/test/scala/org/apache/spark/streaming/rdd/MapWithStateRDDSuite.scala
index 3b2d43f2ce581..aa95bd33dda9f 100644
--- a/streaming/src/test/scala/org/apache/spark/streaming/rdd/TrackStateRDDSuite.scala
+++ b/streaming/src/test/scala/org/apache/spark/streaming/rdd/MapWithStateRDDSuite.scala
@@ -30,14 +30,14 @@ import org.apache.spark.streaming.util.OpenHashMapBasedStateMap
 import org.apache.spark.streaming.{State, Time}
 import org.apache.spark.util.Utils
 
-class TrackStateRDDSuite extends SparkFunSuite with RDDCheckpointTester with BeforeAndAfterAll {
+class MapWithStateRDDSuite extends SparkFunSuite with RDDCheckpointTester with BeforeAndAfterAll {
 
   private var sc: SparkContext = null
   private var checkpointDir: File = _
 
   override def beforeAll(): Unit = {
     sc = new SparkContext(
-      new SparkConf().setMaster("local").setAppName("TrackStateRDDSuite"))
+      new SparkConf().setMaster("local").setAppName("MapWithStateRDDSuite"))
     checkpointDir = Utils.createTempDir()
     sc.setCheckpointDir(checkpointDir.toString)
   }
@@ -54,7 +54,7 @@ class TrackStateRDDSuite extends SparkFunSuite with RDDCheckpointTester with Bef
   test("creation from pair RDD") {
     val data = Seq((1, "1"), (2, "2"), (3, "3"))
     val partitioner = new HashPartitioner(10)
-    val rdd = TrackStateRDD.createFromPairRDD[Int, Int, String, Int](
+    val rdd = MapWithStateRDD.createFromPairRDD[Int, Int, String, Int](
       sc.parallelize(data), partitioner, Time(123))
     assertRDD[Int, Int, String, Int](rdd, data.map { x => (x._1, x._2, 123)}.toSet, Set.empty)
     assert(rdd.partitions.size === partitioner.numPartitions)
@@ -62,7 +62,7 @@ class TrackStateRDDSuite extends SparkFunSuite with RDDCheckpointTester with Bef
     assert(rdd.partitioner === Some(partitioner))
   }
 
-  test("updating state and generating emitted data in TrackStateRecord") {
+  test("updating state and generating mapped data in MapWithStateRDDRecord") {
 
     val initialTime = 1000L
     val updatedTime = 2000L
@@ -71,7 +71,7 @@ class TrackStateRDDSuite extends SparkFunSuite with RDDCheckpointTester with Bef
 
     /**
      * Assert that applying given data on a prior record generates correct updated record, with
-     * correct state map and emitted data
+     * correct state map and mapped data
      */
     def assertRecordUpdate(
         initStates: Iterable[Int],
@@ -86,18 +86,18 @@ class TrackStateRDDSuite extends SparkFunSuite with RDDCheckpointTester with Bef
       val initialStateMap = new OpenHashMapBasedStateMap[String, Int]()
       initStates.foreach { s => initialStateMap.put("key", s, initialTime) }
       functionCalled = false
-      val record = TrackStateRDDRecord[String, Int, Int](initialStateMap, Seq.empty)
+      val record = MapWithStateRDDRecord[String, Int, Int](initialStateMap, Seq.empty)
       val dataIterator = data.map { v => ("key", v) }.iterator
       val removedStates = new ArrayBuffer[Int]
       val timingOutStates = new ArrayBuffer[Int]
       /**
-       * Tracking function that updates/removes state based on instructions in the data, and
+       * Mapping function that updates/removes state based on instructions in the data, and
        * return state (when instructed or when state is timing out).
        */
       def testFunc(t: Time, key: String, data: Option[String], state: State[Int]): Option[Int] = {
         functionCalled = true
 
-        assert(t.milliseconds === updatedTime, "tracking func called with wrong time")
+        assert(t.milliseconds === updatedTime, "mapping func called with wrong time")
 
         data match {
           case Some("noop") =>
@@ -120,22 +120,22 @@ class TrackStateRDDSuite extends SparkFunSuite with RDDCheckpointTester with Bef
         }
       }
 
-      val updatedRecord = TrackStateRDDRecord.updateRecordWithData[String, String, Int, Int](
+      val updatedRecord = MapWithStateRDDRecord.updateRecordWithData[String, String, Int, Int](
         Some(record), dataIterator, testFunc,
         Time(updatedTime), timeoutThreshold, removeTimedoutData)
 
       val updatedStateData = updatedRecord.stateMap.getAll().map { x => (x._2, x._3) }
       assert(updatedStateData.toSet === expectedStates.toSet,
-        "states do not match after updating the TrackStateRecord")
+        "states do not match after updating the MapWithStateRDDRecord")
 
-      assert(updatedRecord.emittedRecords.toSet === expectedOutput.toSet,
-        "emitted data do not match after updating the TrackStateRecord")
+      assert(updatedRecord.mappedData.toSet === expectedOutput.toSet,
+        "mapped data do not match after updating the MapWithStateRDDRecord")
 
       assert(timingOutStates.toSet === expectedTimingOutStates.toSet, "timing out states do not " +
-        "match those that were expected to do so while updating the TrackStateRecord")
+        "match those that were expected to do so while updating the MapWithStateRDDRecord")
 
       assert(removedStates.toSet === expectedRemovedStates.toSet, "removed states do not " +
-        "match those that were expected to do so while updating the TrackStateRecord")
+        "match those that were expected to do so while updating the MapWithStateRDDRecord")
 
     }
 
@@ -187,12 +187,12 @@ class TrackStateRDDSuite extends SparkFunSuite with RDDCheckpointTester with Bef
 
   }
 
-  test("states generated by TrackStateRDD") {
+  test("states generated by MapWithStateRDD") {
     val initStates = Seq(("k1", 0), ("k2", 0))
     val initTime = 123
     val initStateWthTime = initStates.map { x => (x._1, x._2, initTime) }.toSet
     val partitioner = new HashPartitioner(2)
-    val initStateRDD = TrackStateRDD.createFromPairRDD[String, Int, Int, Int](
+    val initStateRDD = MapWithStateRDD.createFromPairRDD[String, Int, Int, Int](
       sc.parallelize(initStates), partitioner, Time(initTime)).persist()
     assertRDD(initStateRDD, initStateWthTime, Set.empty)
 
@@ -203,21 +203,21 @@ class TrackStateRDDSuite extends SparkFunSuite with RDDCheckpointTester with Bef
      * creates a new state RDD with expected states
      */
     def testStateUpdates(
-        testStateRDD: TrackStateRDD[String, Int, Int, Int],
+        testStateRDD: MapWithStateRDD[String, Int, Int, Int],
         testData: Seq[(String, Int)],
-        expectedStates: Set[(String, Int, Int)]): TrackStateRDD[String, Int, Int, Int] = {
+        expectedStates: Set[(String, Int, Int)]): MapWithStateRDD[String, Int, Int, Int] = {
 
-      // Persist the test TrackStateRDD so that its not recomputed while doing the next operation.
-      // This is to make sure that we only track which state keys are being touched in the next op.
+      // Persist the test MapWithStateRDD so that it's not recomputed during the next operation.
+      // This is to make sure that we only track which state keys are being touched in the next op.
       testStateRDD.persist().count()
 
       // To track which keys are being touched
-      TrackStateRDDSuite.touchedStateKeys.clear()
+      MapWithStateRDDSuite.touchedStateKeys.clear()
 
-      val trackingFunc = (time: Time, key: String, data: Option[Int], state: State[Int]) => {
+      val mappingFunction = (time: Time, key: String, data: Option[Int], state: State[Int]) => {
 
         // Track the key that has been touched
-        TrackStateRDDSuite.touchedStateKeys += key
+        MapWithStateRDDSuite.touchedStateKeys += key
 
         // If the data is 0, do not do anything with the state
         // else if the data is 1, increment the state if it exists, or set new state to 0
@@ -236,12 +236,12 @@ class TrackStateRDDSuite extends SparkFunSuite with RDDCheckpointTester with Bef
 
       // Assert that the new state RDD has expected state data
       val newStateRDD = assertOperation(
-        testStateRDD, newDataRDD, trackingFunc, updateTime, expectedStates, Set.empty)
+        testStateRDD, newDataRDD, mappingFunction, updateTime, expectedStates, Set.empty)
 
       // Assert that the function was called only for the keys present in the data
-      assert(TrackStateRDDSuite.touchedStateKeys.size === testData.size,
+      assert(MapWithStateRDDSuite.touchedStateKeys.size === testData.size,
         "More number of keys are being touched than that is expected")
-      assert(TrackStateRDDSuite.touchedStateKeys.toSet === testData.toMap.keys,
+      assert(MapWithStateRDDSuite.touchedStateKeys.toSet === testData.toMap.keys,
         "Keys not in the data are being touched unexpectedly")
 
       // Assert that the test RDD's data has not changed
@@ -289,19 +289,19 @@ class TrackStateRDDSuite extends SparkFunSuite with RDDCheckpointTester with Bef
 
   test("checkpointing") {
     /**
-     * This tests whether the TrackStateRDD correctly truncates any references to its parent RDDs -
-     * the data RDD and the parent TrackStateRDD.
+     * This tests whether the MapWithStateRDD correctly truncates any references to its parent RDDs
+     * - the data RDD and the parent MapWithStateRDD.
      */
-    def rddCollectFunc(rdd: RDD[TrackStateRDDRecord[Int, Int, Int]])
+    def rddCollectFunc(rdd: RDD[MapWithStateRDDRecord[Int, Int, Int]])
       : Set[(List[(Int, Int, Long)], List[Int])] = {
-      rdd.map { record => (record.stateMap.getAll().toList, record.emittedRecords.toList) }
+      rdd.map { record => (record.stateMap.getAll().toList, record.mappedData.toList) }
          .collect.toSet
     }
 
-    /** Generate TrackStateRDD with data RDD having a long lineage */
+    /** Generate MapWithStateRDD with data RDD having a long lineage */
     def makeStateRDDWithLongLineageDataRDD(longLineageRDD: RDD[Int])
-      : TrackStateRDD[Int, Int, Int, Int] = {
-      TrackStateRDD.createFromPairRDD(longLineageRDD.map { _ -> 1}, partitioner, Time(0))
+      : MapWithStateRDD[Int, Int, Int, Int] = {
+      MapWithStateRDD.createFromPairRDD(longLineageRDD.map { _ -> 1}, partitioner, Time(0))
     }
 
     testRDD(
@@ -309,15 +309,15 @@ class TrackStateRDDSuite extends SparkFunSuite with RDDCheckpointTester with Bef
     testRDDPartitions(
       makeStateRDDWithLongLineageDataRDD, reliableCheckpoint = true, rddCollectFunc _)
 
-    /** Generate TrackStateRDD with parent state RDD having a long lineage */
+    /** Generate MapWithStateRDD with parent state RDD having a long lineage */
     def makeStateRDDWithLongLineageParenttateRDD(
-        longLineageRDD: RDD[Int]): TrackStateRDD[Int, Int, Int, Int] = {
+        longLineageRDD: RDD[Int]): MapWithStateRDD[Int, Int, Int, Int] = {
 
-      // Create a TrackStateRDD that has a long lineage using the data RDD with a long lineage
+      // Create a MapWithStateRDD that has a long lineage using the data RDD with a long lineage
       val stateRDDWithLongLineage = makeStateRDDWithLongLineageDataRDD(longLineageRDD)
 
-      // Create a new TrackStateRDD, with the lineage lineage TrackStateRDD as the parent
-      new TrackStateRDD[Int, Int, Int, Int](
+      // Create a new MapWithStateRDD, with the long lineage MapWithStateRDD as the parent
+      new MapWithStateRDD[Int, Int, Int, Int](
         stateRDDWithLongLineage,
         stateRDDWithLongLineage.sparkContext.emptyRDD[(Int, Int)].partitionBy(partitioner),
         (time: Time, key: Int, value: Option[Int], state: State[Int]) => None,
@@ -333,25 +333,25 @@ class TrackStateRDDSuite extends SparkFunSuite with RDDCheckpointTester with Bef
   }
 
   test("checkpointing empty state RDD") {
-    val emptyStateRDD = TrackStateRDD.createFromPairRDD[Int, Int, Int, Int](
+    val emptyStateRDD = MapWithStateRDD.createFromPairRDD[Int, Int, Int, Int](
       sc.emptyRDD[(Int, Int)], new HashPartitioner(10), Time(0))
     emptyStateRDD.checkpoint()
     assert(emptyStateRDD.flatMap { _.stateMap.getAll() }.collect().isEmpty)
-    val cpRDD = sc.checkpointFile[TrackStateRDDRecord[Int, Int, Int]](
+    val cpRDD = sc.checkpointFile[MapWithStateRDDRecord[Int, Int, Int]](
       emptyStateRDD.getCheckpointFile.get)
     assert(cpRDD.flatMap { _.stateMap.getAll() }.collect().isEmpty)
   }
 
-  /** Assert whether the `trackStateByKey` operation generates expected results */
+  /** Assert whether the `mapWithState` operation generates expected results */
   private def assertOperation[K: ClassTag, V: ClassTag, S: ClassTag, T: ClassTag](
-      testStateRDD: TrackStateRDD[K, V, S, T],
+      testStateRDD: MapWithStateRDD[K, V, S, T],
       newDataRDD: RDD[(K, V)],
-      trackStateFunc: (Time, K, Option[V], State[S]) => Option[T],
+      mappingFunction: (Time, K, Option[V], State[S]) => Option[T],
       currentTime: Long,
       expectedStates: Set[(K, S, Int)],
-      expectedEmittedRecords: Set[T],
+      expectedMappedData: Set[T],
       doFullScan: Boolean = false
-    ): TrackStateRDD[K, V, S, T] = {
+    ): MapWithStateRDD[K, V, S, T] = {
 
     val partitionedNewDataRDD = if (newDataRDD.partitioner != testStateRDD.partitioner) {
       newDataRDD.partitionBy(testStateRDD.partitioner.get)
@@ -359,31 +359,31 @@ class TrackStateRDDSuite extends SparkFunSuite with RDDCheckpointTester with Bef
       newDataRDD
     }
 
-    val newStateRDD = new TrackStateRDD[K, V, S, T](
-      testStateRDD, newDataRDD, trackStateFunc, Time(currentTime), None)
+    val newStateRDD = new MapWithStateRDD[K, V, S, T](
+      testStateRDD, newDataRDD, mappingFunction, Time(currentTime), None)
     if (doFullScan) newStateRDD.setFullScan()
 
     // Persist to make sure that it gets computed only once and we can track precisely how many
     // state keys the computing touched
     newStateRDD.persist().count()
-    assertRDD(newStateRDD, expectedStates, expectedEmittedRecords)
+    assertRDD(newStateRDD, expectedStates, expectedMappedData)
     newStateRDD
   }
 
-  /** Assert whether the [[TrackStateRDD]] has the expected state ad emitted records */
+  /** Assert whether the [[MapWithStateRDD]] has the expected state and mapped data */
   private def assertRDD[K: ClassTag, V: ClassTag, S: ClassTag, T: ClassTag](
-      trackStateRDD: TrackStateRDD[K, V, S, T],
+      stateRDD: MapWithStateRDD[K, V, S, T],
       expectedStates: Set[(K, S, Int)],
-      expectedEmittedRecords: Set[T]): Unit = {
-    val states = trackStateRDD.flatMap { _.stateMap.getAll() }.collect().toSet
-    val emittedRecords = trackStateRDD.flatMap { _.emittedRecords }.collect().toSet
+      expectedMappedData: Set[T]): Unit = {
+    val states = stateRDD.flatMap { _.stateMap.getAll() }.collect().toSet
+    val mappedData = stateRDD.flatMap { _.mappedData }.collect().toSet
     assert(states === expectedStates,
-      "states after track state operation were not as expected")
-    assert(emittedRecords === expectedEmittedRecords,
-      "emitted records after track state operation were not as expected")
+      "states after mapWithState operation were not as expected")
+    assert(mappedData === expectedMappedData,
+      "mapped data after mapWithState operation were not as expected")
   }
 }
 
-object TrackStateRDDSuite {
+object MapWithStateRDDSuite {
   private val touchedStateKeys = new ArrayBuffer[String]()
 }