Commit bc6ed59

Merge SPARK-27159 fix into SPARK-27168

2 parents ab474a4 + 5564fe5
95 files changed (+1429, -554 lines)

R/pkg/R/DataFrame.R

Lines changed: 8 additions & 7 deletions
@@ -2520,8 +2520,9 @@ setMethod("dropDuplicates",
 #' Column expression. If joinExpr is omitted, the default, inner join is attempted and an error is
 #' thrown if it would be a Cartesian Product. For Cartesian join, use crossJoin instead.
 #' @param joinType The type of join to perform, default 'inner'.
-#'                 Must be one of: 'inner', 'cross', 'outer', 'full', 'full_outer',
-#'                 'left', 'left_outer', 'right', 'right_outer', 'left_semi', or 'left_anti'.
+#'                 Must be one of: 'inner', 'cross', 'outer', 'full', 'fullouter', 'full_outer',
+#'                 'left', 'leftouter', 'left_outer', 'right', 'rightouter', 'right_outer', 'semi',
+#'                 'leftsemi', 'left_semi', 'anti', 'leftanti', 'left_anti'.
 #' @return A SparkDataFrame containing the result of the join operation.
 #' @family SparkDataFrame functions
 #' @aliases join,SparkDataFrame,SparkDataFrame-method
@@ -2553,14 +2554,14 @@ setMethod("join",
                     "outer", "full", "fullouter", "full_outer",
                     "left", "leftouter", "left_outer",
                     "right", "rightouter", "right_outer",
-                    "left_semi", "leftsemi", "left_anti", "leftanti")) {
+                    "semi", "left_semi", "leftsemi", "anti", "left_anti", "leftanti")) {
                   joinType <- gsub("_", "", joinType)
                   sdf <- callJMethod(x@sdf, "join", y@sdf, joinExpr@jc, joinType)
                 } else {
-                  stop("joinType must be one of the following types: ",
-                       "'inner', 'cross', 'outer', 'full', 'full_outer',",
-                       "'left', 'left_outer', 'right', 'right_outer',",
-                       "'left_semi', or 'left_anti'.")
+                  stop(paste("joinType must be one of the following types:",
+                             "'inner', 'cross', 'outer', 'full', 'fullouter', 'full_outer',",
+                             "'left', 'leftouter', 'left_outer', 'right', 'rightouter', 'right_outer',",
+                             "'semi', 'leftsemi', 'left_semi', 'anti', 'leftanti' or 'left_anti'."))
                 }
               }
             }
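For reference, SparkR's join() strips underscores from joinType (gsub in the hunk above) and forwards the string to the JVM Dataset.join, which in the matching Spark version accepts the same family of join-type strings. A minimal Scala sketch of the equivalent call on the JVM side, written for spark-shell (the toy tables and column names are illustrative, not part of this commit):

import spark.implicits._

val people = Seq(("Alice", 30), ("Bob", 25)).toDF("name", "age")
val tests = Seq(("Alice", "pass"), ("Carol", "fail")).toDF("name", "test")

// A semi join keeps only the matching left rows and only the left-side columns.
val semi = people.join(tests, people("name") === tests("name"), "semi")

// An anti join keeps the left rows that have no match on the right.
val anti = people.join(tests, people("name") === tests("name"), "anti")

semi.show()  // only Alice
anti.show()  // only Bob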

R/pkg/tests/fulltests/test_sparkSQL.R

Lines changed: 80 additions & 25 deletions
@@ -2356,40 +2356,95 @@ test_that("join(), crossJoin() and merge() on a DataFrame", {
   expect_equal(names(joined2), c("age", "name", "name", "test"))
   expect_equal(count(joined2), 3)
 
-  joined3 <- join(df, df2, df$name == df2$name, "rightouter")
+  joined3 <- join(df, df2, df$name == df2$name, "right")
   expect_equal(names(joined3), c("age", "name", "name", "test"))
   expect_equal(count(joined3), 4)
   expect_true(is.na(collect(orderBy(joined3, joined3$age))$age[2]))
-
-  joined4 <- select(join(df, df2, df$name == df2$name, "outer"),
-                    alias(df$age + 5, "newAge"), df$name, df2$test)
-  expect_equal(names(joined4), c("newAge", "name", "test"))
+
+  joined4 <- join(df, df2, df$name == df2$name, "right_outer")
+  expect_equal(names(joined4), c("age", "name", "name", "test"))
   expect_equal(count(joined4), 4)
-  expect_equal(collect(orderBy(joined4, joined4$name))$newAge[3], 24)
+  expect_true(is.na(collect(orderBy(joined4, joined4$age))$age[2]))
 
-  joined5 <- join(df, df2, df$name == df2$name, "leftouter")
+  joined5 <- join(df, df2, df$name == df2$name, "rightouter")
   expect_equal(names(joined5), c("age", "name", "name", "test"))
-  expect_equal(count(joined5), 3)
-  expect_true(is.na(collect(orderBy(joined5, joined5$age))$age[1]))
-
-  joined6 <- join(df, df2, df$name == df2$name, "inner")
-  expect_equal(names(joined6), c("age", "name", "name", "test"))
-  expect_equal(count(joined6), 3)
+  expect_equal(count(joined5), 4)
+  expect_true(is.na(collect(orderBy(joined5, joined5$age))$age[2]))
 
-  joined7 <- join(df, df2, df$name == df2$name, "leftsemi")
-  expect_equal(names(joined7), c("age", "name"))
-  expect_equal(count(joined7), 3)
 
-  joined8 <- join(df, df2, df$name == df2$name, "left_outer")
-  expect_equal(names(joined8), c("age", "name", "name", "test"))
-  expect_equal(count(joined8), 3)
-  expect_true(is.na(collect(orderBy(joined8, joined8$age))$age[1]))
-
-  joined9 <- join(df, df2, df$name == df2$name, "right_outer")
-  expect_equal(names(joined9), c("age", "name", "name", "test"))
+  joined6 <- select(join(df, df2, df$name == df2$name, "outer"),
+                    alias(df$age + 5, "newAge"), df$name, df2$test)
+  expect_equal(names(joined6), c("newAge", "name", "test"))
+  expect_equal(count(joined6), 4)
+  expect_equal(collect(orderBy(joined6, joined6$name))$newAge[3], 24)
+
+  joined7 <- select(join(df, df2, df$name == df2$name, "full"),
+                    alias(df$age + 5, "newAge"), df$name, df2$test)
+  expect_equal(names(joined7), c("newAge", "name", "test"))
+  expect_equal(count(joined7), 4)
+  expect_equal(collect(orderBy(joined7, joined7$name))$newAge[3], 24)
+
+  joined8 <- select(join(df, df2, df$name == df2$name, "fullouter"),
+                    alias(df$age + 5, "newAge"), df$name, df2$test)
+  expect_equal(names(joined8), c("newAge", "name", "test"))
+  expect_equal(count(joined8), 4)
+  expect_equal(collect(orderBy(joined8, joined8$name))$newAge[3], 24)
+
+  joined9 <- select(join(df, df2, df$name == df2$name, "full_outer"),
+                    alias(df$age + 5, "newAge"), df$name, df2$test)
+  expect_equal(names(joined9), c("newAge", "name", "test"))
   expect_equal(count(joined9), 4)
-  expect_true(is.na(collect(orderBy(joined9, joined9$age))$age[2]))
-
+  expect_equal(collect(orderBy(joined9, joined9$name))$newAge[3], 24)
+
+  joined10 <- join(df, df2, df$name == df2$name, "left")
+  expect_equal(names(joined10), c("age", "name", "name", "test"))
+  expect_equal(count(joined10), 3)
+  expect_true(is.na(collect(orderBy(joined10, joined10$age))$age[1]))
+
+  joined11 <- join(df, df2, df$name == df2$name, "leftouter")
+  expect_equal(names(joined11), c("age", "name", "name", "test"))
+  expect_equal(count(joined11), 3)
+  expect_true(is.na(collect(orderBy(joined11, joined11$age))$age[1]))
+
+  joined12 <- join(df, df2, df$name == df2$name, "left_outer")
+  expect_equal(names(joined12), c("age", "name", "name", "test"))
+  expect_equal(count(joined12), 3)
+  expect_true(is.na(collect(orderBy(joined12, joined12$age))$age[1]))
+
+  joined13 <- join(df, df2, df$name == df2$name, "inner")
+  expect_equal(names(joined13), c("age", "name", "name", "test"))
+  expect_equal(count(joined13), 3)
+
+  joined14 <- join(df, df2, df$name == df2$name, "semi")
+  expect_equal(names(joined14), c("age", "name"))
+  expect_equal(count(joined14), 3)
+
+  joined14 <- join(df, df2, df$name == df2$name, "leftsemi")
+  expect_equal(names(joined14), c("age", "name"))
+  expect_equal(count(joined14), 3)
+
+  joined15 <- join(df, df2, df$name == df2$name, "left_semi")
+  expect_equal(names(joined15), c("age", "name"))
+  expect_equal(count(joined15), 3)
+
+  joined16 <- join(df2, df, df2$name == df$name, "anti")
+  expect_equal(names(joined16), c("name", "test"))
+  expect_equal(count(joined16), 1)
+
+  joined17 <- join(df2, df, df2$name == df$name, "leftanti")
+  expect_equal(names(joined17), c("name", "test"))
+  expect_equal(count(joined17), 1)
+
+  joined18 <- join(df2, df, df2$name == df$name, "left_anti")
+  expect_equal(names(joined18), c("name", "test"))
+  expect_equal(count(joined18), 1)
+
+  error_msg <- paste("joinType must be one of the following types:",
+                     "'inner', 'cross', 'outer', 'full', 'fullouter', 'full_outer',",
+                     "'left', 'leftouter', 'left_outer', 'right', 'rightouter', 'right_outer',",
+                     "'semi', 'leftsemi', 'left_semi', 'anti', 'leftanti' or 'left_anti'.")
+  expect_error(join(df2, df, df2$name == df$name, "invalid"), error_msg)
+
   merged <- merge(df, df2, by.x = "name", by.y = "name", all.x = TRUE, all.y = TRUE)
   expect_equal(count(merged), 4)
   expect_equal(names(merged), c("age", "name_x", "name_y", "test"))
Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
+================================================================================================
+Coalesced RDD , large scale
+================================================================================================
+
+Java HotSpot(TM) 64-Bit Server VM 1.8.0_201-b09 on Windows 10 10.0
+Intel64 Family 6 Model 63 Stepping 2, GenuineIntel
+Coalesced RDD: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------------------------------
+Coalesce Num Partitions: 100 Num Hosts: 1 346 364 24 0.3 3458.9 1.0X
+Coalesce Num Partitions: 100 Num Hosts: 5 258 264 6 0.4 2579.0 1.3X
+Coalesce Num Partitions: 100 Num Hosts: 10 242 249 7 0.4 2415.2 1.4X
+Coalesce Num Partitions: 100 Num Hosts: 20 237 242 7 0.4 2371.7 1.5X
+Coalesce Num Partitions: 100 Num Hosts: 40 230 231 1 0.4 2299.8 1.5X
+Coalesce Num Partitions: 100 Num Hosts: 80 222 233 14 0.4 2223.0 1.6X
+Coalesce Num Partitions: 500 Num Hosts: 1 659 665 5 0.2 6590.4 0.5X
+Coalesce Num Partitions: 500 Num Hosts: 5 340 381 47 0.3 3395.2 1.0X
+Coalesce Num Partitions: 500 Num Hosts: 10 279 307 47 0.4 2788.3 1.2X
+Coalesce Num Partitions: 500 Num Hosts: 20 259 261 2 0.4 2591.9 1.3X
+Coalesce Num Partitions: 500 Num Hosts: 40 241 250 15 0.4 2406.5 1.4X
+Coalesce Num Partitions: 500 Num Hosts: 80 235 237 3 0.4 2349.9 1.5X
+Coalesce Num Partitions: 1000 Num Hosts: 1 1050 1053 4 0.1 10503.2 0.3X
+Coalesce Num Partitions: 1000 Num Hosts: 5 405 407 2 0.2 4049.5 0.9X
+Coalesce Num Partitions: 1000 Num Hosts: 10 320 322 2 0.3 3202.7 1.1X
+Coalesce Num Partitions: 1000 Num Hosts: 20 276 277 0 0.4 2762.3 1.3X
+Coalesce Num Partitions: 1000 Num Hosts: 40 257 260 5 0.4 2571.2 1.3X
+Coalesce Num Partitions: 1000 Num Hosts: 80 245 252 13 0.4 2448.9 1.4X
+Coalesce Num Partitions: 5000 Num Hosts: 1 3099 3145 55 0.0 30988.6 0.1X
+Coalesce Num Partitions: 5000 Num Hosts: 5 1037 1050 20 0.1 10374.4 0.3X
+Coalesce Num Partitions: 5000 Num Hosts: 10 626 633 8 0.2 6261.8 0.6X
+Coalesce Num Partitions: 5000 Num Hosts: 20 426 431 5 0.2 4258.6 0.8X
+Coalesce Num Partitions: 5000 Num Hosts: 40 328 341 22 0.3 3275.4 1.1X
+Coalesce Num Partitions: 5000 Num Hosts: 80 272 275 4 0.4 2721.4 1.3X
+Coalesce Num Partitions: 10000 Num Hosts: 1 5516 5526 9 0.0 55156.8 0.1X
+Coalesce Num Partitions: 10000 Num Hosts: 5 1956 1992 48 0.1 19560.9 0.2X
+Coalesce Num Partitions: 10000 Num Hosts: 10 1045 1057 18 0.1 10447.4 0.3X
+Coalesce Num Partitions: 10000 Num Hosts: 20 637 658 24 0.2 6373.2 0.5X
+Coalesce Num Partitions: 10000 Num Hosts: 40 431 448 15 0.2 4312.9 0.8X
+Coalesce Num Partitions: 10000 Num Hosts: 80 326 328 2 0.3 3263.4 1.1X
+
+
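The table above is raw output from an RDD coalesce benchmark run (best/average time per configuration, with the relative column normalized within each group). As a rough spark-shell illustration of the operation being timed (the sizes and the ad-hoc timing loop here are made up, not the benchmark's actual harness):

// Build an RDD with many small partitions, then coalesce without a shuffle.
val data = sc.parallelize(1 to 1000000, numSlices = 10000)
val start = System.nanoTime()
data.coalesce(100, shuffle = false).count()
println(s"coalesce + count took ${(System.nanoTime() - start) / 1e6} ms")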

core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala

Lines changed: 39 additions & 29 deletions
@@ -450,27 +450,32 @@ private[deploy] class Worker(
       // rpcEndpoint.
       // Copy ids so that it can be used in the cleanup thread.
       val appIds = (executors.values.map(_.appId) ++ drivers.values.map(_.driverId)).toSet
-      val cleanupFuture = concurrent.Future {
-        val appDirs = workDir.listFiles()
-        if (appDirs == null) {
-          throw new IOException("ERROR: Failed to list files in " + appDirs)
-        }
-        appDirs.filter { dir =>
-          // the directory is used by an application - check that the application is not running
-          // when cleaning up
-          val appIdFromDir = dir.getName
-          val isAppStillRunning = appIds.contains(appIdFromDir)
-          dir.isDirectory && !isAppStillRunning &&
-            !Utils.doesDirectoryContainAnyNewFiles(dir, APP_DATA_RETENTION_SECONDS)
-        }.foreach { dir =>
-          logInfo(s"Removing directory: ${dir.getPath}")
-          Utils.deleteRecursively(dir)
-        }
-      }(cleanupThreadExecutor)
+      try {
+        val cleanupFuture: concurrent.Future[Unit] = concurrent.Future {
+          val appDirs = workDir.listFiles()
+          if (appDirs == null) {
+            throw new IOException("ERROR: Failed to list files in " + appDirs)
+          }
+          appDirs.filter { dir =>
+            // the directory is used by an application - check that the application is not running
+            // when cleaning up
+            val appIdFromDir = dir.getName
+            val isAppStillRunning = appIds.contains(appIdFromDir)
+            dir.isDirectory && !isAppStillRunning &&
+              !Utils.doesDirectoryContainAnyNewFiles(dir, APP_DATA_RETENTION_SECONDS)
+          }.foreach { dir =>
+            logInfo(s"Removing directory: ${dir.getPath}")
+            Utils.deleteRecursively(dir)
+          }
+        }(cleanupThreadExecutor)
 
-      cleanupFuture.failed.foreach(e =>
-        logError("App dir cleanup failed: " + e.getMessage, e)
-      )(cleanupThreadExecutor)
+        cleanupFuture.failed.foreach(e =>
+          logError("App dir cleanup failed: " + e.getMessage, e)
+        )(cleanupThreadExecutor)
+      } catch {
+        case _: RejectedExecutionException if cleanupThreadExecutor.isShutdown =>
+          logWarning("Failed to cleanup work dir as executor pool was shutdown")
+      }
 
     case MasterChanged(masterRef, masterWebUiUrl) =>
       logInfo("Master has changed, new master is at " + masterRef.address.toSparkURL)
@@ -634,15 +639,20 @@ private[deploy] class Worker(
     val shouldCleanup = finishedApps.contains(id) && !executors.values.exists(_.appId == id)
     if (shouldCleanup) {
       finishedApps -= id
-      appDirectories.remove(id).foreach { dirList =>
-        concurrent.Future {
-          logInfo(s"Cleaning up local directories for application $id")
-          dirList.foreach { dir =>
-            Utils.deleteRecursively(new File(dir))
-          }
-        }(cleanupThreadExecutor).failed.foreach(e =>
-          logError(s"Clean up app dir $dirList failed: ${e.getMessage}", e)
-        )(cleanupThreadExecutor)
+      try {
+        appDirectories.remove(id).foreach { dirList =>
+          concurrent.Future {
+            logInfo(s"Cleaning up local directories for application $id")
+            dirList.foreach { dir =>
+              Utils.deleteRecursively(new File(dir))
+            }
+          }(cleanupThreadExecutor).failed.foreach(e =>
+            logError(s"Clean up app dir $dirList failed: ${e.getMessage}", e)
+          )(cleanupThreadExecutor)
+        }
+      } catch {
+        case _: RejectedExecutionException if cleanupThreadExecutor.isShutdown =>
+          logWarning("Failed to cleanup application as executor pool was shutdown")
      }
      shuffleService.applicationRemoved(id)
    }
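Both hunks wrap the Future submission in try/catch so that a RejectedExecutionException from an already-shut-down cleanup pool is downgraded to a warning instead of escaping the Worker's message handling. A standalone sketch of the same pattern, with the pool and logging simplified and all names illustrative rather than taken from Worker.scala:

import java.util.concurrent.{Executors, RejectedExecutionException}
import scala.concurrent.{ExecutionContext, Future}

object CleanupSketch {
  private val cleanupPool = Executors.newSingleThreadExecutor()
  private val cleanupEc = ExecutionContext.fromExecutorService(cleanupPool)

  def scheduleCleanup(work: () => Unit): Unit = {
    try {
      // Submission itself can be rejected once the pool has been shut down,
      // so the Future construction and its failure callback both sit inside the try.
      val f = Future(work())(cleanupEc)
      f.failed.foreach(e => println(s"cleanup failed: ${e.getMessage}"))(cleanupEc)
    } catch {
      case _: RejectedExecutionException if cleanupPool.isShutdown =>
        println("cleanup pool already shut down; skipping cleanup")
    }
  }
}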

core/src/main/scala/org/apache/spark/executor/Executor.scala

Lines changed: 2 additions & 2 deletions
@@ -175,8 +175,8 @@ private[spark] class Executor(
 
   /**
    * When an executor is unable to send heartbeats to the driver more than `HEARTBEAT_MAX_FAILURES`
-   * times, it should kill itself. The default value is 60. It means we will retry to send
-   * heartbeats about 10 minutes because the heartbeat interval is 10s.
+   * times, it should kill itself. The default value is 60. For example, if max failures is 60 and
+   * heartbeat interval is 10s, then it will try to send heartbeats for up to 600s (10 minutes).
    */
   private val HEARTBEAT_MAX_FAILURES = conf.get(EXECUTOR_HEARTBEAT_MAX_FAILURES)
 
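A quick REPL-style check of the arithmetic in the reworded comment (the 60-failure default comes from the config constant referenced above; the 10s interval is the usual executor heartbeat interval and is assumed here purely for illustration):

val heartbeatIntervalMs = 10 * 1000L  // assumed 10s heartbeat interval
val maxFailures = 60                  // default for the max-failures config above
val worstCaseMs = heartbeatIntervalMs * maxFailures
println(s"heartbeats keep being retried for ~${worstCaseMs / 1000}s, i.e. ${worstCaseMs / 60000} minutes")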

Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.internal.config
+
+import java.util.concurrent.TimeUnit
+
+private[spark] object Streaming {
+
+  private[spark] val STREAMING_DYN_ALLOCATION_ENABLED =
+    ConfigBuilder("spark.streaming.dynamicAllocation.enabled")
+      .booleanConf
+      .createWithDefault(false)
+
+  private[spark] val STREAMING_DYN_ALLOCATION_TESTING =
+    ConfigBuilder("spark.streaming.dynamicAllocation.testing")
+      .booleanConf
+      .createWithDefault(false)
+
+  private[spark] val STREAMING_DYN_ALLOCATION_MIN_EXECUTORS =
+    ConfigBuilder("spark.streaming.dynamicAllocation.minExecutors")
+      .intConf
+      .checkValue(_ > 0, "The min executor number of streaming dynamic " +
+        "allocation must be positive.")
+      .createOptional
+
+  private[spark] val STREAMING_DYN_ALLOCATION_MAX_EXECUTORS =
+    ConfigBuilder("spark.streaming.dynamicAllocation.maxExecutors")
+      .intConf
+      .checkValue(_ > 0, "The max executor number of streaming dynamic " +
+        "allocation must be positive.")
+      .createWithDefault(Int.MaxValue)
+
+  private[spark] val STREAMING_DYN_ALLOCATION_SCALING_INTERVAL =
+    ConfigBuilder("spark.streaming.dynamicAllocation.scalingInterval")
+      .timeConf(TimeUnit.SECONDS)
+      .checkValue(_ > 0, "The scaling interval of streaming dynamic " +
+        "allocation must be positive.")
+      .createWithDefault(60)
+
+  private[spark] val STREAMING_DYN_ALLOCATION_SCALING_UP_RATIO =
+    ConfigBuilder("spark.streaming.dynamicAllocation.scalingUpRatio")
+      .doubleConf
+      .checkValue(_ > 0, "The scaling up ratio of streaming dynamic " +
+        "allocation must be positive.")
+      .createWithDefault(0.9)
+
+  private[spark] val STREAMING_DYN_ALLOCATION_SCALING_DOWN_RATIO =
+    ConfigBuilder("spark.streaming.dynamicAllocation.scalingDownRatio")
+      .doubleConf
+      .checkValue(_ > 0, "The scaling down ratio of streaming dynamic " +
+        "allocation must be positive.")
+      .createWithDefault(0.3)
+}
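These entries are private[spark], so they are read from code inside the org.apache.spark namespace rather than by applications. A hedged sketch of how such consuming code would read them, where the SparkConf values set here are purely illustrative:

import org.apache.spark.SparkConf
import org.apache.spark.internal.config.Streaming._

val conf = new SparkConf()
  .set("spark.streaming.dynamicAllocation.enabled", "true")
  .set("spark.streaming.dynamicAllocation.scalingInterval", "30s")

val enabled = conf.get(STREAMING_DYN_ALLOCATION_ENABLED)               // Boolean, defaults to false
val intervalSecs = conf.get(STREAMING_DYN_ALLOCATION_SCALING_INTERVAL) // Long seconds, defaults to 60
val minExecutors = conf.get(STREAMING_DYN_ALLOCATION_MIN_EXECUTORS)    // Option[Int], no default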

core/src/main/scala/org/apache/spark/internal/config/package.scala

Lines changed: 1 addition & 0 deletions
@@ -301,6 +301,7 @@ package object config {
       .doc("Number of subdirectories inside each path listed in spark.local.dir for " +
         "hashing Block files into.")
       .intConf
+      .checkValue(_ > 0, "The number of subdirectories must be positive.")
       .createWithDefault(64)
 
   private[spark] val BLOCK_FAILURES_BEFORE_LOCATION_REFRESH =
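The added checkValue means an out-of-range setting now fails fast when the entry is read rather than silently propagating. A small sketch of the same builder pattern with a hypothetical key (not the entry above, whose key lies outside this hunk), placed under org.apache.spark so the private[spark] builder and SparkConf.get(entry) are visible:

package org.apache.spark.example

import org.apache.spark.SparkConf
import org.apache.spark.internal.config.ConfigBuilder

object SubDirConfigSketch {
  // Hypothetical entry mirroring the pattern in the hunk above.
  val SUB_DIR_COUNT = ConfigBuilder("spark.example.subDirectories")
    .intConf
    .checkValue(_ > 0, "The number of subdirectories must be positive.")
    .createWithDefault(64)

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().set("spark.example.subDirectories", "0")
    // Reading the entry runs the validator, so this line throws
    // IllegalArgumentException("The number of subdirectories must be positive.").
    println(conf.get(SUB_DIR_COUNT))
  }
}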
