Commit 2c997c5

Merge branch 'master' of https://github.com/apache/spark
2 parents: 00bc819 + d3a302d

49 files changed: +2317, -1332 lines


R/pkg/R/DataFrame.R

Lines changed: 7 additions & 1 deletion
@@ -790,9 +790,12 @@ setMethod("$", signature(x = "DataFrame"),
 
 setMethod("$<-", signature(x = "DataFrame"),
           function(x, name, value) {
-            stopifnot(class(value) == "Column")
+            stopifnot(class(value) == "Column" || is.null(value))
             cols <- columns(x)
             if (name %in% cols) {
+              if (is.null(value)) {
+                cols <- Filter(function(c) { c != name }, cols)
+              }
               cols <- lapply(cols, function(c) {
                 if (c == name) {
                   alias(value, name)
@@ -802,6 +805,9 @@ setMethod("$<-", signature(x = "DataFrame"),
               })
               nx <- select(x, cols)
             } else {
+              if (is.null(value)) {
+                return(x)
+              }
               nx <- withColumn(x, name, value)
             }
             x@sdf <- nx@sdf

R/pkg/inst/tests/test_sparkSQL.R

Lines changed: 5 additions & 0 deletions
@@ -449,6 +449,11 @@ test_that("select operators", {
   df$age2 <- df$age * 2
   expect_equal(columns(df), c("name", "age", "age2"))
   expect_equal(count(where(df, df$age2 == df$age * 2)), 2)
+
+  df$age2 <- NULL
+  expect_equal(columns(df), c("name", "age"))
+  df$age3 <- NULL
+  expect_equal(columns(df), c("name", "age"))
 })
 
 test_that("select with column", {
core/src/main/java/org/apache/spark/api/java/function/Function0.java

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.api.java.function;
+
+import java.io.Serializable;
+
+/**
+ * A zero-argument function that returns an R.
+ */
+public interface Function0<R> extends Serializable {
+  public R call() throws Exception;
+}
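
For context, a minimal sketch of how this new zero-argument interface can be implemented and invoked from Scala; the object and value names below are illustrative, not part of the commit.

import org.apache.spark.api.java.function.Function0

object Function0Sketch {
  // Anonymous implementation of the new interface; call() takes no arguments and returns an R.
  val makeGreeting: Function0[String] = new Function0[String] {
    override def call(): String = "hello from a zero-argument function"
  }

  def main(args: Array[String]): Unit = {
    println(makeGreeting.call())  // prints the string produced by call()
  }
}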

core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala

Lines changed: 16 additions & 6 deletions
@@ -77,12 +77,8 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S
     if (verbose) SparkSubmit.printStream.println(s"Using properties file: $propertiesFile")
     Option(propertiesFile).foreach { filename =>
       Utils.getPropertiesFromFile(filename).foreach { case (k, v) =>
-        if (k.startsWith("spark.")) {
-          defaultProperties(k) = v
-          if (verbose) SparkSubmit.printStream.println(s"Adding default property: $k=$v")
-        } else {
-          SparkSubmit.printWarning(s"Ignoring non-spark config property: $k=$v")
-        }
+        defaultProperties(k) = v
+        if (verbose) SparkSubmit.printStream.println(s"Adding default property: $k=$v")
       }
     }
     defaultProperties
@@ -97,6 +93,8 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S
     }
     // Populate `sparkProperties` map from properties file
     mergeDefaultSparkProperties()
+    // Remove keys that don't start with "spark." from `sparkProperties`.
+    ignoreNonSparkProperties()
     // Use `sparkProperties` map along with env vars to fill in any missing parameters
     loadEnvironmentArguments()
 
@@ -117,6 +115,18 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S
     }
   }
 
+  /**
+   * Remove keys that don't start with "spark." from `sparkProperties`.
+   */
+  private def ignoreNonSparkProperties(): Unit = {
+    sparkProperties.foreach { case (k, v) =>
+      if (!k.startsWith("spark.")) {
+        sparkProperties -= k
+        SparkSubmit.printWarning(s"Ignoring non-spark config property: $k=$v")
+      }
+    }
+  }
+
   /**
    * Load arguments from environment variables, Spark properties etc.
   */
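
To illustrate the new two-step flow above (merge every key from the properties file first, then warn about and drop the non-"spark." keys), here is a hedged, standalone Scala sketch. The map contents and object name are hypothetical, and it snapshots the key set before removing entries rather than touching the real Spark class.

import scala.collection.mutable

object NonSparkPropertyFilterSketch {
  def main(args: Array[String]): Unit = {
    // Hypothetical contents of the merged defaults after mergeDefaultSparkProperties().
    val sparkProperties = mutable.HashMap(
      "spark.master" -> "local[2]",
      "spark.executor.memory" -> "2g",
      "java.home" -> "/usr/lib/jvm/default")

    // Mirror ignoreNonSparkProperties(): warn about and drop keys not starting with "spark.".
    // A snapshot of the keys is taken so removal does not interfere with iteration.
    sparkProperties.keys.toList.foreach { k =>
      if (!k.startsWith("spark.")) {
        println(s"Warning: Ignoring non-spark config property: $k=${sparkProperties(k)}")
        sparkProperties -= k
      }
    }

    println(sparkProperties)  // only the spark.* entries remain
  }
}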

core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala

Lines changed: 20 additions & 12 deletions
@@ -35,7 +35,6 @@ import org.apache.spark.ui.SparkUI
 import org.apache.spark.util.{ThreadUtils, Utils}
 import org.apache.spark.{Logging, SecurityManager, SparkConf}
 
-
 /**
  * A class that provides application history from event logs stored in the file system.
  * This provider checks for new finished applications in the background periodically and
@@ -76,6 +75,9 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis
   @volatile private var applications: mutable.LinkedHashMap[String, FsApplicationHistoryInfo]
     = new mutable.LinkedHashMap()
 
+  // List of applications to be deleted by event log cleaner.
+  private var appsToClean = new mutable.ListBuffer[FsApplicationHistoryInfo]
+
   // Constants used to parse Spark 1.0.0 log directories.
   private[history] val LOG_PREFIX = "EVENT_LOG_"
   private[history] val SPARK_VERSION_PREFIX = EventLoggingListener.SPARK_VERSION_KEY + "_"
@@ -266,34 +268,40 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis
    */
   private def cleanLogs(): Unit = {
     try {
-      val statusList = Option(fs.listStatus(new Path(logDir))).map(_.toSeq)
-        .getOrElse(Seq[FileStatus]())
       val maxAge = conf.getTimeAsSeconds("spark.history.fs.cleaner.maxAge", "7d") * 1000
 
       val now = System.currentTimeMillis()
       val appsToRetain = new mutable.LinkedHashMap[String, FsApplicationHistoryInfo]()
 
+      // Scan all logs from the log directory.
+      // Only completed applications older than the specified max age will be deleted.
       applications.values.foreach { info =>
-        if (now - info.lastUpdated <= maxAge) {
+        if (now - info.lastUpdated <= maxAge || !info.completed) {
           appsToRetain += (info.id -> info)
+        } else {
+          appsToClean += info
         }
       }
 
       applications = appsToRetain
 
-      // Scan all logs from the log directory.
-      // Only directories older than the specified max age will be deleted
-      statusList.foreach { dir =>
+      val leftToClean = new mutable.ListBuffer[FsApplicationHistoryInfo]
+      appsToClean.foreach { info =>
         try {
-          if (now - dir.getModificationTime() > maxAge) {
-            // if path is a directory and set to true,
-            // the directory is deleted else throws an exception
-            fs.delete(dir.getPath, true)
+          val path = new Path(logDir, info.logPath)
+          if (fs.exists(path)) {
+            fs.delete(path, true)
           }
         } catch {
-          case t: IOException => logError(s"IOException in cleaning logs of $dir", t)
+          case e: AccessControlException =>
+            logInfo(s"No permission to delete ${info.logPath}, ignoring.")
+          case t: IOException =>
+            logError(s"IOException in cleaning logs of ${info.logPath}", t)
+            leftToClean += info
         }
       }
+
+      appsToClean = leftToClean
     } catch {
       case t: Exception => logError("Exception in cleaning logs", t)
     }

core/src/main/scala/org/apache/spark/executor/Executor.scala

Lines changed: 6 additions & 2 deletions
@@ -220,8 +220,12 @@ private[spark] class Executor(
         val afterSerialization = System.currentTimeMillis()
 
         for (m <- task.metrics) {
-          m.setExecutorDeserializeTime(taskStart - deserializeStartTime)
-          m.setExecutorRunTime(taskFinish - taskStart)
+          // Deserialization happens in two parts: first, we deserialize a Task object, which
+          // includes the Partition. Second, Task.run() deserializes the RDD and function to be run.
+          m.setExecutorDeserializeTime(
+            (taskStart - deserializeStartTime) + task.executorDeserializeTime)
+          // We need to subtract Task.run()'s deserialization time to avoid double-counting
+          m.setExecutorRunTime((taskFinish - taskStart) - task.executorDeserializeTime)
           m.setJvmGCTime(computeTotalGcTime() - startGCTime)
           m.setResultSerializationTime(afterSerialization - beforeSerialization)
         }
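
As a quick check on the accounting change above, the sketch below plugs in made-up timestamps: the deserialization time measured inside runTask() (see the ResultTask and ShuffleMapTask diffs below) is added to the deserialize metric and subtracted from the run metric, so the two metrics still cover the interval from deserializeStartTime to taskFinish exactly once.

object DeserializeAccountingSketch {
  def main(args: Array[String]): Unit = {
    // Made-up timestamps in milliseconds, purely for illustration.
    val deserializeStartTime = 1000L   // executor begins deserializing the Task object
    val taskStart = 1040L              // Task.run() begins
    val taskDeserializeInsideRun = 25L // portion measured inside runTask()
    val taskFinish = 1140L

    val executorDeserializeTime = (taskStart - deserializeStartTime) + taskDeserializeInsideRun // 65
    val executorRunTime = (taskFinish - taskStart) - taskDeserializeInsideRun                   // 75

    // No double-counting: the two metrics sum to the total wall time.
    assert(executorDeserializeTime + executorRunTime == taskFinish - deserializeStartTime)
    println(s"deserialize=${executorDeserializeTime}ms run=${executorRunTime}ms")
  }
}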

core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala

Lines changed: 2 additions & 0 deletions
@@ -53,9 +53,11 @@ private[spark] class ResultTask[T, U](
 
   override def runTask(context: TaskContext): U = {
     // Deserialize the RDD and the func using the broadcast variables.
+    val deserializeStartTime = System.currentTimeMillis()
     val ser = SparkEnv.get.closureSerializer.newInstance()
     val (rdd, func) = ser.deserialize[(RDD[T], (TaskContext, Iterator[T]) => U)](
       ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)
+    _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime
 
     metrics = Some(context.taskMetrics)
     func(context, rdd.iterator(partition, context))

core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala

Lines changed: 2 additions & 0 deletions
@@ -56,9 +56,11 @@ private[spark] class ShuffleMapTask(
 
   override def runTask(context: TaskContext): MapStatus = {
     // Deserialize the RDD using the broadcast variable.
+    val deserializeStartTime = System.currentTimeMillis()
     val ser = SparkEnv.get.closureSerializer.newInstance()
     val (rdd, dep) = ser.deserialize[(RDD[_], ShuffleDependency[_, _, _])](
       ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)
+    _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime
 
     metrics = Some(context.taskMetrics)
     var writer: ShuffleWriter[Any, Any] = null

core/src/main/scala/org/apache/spark/scheduler/Task.scala

Lines changed: 7 additions & 0 deletions
@@ -87,11 +87,18 @@ private[spark] abstract class Task[T](val stageId: Int, var partitionId: Int) ex
   // initialized when kill() is invoked.
   @volatile @transient private var _killed = false
 
+  protected var _executorDeserializeTime: Long = 0
+
   /**
    * Whether the task has been killed.
    */
   def killed: Boolean = _killed
 
+  /**
+   * Returns the amount of time spent deserializing the RDD and function to be run.
+   */
+  def executorDeserializeTime: Long = _executorDeserializeTime
+
   /**
    * Kills a task by setting the interrupted flag to true. This relies on the upper level Spark
    * code and user code to properly handle the flag. This function should be idempotent so it can

core/src/main/scala/org/apache/spark/ui/ToolTips.scala

Lines changed: 3 additions & 1 deletion
@@ -24,7 +24,9 @@ private[spark] object ToolTips {
     scheduler delay is large, consider decreasing the size of tasks or decreasing the size
     of task results."""
 
-  val TASK_DESERIALIZATION_TIME = "Time spent deserializing the task closure on the executor."
+  val TASK_DESERIALIZATION_TIME =
+    """Time spent deserializing the task closure on the executor, including the time to read the
+       broadcasted task."""
 
   val SHUFFLE_READ_BLOCKED_TIME =
     "Time that the task spent blocked waiting for shuffle data to be read from remote machines."
