
Commit ae9da88

Removed unnecessary TimeStampedHashMap from DAGScheduler, added try-catches in finalize() methods, and replaced ArrayBlockingQueue with LinkedBlockingQueue to avoid blocking in Java's finalizer thread.
1 parent cb0a5a6 commit ae9da88
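The queue swap matters because every cleanup task is enqueued from finalize(): RDD.finalize() and ShuffleDependency.finalize() hand work to the ContextCleaner, and those calls run on the JVM's finalizer thread. A bounded ArrayBlockingQueue blocks the caller in put() once it fills up, while an unbounded LinkedBlockingQueue accepts elements immediately. The standalone Scala sketch below only illustrates that difference (the object name and string payloads are made up, not part of the commit):

import java.util.concurrent.{ArrayBlockingQueue, LinkedBlockingQueue}

object QueueBlockingSketch {
  def main(args: Array[String]): Unit = {
    // Bounded queue: put() blocks once capacity is reached, which is dangerous
    // when the caller is the JVM's finalizer thread.
    val bounded = new ArrayBlockingQueue[String](1)
    bounded.put("first")
    // bounded.put("second")         // would block here until the queue is drained
    println(bounded.offer("second")) // offer() refuses instead of blocking -> prints false

    // Unbounded queue (capacity Integer.MAX_VALUE): put() effectively never blocks,
    // so enqueueing cleanup work from a finalizer cannot stall finalization.
    val unbounded = new LinkedBlockingQueue[String]
    unbounded.put("first")
    unbounded.put("second")          // returns immediately
    println(unbounded.size())        // prints 2
  }
}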

5 files changed: 42 additions & 33 deletions


core/src/main/scala/org/apache/spark/ContextCleaner.scala

Lines changed: 2 additions & 3 deletions

@@ -19,7 +19,7 @@ package org.apache.spark

 import scala.collection.mutable.{ArrayBuffer, SynchronizedBuffer}

-import java.util.concurrent.{ArrayBlockingQueue, TimeUnit}
+import java.util.concurrent.{LinkedBlockingQueue, TimeUnit}

 import org.apache.spark.rdd.RDD

@@ -40,8 +40,7 @@ private[spark] class ContextCleaner(env: SparkEnv) extends Logging {
   private case class CleanShuffle(id: Int) extends CleaningTask
   // TODO: add CleanBroadcast

-  private val QUEUE_CAPACITY = 1000
-  private val queue = new ArrayBlockingQueue[CleaningTask](QUEUE_CAPACITY)
+  private val queue = new LinkedBlockingQueue[CleaningTask]

   protected val listeners = new ArrayBuffer[CleanerListener]
     with SynchronizedBuffer[CleanerListener]
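The hunk above only shows the queue declaration; the consumer side of ContextCleaner is unchanged and not part of this diff. As a rough, hypothetical sketch of the producer/consumer shape an unbounded queue enables (everything beyond the CleanShuffle name and the queue type is invented for illustration):

import java.util.concurrent.{LinkedBlockingQueue, TimeUnit}

object CleanerQueueSketch {
  case class CleanShuffle(id: Int)

  private val queue = new LinkedBlockingQueue[CleanShuffle]
  @volatile private var stopped = false

  // Producer side, safe to call from a finalizer: put() on an unbounded queue returns at once.
  def cleanShuffle(id: Int): Unit = queue.put(CleanShuffle(id))

  // Consumer side: a daemon thread polls with a timeout so it can also notice the stop flag.
  private val cleanerThread = new Thread("cleaner-sketch") {
    override def run(): Unit =
      while (!stopped) {
        val task = queue.poll(100, TimeUnit.MILLISECONDS)
        if (task != null) println(s"cleaning shuffle ${task.id}")
      }
  }
  cleanerThread.setDaemon(true)

  def start(): Unit = cleanerThread.start()
  def stop(): Unit = stopped = true
}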

core/src/main/scala/org/apache/spark/Dependency.scala

Lines changed: 16 additions & 3 deletions

@@ -49,13 +49,26 @@ class ShuffleDependency[K, V](
     @transient rdd: RDD[_ <: Product2[K, V]],
     val partitioner: Partitioner,
     val serializerClass: String = null)
-  extends Dependency(rdd.asInstanceOf[RDD[Product2[K, V]]]) {
+  extends Dependency(rdd.asInstanceOf[RDD[Product2[K, V]]]) with Logging {

   val shuffleId: Int = rdd.context.newShuffleId()

   override def finalize() {
-    if (rdd != null) {
-      rdd.sparkContext.cleaner.cleanShuffle(shuffleId)
+    try {
+      if (rdd != null) {
+        rdd.sparkContext.cleaner.cleanShuffle(shuffleId)
+      }
+    } catch {
+      case t: Throwable =>
+        // Paranoia - If logError throws error as well, report to stderr.
+        try {
+          logError("Error in finalize", t)
+        } catch {
+          case _ =>
+            System.err.println("Error in finalize (and could not write to logError): " + t)
+        }
+    } finally {
+      super.finalize()
     }
   }
 }
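A note on the pattern this hunk introduces: the JVM silently ignores any exception thrown from finalize(), so without the try/catch a failure in cleanShuffle() would vanish unreported and super.finalize() would never run. A minimal, Spark-independent sketch of the same shape (ManagedResource and release() are placeholders, not commit code):

class ManagedResource {
  private def release(): Unit = {
    // cleanup that may itself throw
  }

  override def finalize(): Unit = {
    try {
      release()
    } catch {
      case t: Throwable =>
        // Report somewhere; an exception escaping finalize() is dropped by the JVM.
        try System.err.println("Error in finalize: " + t)
        catch { case _: Throwable => () }
    } finally {
      super.finalize() // always give the superclass a chance to clean up
    }
  }
}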

core/src/main/scala/org/apache/spark/rdd/RDD.scala

Lines changed: 15 additions & 1 deletion

@@ -1026,6 +1026,7 @@ abstract class RDD[T: ClassTag](
   }

   def cleanup() {
+    logInfo("Cleanup called on RDD " + id)
     sc.cleaner.cleanRDD(this)
     dependencies.filter(_.isInstanceOf[ShuffleDependency[_, _]])
       .map(_.asInstanceOf[ShuffleDependency[_, _]].shuffleId)

@@ -1112,6 +1113,19 @@ abstract class RDD[T: ClassTag](
   }

   override def finalize() {
-    cleanup()
+    try {
+      cleanup()
+    } catch {
+      case t: Throwable =>
+        // Paranoia - If logError throws error as well, report to stderr.
+        try {
+          logError("Error in finalize", t)
+        } catch {
+          case _ =>
+            System.err.println("Error in finalize (and could not write to logError): " + t)
+        }
+    } finally {
+      super.finalize()
+    }
   }
 }

core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala

Lines changed: 6 additions & 25 deletions

@@ -123,17 +123,17 @@ class DAGScheduler(

   private val nextStageId = new AtomicInteger(0)

-  private[scheduler] val jobIdToStageIds = new TimeStampedHashMap[Int, HashSet[Int]]
+  private[scheduler] val jobIdToStageIds = new HashMap[Int, HashSet[Int]]

-  private[scheduler] val stageIdToJobIds = new TimeStampedHashMap[Int, HashSet[Int]]
+  private[scheduler] val stageIdToJobIds = new HashMap[Int, HashSet[Int]]

-  private[scheduler] val stageIdToStage = new TimeStampedHashMap[Int, Stage]
+  private[scheduler] val stageIdToStage = new HashMap[Int, Stage]

-  private[scheduler] val shuffleToMapStage = new TimeStampedHashMap[Int, Stage]
+  private[scheduler] val shuffleToMapStage = new HashMap[Int, Stage]

-  private[spark] val stageToInfos = new TimeStampedHashMap[Stage, StageInfo]
+  private[spark] val stageToInfos = new HashMap[Stage, StageInfo]

-  // An async scheduler event bus. The bus should be stopped when DAGSCheduler is stopped.
+  // An async scheduler event bus. The bus should be stopped when DAGScheduler is stopped.
   private[spark] val listenerBus = new SparkListenerBus

   // Contains the locations that each RDD's partitions are cached on

@@ -159,9 +159,6 @@ class DAGScheduler(
   val activeJobs = new HashSet[ActiveJob]
   val resultStageToJob = new HashMap[Stage, ActiveJob]

-  val metadataCleaner = new MetadataCleaner(
-    MetadataCleanerType.DAG_SCHEDULER, this.cleanup, env.conf)
-
   /**
    * Starts the event processing actor. The actor has two responsibilities:
    *

@@ -1094,26 +1091,10 @@ class DAGScheduler(
       Nil
   }

-  private def cleanup(cleanupTime: Long) {
-    Map(
-      "stageIdToStage" -> stageIdToStage,
-      "shuffleToMapStage" -> shuffleToMapStage,
-      "pendingTasks" -> pendingTasks,
-      "stageToInfos" -> stageToInfos,
-      "jobIdToStageIds" -> jobIdToStageIds,
-      "stageIdToJobIds" -> stageIdToJobIds).
-      foreach { case(s, t) => {
-        val sizeBefore = t.size
-        t.clearOldValues(cleanupTime)
-        logInfo("%s %d --> %d".format(s, sizeBefore, t.size))
-      }}
-  }
-
   def stop() {
     if (eventProcessActor != null) {
       eventProcessActor ! StopDAGScheduler
     }
-    metadataCleaner.cancel()
     taskSched.stop()
     listenerBus.stop()
   }
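For readers unfamiliar with the removed utility: TimeStampedHashMap records an insertion time per key, and clearOldValues(t) evicts everything inserted before t, which is exactly what the deleted cleanup() did on the MetadataCleaner timer. With plain HashMaps, stale scheduler bookkeeping now has to be removed explicitly (presumably through the ContextCleaner / finalize() path the rest of this commit adds). A rough, Spark-independent sketch of the eviction idea that was removed (TtlMap is an invented name, not Spark's class):

import scala.collection.mutable

class TtlMap[K, V] {
  private val entries = mutable.HashMap[K, (V, Long)]()

  def put(k: K, v: V): Unit = entries(k) = (v, System.currentTimeMillis())
  def get(k: K): Option[V] = entries.get(k).map(_._1)
  def size: Int = entries.size

  // Drop every entry inserted before cutoffTime, analogous to clearOldValues(cleanupTime).
  def clearOldValues(cutoffTime: Long): Unit =
    entries.retain { case (_, (_, insertedAt)) => insertedAt >= cutoffTime }
}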

streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala

Lines changed: 3 additions & 1 deletion

@@ -341,9 +341,11 @@ abstract class DStream[T: ClassTag] (
    */
   private[streaming] def clearMetadata(time: Time) {
     val oldRDDs = generatedRDDs.filter(_._1 <= (time - rememberDuration))
+    logDebug("Clearing references to old RDDs: [" +
+      oldRDDs.map(x => s"${x._1} -> ${x._2.id}").mkString(", ") + "]")
     generatedRDDs --= oldRDDs.keys
     if (ssc.conf.getBoolean("spark.streaming.unpersist", false)) {
-      logDebug("Unpersisting old RDDs: " + oldRDDs.keys.mkString(", "))
+      logDebug("Unpersisting old RDDs: " + oldRDDs.values.map(_.id).mkString(", "))
       oldRDDs.values.foreach(_.unpersist(false))
     }
     logDebug("Cleared " + oldRDDs.size + " RDDs that were older than " +
