
Commit 0174149

Add cleanup behavior and cleanup tests for sort-based shuffle
This also required creating a BlockId subclass for shuffle index blocks so that the BlockManagers can report back their lists of blocks.
1 parent eb4ee0d commit 0174149
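For orientation (an editorial sketch, not part of the commit): with sort-based shuffle each map task leaves one data block and one index block on disk, and the cleanup added here has to delete both. The name formats below are taken from the BlockId.scala changes in this diff; the helper object itself is hypothetical.

// Hypothetical helper (illustration only): the two on-disk artifacts the new
// cleanup path removes for every completed map task of a sort-based shuffle.
object SortShuffleBlockNames {
  // Format of ShuffleBlockId.name with reduceId = 0, as used by removeShuffleBlocks
  def dataBlock(shuffleId: Int, mapId: Int): String =
    "shuffle_" + shuffleId + "_" + mapId + "_0"

  // Format of the new ShuffleIndexBlockId.name
  def indexBlock(shuffleId: Int, mapId: Int): String =
    dataBlock(shuffleId, mapId) + ".index"

  def main(args: Array[String]): Unit = {
    println(dataBlock(2, 5))   // shuffle_2_5_0
    println(indexBlock(2, 5))  // shuffle_2_5_0.index
  }
}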


4 files changed: +176 −46 lines


core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleWriter.scala

3 additions, 0 deletions

@@ -126,6 +126,9 @@ private[spark] class SortShuffleWriter[K, V, C](
       out.close()
     }

+    // Register our map output with the ShuffleBlockManager, which handles cleaning it over time
+    blockManager.shuffleBlockManager.addCompletedMap(dep.shuffleId, mapId, numPartitions)
+
     mapStatus = new MapStatus(blockManager.blockManagerId,
       lengths.map(MapOutputTracker.compressSize))
   }

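The call above is needed because a sort-based map task writes its single output file itself instead of obtaining writers through forMapTask, so the ShuffleBlockManager would otherwise never learn which map outputs exist and could not clean them up later. A self-contained sketch of that bookkeeping pattern (illustrative names, not Spark's classes):

import scala.collection.mutable

// Illustrative sketch of the bookkeeping behind addCompletedMap: remember which map
// tasks finished for each shuffle so a later cleanup pass can enumerate their files.
class CompletedMapRegistry {
  private val completedMaps = mutable.HashMap.empty[Int, mutable.Set[Int]]

  def addCompletedMap(shuffleId: Int, mapId: Int): Unit = synchronized {
    completedMaps.getOrElseUpdate(shuffleId, mutable.Set.empty[Int]) += mapId
  }

  // Cleanup walks this set and deletes one data file and one ".index" file per map
  def completedMapsFor(shuffleId: Int): Set[Int] = synchronized {
    completedMaps.get(shuffleId).map(_.toSet).getOrElse(Set.empty)
  }
}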
core/src/main/scala/org/apache/spark/storage/BlockId.scala

9 additions, 0 deletions

@@ -59,6 +59,12 @@ case class ShuffleBlockId(shuffleId: Int, mapId: Int, reduceId: Int)
   def name = "shuffle_" + shuffleId + "_" + mapId + "_" + reduceId
 }

+@DeveloperApi
+case class ShuffleIndexBlockId(shuffleId: Int, mapId: Int, reduceId: Int)
+  extends BlockId {
+  def name = "shuffle_" + shuffleId + "_" + mapId + "_" + reduceId + ".index"
+}
+
 @DeveloperApi
 case class BroadcastBlockId(broadcastId: Long, field: String = "") extends BlockId {
   def name = "broadcast_" + broadcastId + (if (field == "") "" else "_" + field)

@@ -88,6 +94,7 @@ private[spark] case class TestBlockId(id: String) extends BlockId {
 object BlockId {
   val RDD = "rdd_([0-9]+)_([0-9]+)".r
   val SHUFFLE = "shuffle_([0-9]+)_([0-9]+)_([0-9]+)".r
+  val SHUFFLE_INDEX = "shuffle_([0-9]+)_([0-9]+)_([0-9]+).index".r
   val BROADCAST = "broadcast_([0-9]+)([_A-Za-z0-9]*)".r
   val TASKRESULT = "taskresult_([0-9]+)".r
   val STREAM = "input-([0-9]+)-([0-9]+)".r

@@ -99,6 +106,8 @@ object BlockId {
       RDDBlockId(rddId.toInt, splitIndex.toInt)
     case SHUFFLE(shuffleId, mapId, reduceId) =>
       ShuffleBlockId(shuffleId.toInt, mapId.toInt, reduceId.toInt)
+    case SHUFFLE_INDEX(shuffleId, mapId, reduceId) =>
+      ShuffleIndexBlockId(shuffleId.toInt, mapId.toInt, reduceId.toInt)
     case BROADCAST(broadcastId, field) =>
       BroadcastBlockId(broadcastId.toLong, field.stripPrefix("_"))
     case TASKRESULT(taskId) =>

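A note on why the new SHUFFLE_INDEX pattern can sit after SHUFFLE in BlockId.apply: a Scala regex used as a match pattern must cover the entire string, so a name ending in ".index" falls through the SHUFFLE case. A standalone sketch (the two regexes are copied from the diff above; the rest is illustrative):

// Standalone illustration of the parsing order in BlockId.apply (regexes copied
// from the diff above; the simplified result strings are for this sketch only).
object BlockNameParsing {
  val SHUFFLE = "shuffle_([0-9]+)_([0-9]+)_([0-9]+)".r
  val SHUFFLE_INDEX = "shuffle_([0-9]+)_([0-9]+)_([0-9]+).index".r

  def describe(name: String): String = name match {
    // A regex pattern must match the whole string, so "shuffle_2_5_0.index"
    // does not match SHUFFLE and is caught by SHUFFLE_INDEX instead.
    case SHUFFLE(s, m, r)       => s"shuffle data block: shuffle=$s map=$m reduce=$r"
    case SHUFFLE_INDEX(s, m, r) => s"shuffle index block: shuffle=$s map=$m reduce=$r"
    case _                      => "unknown block name"
  }

  def main(args: Array[String]): Unit = {
    println(describe("shuffle_2_5_0"))        // shuffle data block
    println(describe("shuffle_2_5_0.index"))  // shuffle index block
  }
}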
core/src/main/scala/org/apache/spark/storage/ShuffleBlockManager.scala

28 additions, 1 deletion

@@ -28,6 +28,7 @@ import org.apache.spark.serializer.Serializer
 import org.apache.spark.storage.ShuffleBlockManager.ShuffleFileGroup
 import org.apache.spark.util.{MetadataCleaner, MetadataCleanerType, TimeStampedHashMap}
 import org.apache.spark.util.collection.{PrimitiveKeyOpenHashMap, PrimitiveVector}
+import org.apache.spark.shuffle.sort.SortShuffleManager

 /** A group of writers for a ShuffleMapTask, one writer per reducer. */
 private[spark] trait ShuffleWriterGroup {

@@ -58,6 +59,7 @@ private[spark] trait ShuffleWriterGroup {
  * each block stored in each file. In order to find the location of a shuffle block, we search the
  * files within a ShuffleFileGroups associated with the block's reducer.
  */
+// TODO: Factor this into a separate class for each ShuffleManager implementation
 private[spark]
 class ShuffleBlockManager(blockManager: BlockManager) extends Logging {
   def conf = blockManager.conf

@@ -67,6 +69,10 @@ class ShuffleBlockManager(blockManager: BlockManager) extends Logging {
   val consolidateShuffleFiles =
     conf.getBoolean("spark.shuffle.consolidateFiles", false)

+  // Are we using sort-based shuffle?
+  val sortBasedShuffle =
+    conf.get("spark.shuffle.manager", "") == classOf[SortShuffleManager].getName
+
   private val bufferSize = conf.getInt("spark.shuffle.file.buffer.kb", 100) * 1024

   /**

@@ -91,6 +97,20 @@ class ShuffleBlockManager(blockManager: BlockManager) extends Logging {
   private val metadataCleaner =
     new MetadataCleaner(MetadataCleanerType.SHUFFLE_BLOCK_MANAGER, this.cleanup, conf)

+  /**
+   * Register a completed map without getting a ShuffleWriterGroup. Used by sort-based shuffle
+   * because it just writes a single file by itself.
+   */
+  def addCompletedMap(shuffleId: Int, mapId: Int, numBuckets: Int): Unit = {
+    shuffleStates.putIfAbsent(shuffleId, new ShuffleState(numBuckets))
+    val shuffleState = shuffleStates(shuffleId)
+    shuffleState.completedMapTasks.add(mapId)
+  }
+
+  /**
+   * Get a ShuffleWriterGroup for the given map task, which will register it as complete
+   * when the writers are closed successfully
+   */
   def forMapTask(shuffleId: Int, mapId: Int, numBuckets: Int, serializer: Serializer) = {
     new ShuffleWriterGroup {
       shuffleStates.putIfAbsent(shuffleId, new ShuffleState(numBuckets))

@@ -182,7 +202,14 @@
   private def removeShuffleBlocks(shuffleId: ShuffleId): Boolean = {
     shuffleStates.get(shuffleId) match {
       case Some(state) =>
-        if (consolidateShuffleFiles) {
+        if (sortBasedShuffle) {
+          // There's a single block ID for each map, plus an index file for it
+          for (mapId <- state.completedMapTasks) {
+            val blockId = new ShuffleBlockId(shuffleId, mapId, 0)
+            blockManager.diskBlockManager.getFile(blockId).delete()
+            blockManager.diskBlockManager.getFile(blockId.name + ".index").delete()
+          }
+        } else if (consolidateShuffleFiles) {
           for (fileGroup <- state.allFileGroups; file <- fileGroup.files) {
             file.delete()
           }

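The new cleanup branch is taken only when the application selects the sort-based manager by class name, which is what the sortBasedShuffle flag above checks. A minimal configuration sketch (assuming a Spark build from this revision is on the classpath):

import org.apache.spark.SparkConf

// Sketch: selecting sort-based shuffle by class name so that
// ShuffleBlockManager.sortBasedShuffle is true and removeShuffleBlocks deletes
// the single data file plus the ".index" file for each completed map.
object SortShuffleConfExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("sort-shuffle-example")
      .set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager")
    println(conf.get("spark.shuffle.manager"))
  }
}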
core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala

136 additions, 45 deletions

@@ -34,16 +34,28 @@ import org.scalatest.time.SpanSugar._

 import org.apache.spark.SparkContext._
 import org.apache.spark.rdd.RDD
-import org.apache.spark.storage.{BlockId, BroadcastBlockId, RDDBlockId, ShuffleBlockId}
-
-class ContextCleanerSuite extends FunSuite with BeforeAndAfter with LocalSparkContext {
-
+import org.apache.spark.storage._
+import org.apache.spark.shuffle.hash.HashShuffleManager
+import org.apache.spark.shuffle.sort.SortShuffleManager
+import org.apache.spark.storage.BroadcastBlockId
+import org.apache.spark.storage.RDDBlockId
+import org.apache.spark.storage.ShuffleBlockId
+import org.apache.spark.storage.ShuffleIndexBlockId
+
+/**
+ * An abstract base class for context cleaner tests, which sets up a context with a config
+ * suitable for cleaner tests and provides some utility functions. Subclasses can use different
+ * config options, in particular, a different shuffle manager class
+ */
+abstract class ContextCleanerSuiteBase(val shuffleManager: Class[_] = classOf[HashShuffleManager])
+  extends FunSuite with BeforeAndAfter with LocalSparkContext
+{
   implicit val defaultTimeout = timeout(10000 millis)
   val conf = new SparkConf()
     .setMaster("local[2]")
     .setAppName("ContextCleanerSuite")
     .set("spark.cleaner.referenceTracking.blocking", "true")
-    .set("spark.shuffle.manager", "org.apache.spark.shuffle.hash.HashShuffleManager")
+    .set("spark.shuffle.manager", shuffleManager.getName)

   before {
     sc = new SparkContext(conf)

@@ -56,6 +68,59 @@ class ContextCleanerSuite extends FunSuite with BeforeAndAfter with LocalSparkContext {
     }
   }

+  //------ Helper functions ------
+
+  protected def newRDD() = sc.makeRDD(1 to 10)
+  protected def newPairRDD() = newRDD().map(_ -> 1)
+  protected def newShuffleRDD() = newPairRDD().reduceByKey(_ + _)
+  protected def newBroadcast() = sc.broadcast(1 to 100)
+
+  protected def newRDDWithShuffleDependencies(): (RDD[_], Seq[ShuffleDependency[_, _, _]]) = {
+    def getAllDependencies(rdd: RDD[_]): Seq[Dependency[_]] = {
+      rdd.dependencies ++ rdd.dependencies.flatMap { dep =>
+        getAllDependencies(dep.rdd)
+      }
+    }
+    val rdd = newShuffleRDD()
+
+    // Get all the shuffle dependencies
+    val shuffleDeps = getAllDependencies(rdd)
+      .filter(_.isInstanceOf[ShuffleDependency[_, _, _]])
+      .map(_.asInstanceOf[ShuffleDependency[_, _, _]])
+    (rdd, shuffleDeps)
+  }
+
+  protected def randomRdd() = {
+    val rdd: RDD[_] = Random.nextInt(3) match {
+      case 0 => newRDD()
+      case 1 => newShuffleRDD()
+      case 2 => newPairRDD.join(newPairRDD())
+    }
+    if (Random.nextBoolean()) rdd.persist()
+    rdd.count()
+    rdd
+  }
+
+  /** Run GC and make sure it actually has run */
+  private def runGC() {
+    val weakRef = new WeakReference(new Object())
+    val startTime = System.currentTimeMillis
+    System.gc() // Make a best effort to run the garbage collection. It *usually* runs GC.
+    // Wait until a weak reference object has been GCed
+    while (System.currentTimeMillis - startTime < 10000 && weakRef.get != null) {
+      System.gc()
+      Thread.sleep(200)
+    }
+  }
+
+  protected def cleaner = sc.cleaner.get
+}
+
+
+/**
+ * Basic ContextCleanerSuite, which uses sort-based shuffle
+ */
+class ContextCleanerSuite extends ContextCleanerSuiteBase {
   test("cleanup RDD") {
     val rdd = newRDD().persist()
     val collected = rdd.collect().toList

@@ -181,7 +246,7 @@ class ContextCleanerSuite extends FunSuite with BeforeAndAfter with LocalSparkContext {
       .setMaster("local-cluster[2, 1, 512]")
       .setAppName("ContextCleanerSuite")
       .set("spark.cleaner.referenceTracking.blocking", "true")
-      .set("spark.shuffle.manager", "org.apache.spark.shuffle.hash.HashShuffleManager")
+      .set("spark.shuffle.manager", shuffleManager.getName)
     sc = new SparkContext(conf2)

     val numRdds = 10

@@ -212,57 +277,82 @@ class ContextCleanerSuite extends FunSuite with BeforeAndAfter with LocalSparkContext {
       case _ => false
     }, askSlaves = true).isEmpty)
   }
+}

-  //------ Helper functions ------

-  private def newRDD() = sc.makeRDD(1 to 10)
-  private def newPairRDD() = newRDD().map(_ -> 1)
-  private def newShuffleRDD() = newPairRDD().reduceByKey(_ + _)
-  private def newBroadcast() = sc.broadcast(1 to 100)
+/**
+ * A copy of the shuffle tests for sort-based shuffle
+ */
+class SortShuffleContextCleanerSuite extends ContextCleanerSuiteBase(classOf[SortShuffleManager]) {
+  test("cleanup shuffle") {
+    val (rdd, shuffleDeps) = newRDDWithShuffleDependencies()
+    val collected = rdd.collect().toList
+    val tester = new CleanerTester(sc, shuffleIds = shuffleDeps.map(_.shuffleId))

-  private def newRDDWithShuffleDependencies(): (RDD[_], Seq[ShuffleDependency[_, _, _]]) = {
-    def getAllDependencies(rdd: RDD[_]): Seq[Dependency[_]] = {
-      rdd.dependencies ++ rdd.dependencies.flatMap { dep =>
-        getAllDependencies(dep.rdd)
-      }
-    }
-    val rdd = newShuffleRDD()
+    // Explicit cleanup
+    shuffleDeps.foreach(s => cleaner.doCleanupShuffle(s.shuffleId, blocking = true))
+    tester.assertCleanup()

-    // Get all the shuffle dependencies
-    val shuffleDeps = getAllDependencies(rdd)
-      .filter(_.isInstanceOf[ShuffleDependency[_, _, _]])
-      .map(_.asInstanceOf[ShuffleDependency[_, _, _]])
-    (rdd, shuffleDeps)
+    // Verify that shuffles can be re-executed after cleaning up
+    assert(rdd.collect().toList.equals(collected))
   }

-  private def randomRdd() = {
-    val rdd: RDD[_] = Random.nextInt(3) match {
-      case 0 => newRDD()
-      case 1 => newShuffleRDD()
-      case 2 => newPairRDD.join(newPairRDD())
-    }
-    if (Random.nextBoolean()) rdd.persist()
+  test("automatically cleanup shuffle") {
+    var rdd = newShuffleRDD()
     rdd.count()
-    rdd
-  }

-  private def randomBroadcast() = {
-    sc.broadcast(Random.nextInt(Int.MaxValue))
+    // Test that GC does not cause shuffle cleanup due to a strong reference
+    val preGCTester = new CleanerTester(sc, shuffleIds = Seq(0))
+    runGC()
+    intercept[Exception] {
+      preGCTester.assertCleanup()(timeout(1000 millis))
+    }
+
+    // Test that GC causes shuffle cleanup after dereferencing the RDD
+    val postGCTester = new CleanerTester(sc, shuffleIds = Seq(0))
+    rdd = null // Make RDD out of scope, so that corresponding shuffle goes out of scope
+    runGC()
+    postGCTester.assertCleanup()
   }

-  /** Run GC and make sure it actually has run */
-  private def runGC() {
-    val weakRef = new WeakReference(new Object())
-    val startTime = System.currentTimeMillis
-    System.gc() // Make a best effort to run the garbage collection. It *usually* runs GC.
-    // Wait until a weak reference object has been GCed
-    while (System.currentTimeMillis - startTime < 10000 && weakRef.get != null) {
-      System.gc()
-      Thread.sleep(200)
+  test("automatically cleanup RDD + shuffle + broadcast in distributed mode") {
+    sc.stop()
+
+    val conf2 = new SparkConf()
+      .setMaster("local-cluster[2, 1, 512]")
+      .setAppName("ContextCleanerSuite")
+      .set("spark.cleaner.referenceTracking.blocking", "true")
+      .set("spark.shuffle.manager", shuffleManager.getName)
+    sc = new SparkContext(conf2)
+
+    val numRdds = 10
+    val numBroadcasts = 4 // Broadcasts are more costly
+    val rddBuffer = (1 to numRdds).map(i => randomRdd).toBuffer
+    val broadcastBuffer = (1 to numBroadcasts).map(i => randomBroadcast).toBuffer
+    val rddIds = sc.persistentRdds.keys.toSeq
+    val shuffleIds = 0 until sc.newShuffleId()
+    val broadcastIds = broadcastBuffer.map(_.id)
+
+    val preGCTester = new CleanerTester(sc, rddIds, shuffleIds, broadcastIds)
+    runGC()
+    intercept[Exception] {
+      preGCTester.assertCleanup()(timeout(1000 millis))
     }
-  }

-  private def cleaner = sc.cleaner.get
+    // Test that GC triggers the cleanup of all variables after the dereferencing them
+    val postGCTester = new CleanerTester(sc, rddIds, shuffleIds, broadcastIds)
+    broadcastBuffer.clear()
+    rddBuffer.clear()
+    runGC()
+    postGCTester.assertCleanup()
+
+    // Make sure the broadcasted task closure no longer exists after GC.
+    val taskClosureBroadcastId = broadcastIds.max + 1
+    assert(sc.env.blockManager.master.getMatchingBlockIds({
+      case BroadcastBlockId(`taskClosureBroadcastId`, _) => true
+      case _ => false
+    }, askSlaves = true).isEmpty)
+  }
 }


@@ -420,6 +510,7 @@ class CleanerTester(
   private def getShuffleBlocks(shuffleId: Int): Seq[BlockId] = {
     blockManager.master.getMatchingBlockIds( _ match {
       case ShuffleBlockId(`shuffleId`, _, _) => true
+      case ShuffleIndexBlockId(`shuffleId`, _, _) => true
       case _ => false
     }, askSlaves = true)
   }
