Closed

Commits (36)
- 149ea3e  ShuffleWriters write to temp file, then go through (squito, Oct 21, 2015)
- cf8118e  assorted cleanup (squito, Oct 22, 2015)
- ea1ae07  style (squito, Oct 22, 2015)
- 9356c67  fix compilation in StoragePerfTester (squito, Oct 22, 2015)
- 2b42eb5  mima (squito, Oct 22, 2015)
- 32d4b3b  update UnsafeShuffleWriterSuite (squito, Oct 22, 2015)
- 550e198  fix imports (squito, Oct 22, 2015)
- 4ff98bf  should work now, but needs cleanup (squito, Oct 23, 2015)
- 4a19702  only consider tmp files that exist; only consider the dest pre-existi… (squito, Oct 23, 2015)
- 89063dd  cleanup (squito, Oct 23, 2015)
- 4145651  ShuffleOutputCoordinatorSuite (squito, Oct 23, 2015)
- 2089e12  cleanup (squito, Oct 23, 2015)
- 2e9bbaa  Merge branch 'master' into SPARK-8029_first_wins (squito, Oct 26, 2015)
- 4cd423e  write the winning mapStatus to disk, so subsequent tasks can respond … (squito, Oct 26, 2015)
- dc4b7f6  fix imports (squito, Oct 26, 2015)
- b7a0981  fixes (squito, Oct 26, 2015)
- 830a097  shuffle writers must write always write all tmp files (squito, Oct 27, 2015)
- 5d11eca  more fixes for zero-sized blocks (squito, Oct 27, 2015)
- 3f5af9c  dont make ShuffleWriter return mapStatusFile (squito, Oct 27, 2015)
- 4b7c71a  rather than requiring all tmp files to exist, just write a zero-lengt… (squito, Oct 27, 2015)
- eabf978  update test case (squito, Oct 27, 2015)
- 5bbeec3  minor cleanup (squito, Oct 27, 2015)
- e141d82  test that shuffle output files are always the same (squito, Oct 27, 2015)
- 4df7955  fix compression settings of tmp files; minor cleanup (squito, Oct 27, 2015)
- dc076b8  fix tests (squito, Oct 27, 2015)
- cfdfd2c  review feedback (squito, Nov 3, 2015)
- 86f468a  Merge branch 'master' into SPARK-8029_first_wins (squito, Nov 4, 2015)
- 5c8b247  fix imports (squito, Nov 4, 2015)
- 4d66df1  fix more imports (squito, Nov 4, 2015)
- e59df41  couple more nits ... (squito, Nov 4, 2015)
- c206fc5  minor cleanup (squito, Nov 5, 2015)
- c0edff1  style (squito, Nov 5, 2015)
- da33519  Merge branch 'master' into SPARK-8029_first_wins (squito, Nov 11, 2015)
- c0b93a5  create temporary files in same location as destination files (squito, Nov 11, 2015)
- 9d0d9d9  no more @VisibleForTesting (squito, Nov 11, 2015)
- 80e037d  unused import (squito, Nov 11, 2015)
File: BypassMergeSortShuffleWriter.java
@@ -21,26 +21,27 @@
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Arrays;
import javax.annotation.Nullable;

import scala.None$;
import scala.Option;
import scala.Product2;
import scala.Tuple2;
import scala.collection.Iterator;
import scala.collection.JavaConverters;
import scala.collection.Seq;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.io.Closeables;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.spark.Partitioner;
import org.apache.spark.ShuffleDependency;
import org.apache.spark.SparkConf;
import org.apache.spark.TaskContext;
import org.apache.spark.*;
import org.apache.spark.executor.ShuffleWriteMetrics;
import org.apache.spark.scheduler.MapStatus;
import org.apache.spark.scheduler.MapStatus$;
import org.apache.spark.shuffle.TmpDestShuffleFile;
import org.apache.spark.serializer.Serializer;
import org.apache.spark.serializer.SerializerInstance;
import org.apache.spark.shuffle.IndexShuffleBlockResolver;
@@ -121,13 +122,26 @@ public BypassMergeSortShuffleWriter(
}

@Override
public void write(Iterator<Product2<K, V>> records) throws IOException {
public Seq<TmpDestShuffleFile> write(Iterator<Product2<K, V>> records) throws IOException {
assert (partitionWriters == null);
final File indexFile = shuffleBlockResolver.getIndexFile(shuffleId, mapId);
final File dataFile = shuffleBlockResolver.getDataFile(shuffleId, mapId);
final File tmpIndexFile = tmpShuffleFile(indexFile);
final File tmpDataFile = tmpShuffleFile(dataFile);
if (!records.hasNext()) {
partitionLengths = new long[numPartitions];
shuffleBlockResolver.writeIndexFile(shuffleId, mapId, partitionLengths);
mapStatus = MapStatus$.MODULE$.apply(blockManager.shuffleServerId(), partitionLengths);
return;
// create empty data file so we always commit same set of shuffle output files, even if
// data is non-deterministic
if (!tmpDataFile.createNewFile()) {
// only possible if the file already exists, from a race in createTempShuffleBlock, which
// should be super-rare
throw new IOException("could not create shuffle data file: " + tmpDataFile);
}
return JavaConverters.asScalaBufferConverter(Arrays.asList(
new TmpDestShuffleFile(tmpIndexFile, indexFile),
new TmpDestShuffleFile(tmpDataFile, dataFile)
)).asScala();
}
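The tmpShuffleFile(...) helper used just above is not part of this excerpt. A hedged sketch of what it plausibly does, given the later commit "create temporary files in same location as destination files": the temp file sits next to its destination, so the final commit can be a cheap same-filesystem rename. The random-suffix naming below is an assumption for illustration, not necessarily the PR's actual scheme.

```scala
import java.io.File
import java.util.UUID

object TmpShuffleFileSketch {
  // Create the temp file in the same directory as its destination, so the eventual
  // commit can use File.renameTo rather than a cross-filesystem copy.
  def tmpShuffleFile(dest: File): File =
    new File(dest.getParentFile, dest.getName + "." + UUID.randomUUID())
}
```

Usage would mirror the lines above, e.g. `val tmpIndexFile = tmpShuffleFile(indexFile)`.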
final SerializerInstance serInstance = serializer.newInstance();
final long openStartTime = System.nanoTime();
@@ -155,10 +169,14 @@ public void write(Iterator<Product2<K, V>> records) throws IOException {
writer.commitAndClose();
}

partitionLengths =
writePartitionedFile(shuffleBlockResolver.getDataFile(shuffleId, mapId));
shuffleBlockResolver.writeIndexFile(shuffleId, mapId, partitionLengths);
partitionLengths = writePartitionedFile(tmpDataFile);
Inline comment from the PR author (squito):
the type of tmp block used here doesn't matter, since writePartitionedFile doesn't go through blockManager.getDiskWriter. However, I'm a bit confused how this worked before. The original partition files might be compressed, and those bytes just get copied to the final data file, though data files seem like they are never compressed?

Reply from a reviewer:
If compression is employed, then each region of the single "partitioned" file is compressed separately. So the concatenation here should be correct.
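
That observation is worth making concrete: when shuffle compression is enabled, each per-partition region is written as its own self-contained compressed stream, so concatenating the raw bytes and recording the region lengths is sufficient. A standalone sketch of the idea, deliberately using java.util.zip rather than Spark's CompressionCodec, so nothing here is Spark's actual codec path:

```scala
import java.io._
import java.nio.file.Files
import java.util.zip.{GZIPInputStream, GZIPOutputStream}

object ConcatCompressedRegions {
  def main(args: Array[String]): Unit = {
    val dataFile = File.createTempFile("shuffle", ".data")
    val partitions = Seq("records for reducer 0", "records for reducer 1")
    val lengths = new Array[Long](partitions.size)

    // Write: compress each partition as its own, self-contained gzip stream and append it.
    val out = new FileOutputStream(dataFile)
    try {
      partitions.zipWithIndex.foreach { case (payload, i) =>
        val buf = new ByteArrayOutputStream()
        val gz = new GZIPOutputStream(buf)
        gz.write(payload.getBytes("UTF-8"))
        gz.close()
        out.write(buf.toByteArray)
        lengths(i) = buf.size().toLong // what the index file would record
      }
    } finally {
      out.close()
    }

    // Read: slice out region 1 by offset/length and decompress only that slice.
    val all = Files.readAllBytes(dataFile.toPath)
    val offset = lengths(0).toInt
    val region = java.util.Arrays.copyOfRange(all, offset, offset + lengths(1).toInt)
    val reader = new BufferedReader(new InputStreamReader(
      new GZIPInputStream(new ByteArrayInputStream(region)), "UTF-8"))
    println(reader.readLine()) // prints: records for reducer 1
  }
}
```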

shuffleBlockResolver.writeIndexFile(shuffleId, mapId, partitionLengths, tmpIndexFile);
mapStatus = MapStatus$.MODULE$.apply(blockManager.shuffleServerId(), partitionLengths);
return JavaConverters.asScalaBufferConverter(Arrays.asList(
new TmpDestShuffleFile(tmpIndexFile, indexFile),
new TmpDestShuffleFile(tmpDataFile, dataFile)
)).asScala();

}

@VisibleForTesting
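TmpDestShuffleFile is introduced by this PR but its definition is not shown in this excerpt; from the call sites above it appears to be nothing more than a pairing of the temporary file a writer produced with the destination it should eventually be renamed to. A hedged guess at its shape (the field names are assumptions):

```scala
import java.io.File

// Presumably just "write here now, rename to there when the output is committed".
case class TmpDestShuffleFile(tmpFile: File, dstFile: File)
```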
File: UnsafeShuffleWriter.java
@@ -20,11 +20,13 @@
import javax.annotation.Nullable;
import java.io.*;
import java.nio.channels.FileChannel;
import java.util.Arrays;
import java.util.Iterator;

import scala.Option;
import scala.Product2;
import scala.collection.JavaConverters;
import scala.collection.Seq;
import scala.collection.immutable.Map;
import scala.reflect.ClassTag;
import scala.reflect.ClassTag$;
@@ -42,18 +44,19 @@
import org.apache.spark.io.CompressionCodec;
import org.apache.spark.io.CompressionCodec$;
import org.apache.spark.io.LZFCompressionCodec;
import org.apache.spark.memory.TaskMemoryManager;
import org.apache.spark.network.util.LimitedInputStream;
import org.apache.spark.scheduler.MapStatus;
import org.apache.spark.scheduler.MapStatus$;
import org.apache.spark.serializer.SerializationStream;
import org.apache.spark.serializer.Serializer;
import org.apache.spark.serializer.SerializerInstance;
import org.apache.spark.shuffle.IndexShuffleBlockResolver;
import org.apache.spark.shuffle.TmpDestShuffleFile;
import org.apache.spark.shuffle.ShuffleWriter;
import org.apache.spark.storage.BlockManager;
import org.apache.spark.storage.TimeTrackingOutputStream;
import org.apache.spark.unsafe.Platform;
import org.apache.spark.memory.TaskMemoryManager;

@Private
public class UnsafeShuffleWriter<K, V> extends ShuffleWriter<K, V> {
@@ -149,12 +152,12 @@ public long getPeakMemoryUsedBytes() {
* This convenience method should only be called in test code.
*/
@VisibleForTesting
public void write(Iterator<Product2<K, V>> records) throws IOException {
write(JavaConverters.asScalaIteratorConverter(records).asScala());
public Seq<TmpDestShuffleFile> write(Iterator<Product2<K, V>> records) throws IOException {
return write(JavaConverters.asScalaIteratorConverter(records).asScala());
}

@Override
public void write(scala.collection.Iterator<Product2<K, V>> records) throws IOException {
public Seq<TmpDestShuffleFile> write(scala.collection.Iterator<Product2<K, V>> records) throws IOException {
// Keep track of success so we know if we encountered an exception
// We do this rather than a standard try/catch/re-throw to handle
// generic throwables.
@@ -163,8 +166,9 @@ public void write(scala.collection.Iterator<Product2<K, V>> records) throws IOEx
while (records.hasNext()) {
insertRecordIntoSorter(records.next());
}
closeAndWriteOutput();
final Seq<TmpDestShuffleFile> result = closeAndWriteOutput();
success = true;
return result;
} finally {
if (sorter != null) {
try {
@@ -198,25 +202,34 @@ private void open() throws IOException {
}

@VisibleForTesting
void closeAndWriteOutput() throws IOException {
Seq<TmpDestShuffleFile> closeAndWriteOutput() throws IOException {
assert(sorter != null);
updatePeakMemoryUsed();
serBuffer = null;
serOutputStream = null;
final SpillInfo[] spills = sorter.closeAndGetSpills();
sorter = null;
final long[] partitionLengths;
final File dataFile = shuffleBlockResolver.getDataFile(shuffleId, mapId);
final File indexFile = shuffleBlockResolver.getIndexFile(shuffleId, mapId);
final File tmpDataFile = tmpShuffleFile(dataFile);
final File tmpIndexFile = tmpShuffleFile(indexFile);
try {
partitionLengths = mergeSpills(spills);
partitionLengths = mergeSpills(spills, tmpDataFile);
} finally {
for (SpillInfo spill : spills) {
if (spill.file.exists() && ! spill.file.delete()) {
logger.error("Error while deleting spill file {}", spill.file.getPath());
}
}
}
shuffleBlockResolver.writeIndexFile(shuffleId, mapId, partitionLengths);
shuffleBlockResolver.writeIndexFile(shuffleId, mapId, partitionLengths, tmpIndexFile);
mapStatus = MapStatus$.MODULE$.apply(blockManager.shuffleServerId(), partitionLengths);

return JavaConverters.asScalaBufferConverter(Arrays.asList(
new TmpDestShuffleFile(tmpIndexFile, indexFile),
new TmpDestShuffleFile(tmpDataFile, dataFile)
)).asScala();
}

@VisibleForTesting
@@ -248,8 +261,7 @@ void forceSorterToSpill() throws IOException {
*
* @return the partition lengths in the merged file.
*/
private long[] mergeSpills(SpillInfo[] spills) throws IOException {
final File outputFile = shuffleBlockResolver.getDataFile(shuffleId, mapId);
private long[] mergeSpills(SpillInfo[] spills, File outputFile) throws IOException {
final boolean compressionEnabled = sparkConf.getBoolean("spark.shuffle.compress", true);
final CompressionCodec compressionCodec = CompressionCodec$.MODULE$.createCodec(sparkConf);
final boolean fastMergeEnabled =
@@ -475,4 +487,5 @@ public Option<MapStatus> stop(boolean success) {
}
}
}

}
File: ShuffleMapTask.scala
@@ -24,7 +24,7 @@ import scala.language.existentials
import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.shuffle.ShuffleWriter
import org.apache.spark.shuffle.{ShuffleOutputCoordinator, ShuffleWriter}

/**
* A ShuffleMapTask divides the elements of an RDD into multiple buckets (based on a partitioner
@@ -70,8 +70,12 @@ private[spark] class ShuffleMapTask(
try {
val manager = SparkEnv.get.shuffleManager
writer = manager.getWriter[Any, Any](dep.shuffleHandle, partitionId, context)
writer.write(rdd.iterator(partition, context).asInstanceOf[Iterator[_ <: Product2[Any, Any]]])
writer.stop(success = true).get
val tmpToDestFiles = writer.write(
rdd.iterator(partition, context).asInstanceOf[Iterator[_ <: Product2[Any, Any]]])
val mapStatus = writer.stop(success = true).get
// SPARK-8029 make sure only one task on this executor writes the final shuffle files
ShuffleOutputCoordinator.commitOutputs(dep.shuffleId, partitionId, tmpToDestFiles, mapStatus,
SparkEnv.get)._2
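ShuffleOutputCoordinator itself is not part of this excerpt, so the sketch below only illustrates the "first wins" behaviour the call above relies on; the method shape, the locking, the plain (File, File) pairs standing in for TmpDestShuffleFile, and the return value are all assumptions. The idea: under a lock, if the destination files already exist then an earlier attempt has committed and this attempt's temp files are discarded; otherwise the temp files are renamed into place and this attempt's MapStatus wins.

```scala
import java.io.File

import org.apache.spark.scheduler.MapStatus

object FirstWinsCommitSketch {
  // Returns (didThisAttemptWin, mapStatusToReport). In the real coordinator the lock
  // would presumably be scoped per (shuffleId, partitionId) rather than global.
  def commitOutputs(
      shuffleId: Int,
      partitionId: Int,
      tmpToDest: Seq[(File, File)],
      mapStatus: MapStatus): (Boolean, MapStatus) = synchronized {
    val alreadyCommitted =
      tmpToDest.nonEmpty && tmpToDest.forall { case (_, dest) => dest.exists() }
    if (alreadyCommitted) {
      // Another attempt on this executor won: drop this attempt's temp files.
      tmpToDest.foreach { case (tmp, _) => tmp.delete() }
      (false, mapStatus) // the real coordinator reloads the winning MapStatus from disk
    } else {
      tmpToDest.foreach { case (tmp, dest) =>
        if (dest.exists()) dest.delete() // clear any stale partial output
        tmp.renameTo(dest)               // same directory, so this is a cheap rename
      }
      (true, mapStatus)
    }
  }
}
```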
} catch {
case e: Exception =>
try {
File: TaskSetManager.scala
@@ -615,18 +615,22 @@ private[spark] class TaskSetManager(
val index = info.index
info.markSuccessful()
removeRunningTask(tid)
val task = tasks(index)
// This method is called by "TaskSchedulerImpl.handleSuccessfulTask" which holds the
// "TaskSchedulerImpl" lock until exiting. To avoid the SPARK-7655 issue, we should not
// "deserialize" the value when holding a lock to avoid blocking other threads. So we call
// "result.value()" in "TaskResultGetter.enqueueSuccessfulTask" before reaching here.
// Note: "result.value()" only deserializes the value when it's called at the first time, so
// here "result.value()" just returns the value and won't block other threads.
sched.dagScheduler.taskEnded(
tasks(index), Success, result.value(), result.accumUpdates, info, result.metrics)
task, Success, result.value(), result.accumUpdates, info, result.metrics)
if (!successful(index)) {
tasksSuccessful += 1
logInfo("Finished task %s in stage %s (TID %d) in %d ms on %s (%d/%d)".format(
info.id, taskSet.id, info.taskId, info.duration, info.host, tasksSuccessful, numTasks))
// include the partition here b/c on a stage retry, the partition is *not* necessarily
// the same as info.id
logInfo(s"Finished task ${info.id} in stage ${taskSet.id} (TID ${info.taskId}}, " +
s"partition ${task.partitionId}) in ${info.duration} ms on executor ${info.executorId} " +
s"(${info.host}) ($tasksSuccessful/$numTasks)")
// Mark successful and stop if all the tasks have succeeded.
successful(index) = true
if (tasksSuccessful == numTasks) {
File: FileShuffleBlockResolver.scala
@@ -17,6 +17,7 @@

package org.apache.spark.shuffle

import java.io.File
import java.util.concurrent.ConcurrentLinkedQueue

import scala.collection.JavaConverters._
@@ -31,7 +32,7 @@ import org.apache.spark.util.{MetadataCleaner, MetadataCleanerType, TimeStampedH

/** A group of writers for a ShuffleMapTask, one writer per reducer. */
private[spark] trait ShuffleWriterGroup {
val writers: Array[DiskBlockObjectWriter]
val writers: Array[(DiskBlockObjectWriter, File)]

/** @param success Indicates all writes were successful. If false, no blocks will be recorded. */
def releaseWriters(success: Boolean)
@@ -80,10 +81,11 @@ private[spark] class FileShuffleBlockResolver(conf: SparkConf)

val openStartTime = System.nanoTime
val serializerInstance = serializer.newInstance()
val writers: Array[DiskBlockObjectWriter] = {
Array.tabulate[DiskBlockObjectWriter](numReducers) { bucketId =>
val writers: Array[(DiskBlockObjectWriter, File)] = {
Array.tabulate[(DiskBlockObjectWriter, File)](numReducers) { bucketId =>
val blockId = ShuffleBlockId(shuffleId, mapId, bucketId)
val blockFile = blockManager.diskBlockManager.getFile(blockId)
val tmpBlockFile = ShuffleWriter.tmpShuffleFile(blockFile)
// Because of previous failures, the shuffle file may already exist on this machine.
// If so, remove it.
if (blockFile.exists) {
@@ -93,8 +95,8 @@ private[spark] class FileShuffleBlockResolver(conf: SparkConf)
logWarning(s"Failed to remove existing shuffle file $blockFile")
}
}
blockManager.getDiskWriter(blockId, blockFile, serializerInstance, bufferSize,
writeMetrics)
blockManager.getDiskWriter(blockId, tmpBlockFile, serializerInstance, bufferSize,
writeMetrics) -> blockFile
}
}
// Creating the file to write to and creating a disk writer both involve interacting with
Expand Down Expand Up @@ -132,6 +134,13 @@ private[spark] class FileShuffleBlockResolver(conf: SparkConf)
logWarning(s"Error deleting ${file.getPath()}")
}
}
for (mapId <- state.completedMapTasks.asScala) {
val mapStatusFile =
blockManager.diskBlockManager.getFile(ShuffleMapStatusBlockId(shuffleId, mapId))
if (mapStatusFile.exists() && !mapStatusFile.delete()) {
logWarning(s"Error deleting MapStatus file ${mapStatusFile.getPath()}")
}
}
logInfo("Deleted all files for shuffle " + shuffleId)
true
case None =>
File: IndexShuffleBlockResolver.scala
@@ -51,7 +51,7 @@ private[spark] class IndexShuffleBlockResolver(conf: SparkConf) extends ShuffleB
blockManager.diskBlockManager.getFile(ShuffleDataBlockId(shuffleId, mapId, NOOP_REDUCE_ID))
}

private def getIndexFile(shuffleId: Int, mapId: Int): File = {
private[shuffle] def getIndexFile(shuffleId: Int, mapId: Int): File = {
blockManager.diskBlockManager.getFile(ShuffleIndexBlockId(shuffleId, mapId, NOOP_REDUCE_ID))
}

@@ -72,16 +72,20 @@ private[spark] class IndexShuffleBlockResolver(conf: SparkConf) extends ShuffleB
logWarning(s"Error deleting index ${file.getPath()}")
}
}

file = blockManager.diskBlockManager.getFile(ShuffleMapStatusBlockId(shuffleId, mapId))
if (file.exists() && !file.delete()) {
logWarning(s"Error deleting MapStatus file ${file.getPath()}")
}
}

/**
* Write an index file with the offsets of each block, plus a final offset at the end for the
* end of the output file. This will be used by getBlockData to figure out where each block
* begins and ends.
* */
def writeIndexFile(shuffleId: Int, mapId: Int, lengths: Array[Long]): Unit = {
val indexFile = getIndexFile(shuffleId, mapId)
val out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(indexFile)))
* begins and ends. Writes to a temp file, and returns that file.
*/
def writeIndexFile(shuffleId: Int, mapId: Int, lengths: Array[Long], tmpIndexFile: File): Unit = {
val out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(tmpIndexFile)))
Utils.tryWithSafeFinally {
// We take in lengths of each block, need to convert it to offsets.
var offset = 0L
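The method body above is cut off after `var offset = 0L`; for reference, a minimal standalone sketch of the index layout writeIndexFile produces: a leading 0, then one cumulative offset per block, so getBlockData can locate block i as the byte range [offsets(i), offsets(i + 1)). The sketch skips Spark's Utils.tryWithSafeFinally and error handling.

```scala
import java.io.{BufferedOutputStream, DataOutputStream, File, FileOutputStream}

object IndexFileSketch {
  def writeIndex(tmpIndexFile: File, lengths: Array[Long]): Unit = {
    val out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(tmpIndexFile)))
    try {
      // Convert per-block lengths into cumulative offsets, one long per entry.
      var offset = 0L
      out.writeLong(offset)   // the first block always starts at byte 0
      for (length <- lengths) {
        offset += length
        out.writeLong(offset) // end of this block == start of the next
      }
    } finally {
      out.close()
    }
  }
}
```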