@@ -199,8 +199,8 @@ private[spark] object ReliableCheckpointRDD extends Logging {

     val finalOutputName = ReliableCheckpointRDD.checkpointFileName(ctx.partitionId())
     val finalOutputPath = new Path(outputDir, finalOutputName)
-    val tempOutputPath =
-      new Path(outputDir, s".$finalOutputName-attempt-${ctx.attemptNumber()}")
+    val tempOutputPath = new Path(outputDir,
+      s".$finalOutputName-attempt-${ctx.stageAttemptNumber()}-${ctx.attemptNumber()}")
Member:

stageAttemptNumber isn't unique among stages and attemptNumber isn't unique among tasks within the same stage, so it seems this could still lead to a file name conflict. E.g. task 0.0 from stage 0.0 could conflict with task 1.0 from stage 1.0 (different stage), and task 1.0 from stage 1.0 could conflict with task 2.0 from stage 1.0 (same stage).

I think the unique file format should be ...-stageId-stageAttemptId-taskId-taskAttemptId-....
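
A minimal sketch of the fully-qualified name proposed here, reading taskId as the partition index and taskAttemptId as the per-task attempt number; all four accessors exist on TaskContext, but the exact format is only an illustration, not code from this PR:

    // Sketch only: every component comes from the running task's TaskContext,
    // so no two task attempts in one application build the same temp name.
    val ctx = TaskContext.get()
    val tempOutputName =
      s".$finalOutputName-${ctx.stageId()}-${ctx.stageAttemptNumber()}" +
        s"-${ctx.partitionId()}-${ctx.attemptNumber()}"
    val tempOutputPath = new Path(outputDir, tempOutputName)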

Member Author:

Can an RDD span multiple stages?

Member:

No, but all attempt IDs start from 0.

Member:

Oh, I see, we suffix the checkpoint path with the RDD id...

Contributor:

If we just want a unique file name, can we use the task id? It's unique within the Spark application.

Member Author:

Do you mean taskAttemptId?

Member:

Yes, taskAttemptId. We also use it in the shuffle map file to make the file name unique. #24892 (comment)
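
For reference, a sketch of what naming by taskAttemptId could look like, using the TaskContext (ctx) already in scope in this method; TaskContext.taskAttemptId() returns a Long that is unique across all task attempts in a Spark application. The format below is only an assumption, not necessarily what the follow-up ends up using:

    // Sketch only: taskAttemptId() is unique application-wide, so neither a
    // stage re-attempt nor a task re-attempt can reuse an earlier temp name.
    val tempOutputPath = new Path(outputDir,
      s".$finalOutputName-attempt-${ctx.taskAttemptId()}")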

Member Author:

OK. Let me create a follow-up for it. Thanks.


     val bufferSize = env.conf.get(BUFFER_SIZE)

@@ -218,11 +218,16 @@ private[spark] object ReliableCheckpointRDD extends Logging {
     }
     val serializer = env.serializer.newInstance()
     val serializeStream = serializer.serializeStream(fileOutputStream)
-    Utils.tryWithSafeFinally {
+    Utils.tryWithSafeFinallyAndFailureCallbacks {
       serializeStream.writeAll(iterator)
-    } {
+    } (catchBlock = {
+      val deleted = fs.delete(tempOutputPath, false)
+      if (!deleted) {
+        logInfo(s"Failed to delete tempOutputPath $tempOutputPath.")
+      }
+    }, finallyBlock = {
       serializeStream.close()
-    }
+    })

     if (!fs.rename(tempOutputPath, finalOutputPath)) {
       if (!fs.exists(finalOutputPath)) {
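The switch from Utils.tryWithSafeFinally to Utils.tryWithSafeFinallyAndFailureCallbacks above gives the writer a hook that runs only when writing fails (delete the half-written temp file) plus a block that always runs (close the stream). A hedged sketch of that control flow, not Spark's actual implementation, which also wires in task failure callbacks:

    // Sketch of the behaviour relied on above, not the real Utils method:
    // catchBlock runs only if block throws; finallyBlock runs on every exit.
    def tryWithSafeFinallyAndFailureCallbacks[T](block: => T)
        (catchBlock: => Unit = (), finallyBlock: => Unit = ()): T = {
      try {
        block                          // e.g. serializeStream.writeAll(iterator)
      } catch {
        case t: Throwable =>
          try catchBlock               // e.g. delete the partially written temp file
          catch { case s: Throwable => t.addSuppressed(s) }
          throw t
      } finally {
        finallyBlock                   // e.g. serializeStream.close()
      }
    }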
core/src/test/scala/org/apache/spark/CheckpointSuite.scala (26 additions, 0 deletions)
@@ -28,6 +28,7 @@ import org.apache.spark.internal.config.CACHE_CHECKPOINT_PREFERRED_LOCS_EXPIRE_T
 import org.apache.spark.internal.config.UI._
 import org.apache.spark.io.CompressionCodec
 import org.apache.spark.rdd._
+import org.apache.spark.shuffle.FetchFailedException
 import org.apache.spark.storage.{BlockId, StorageLevel, TestBlockId}
 import org.apache.spark.util.Utils

@@ -642,4 +643,29 @@ class CheckpointStorageSuite extends SparkFunSuite with LocalSparkContext {
       assert(preferredLoc == checkpointedRDD.cachedPreferredLocations.get(partiton))
     }
   }
+
+  test("SPARK-31484: checkpoint should not fail in retry") {
+    withTempDir { checkpointDir =>
+      val conf = new SparkConf()
+        .set(UI_ENABLED.key, "false")
+      sc = new SparkContext("local[1]", "test", conf)
+      sc.setCheckpointDir(checkpointDir.toString)
+      val rdd = sc.makeRDD(1 to 200, numSlices = 4).repartition(1).mapPartitions { iter =>
+        iter.map { i =>
+          if (i > 100 && TaskContext.get().stageAttemptNumber() == 0) {
+            // throw new SparkException("Make first attempt failed.")
+            // Throw FetchFailedException to explicitly trigger stage resubmission.
+            // A normal exception will only trigger task resubmission in the same stage.
+            throw new FetchFailedException(null, 0, 0L, 0, 0, "Fake")
+          } else {
+            i
+          }
+        }
+      }
+      rdd.checkpoint()
+      assert(rdd.collect().toSeq === (1 to 200))
+      // Verify that the RDD is checkpointed
+      assert(rdd.firstParent.isInstanceOf[ReliableCheckpointRDD[_]])
+    }
+  }
 }