[SPARK-8582][Core] Add CheckpointingIterator to optimize checkpointing #7021
Changes from all commits: d863516, 1a3055e, 3c5b203, a829a7d, 2f43ff3
@@ -38,7 +38,7 @@ import org.apache.spark.partial.CountEvaluator
 import org.apache.spark.partial.GroupedCountEvaluator
 import org.apache.spark.partial.PartialResult
 import org.apache.spark.storage.StorageLevel
-import org.apache.spark.util.{BoundedPriorityQueue, Utils}
+import org.apache.spark.util.{BoundedPriorityQueue, CheckpointingIterator, Utils}
 import org.apache.spark.util.collection.OpenHashMap
 import org.apache.spark.util.random.{BernoulliSampler, PoissonSampler, BernoulliCellSampler,
   SamplingUtils}

@@ -238,11 +238,16 @@ abstract class RDD[T: ClassTag](
    * subclasses of RDD.
    */
   final def iterator(split: Partition, context: TaskContext): Iterator[T] = {
-    if (storageLevel != StorageLevel.NONE) {
+    val iter = if (storageLevel != StorageLevel.NONE) {
       SparkEnv.get.cacheManager.getOrCompute(this, split, context, storageLevel)
     } else {
       computeOrReadCheckpoint(split, context)
     }
+    if (checkpointData.isDefined) {
+      checkpointData.get.getCheckpointIterator(iter, context, split.index)
+    } else {
+      iter
+    }

Contributor: this could be …
   }

   /**
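For context, the shape of this change in isolation: compute (or fetch from the cache) the partition iterator exactly as before, then optionally wrap it with something that writes checkpoint data as a side effect of consumption. A self-contained toy sketch (names are stand-ins, not Spark's):

```scala
// Toy sketch of the wrap-on-iterate idea above; not Spark code.
object IteratorWrapDemo {
  def iterator[T](compute: () => Iterator[T],
                  checkpointWrap: Option[Iterator[T] => Iterator[T]]): Iterator[T] = {
    val iter = compute()
    // Mirrors the added branch: wrap only if checkpointing was requested.
    checkpointWrap.map(wrap => wrap(iter)).getOrElse(iter)
  }

  def main(args: Array[String]): Unit = {
    val wrapped = iterator(() => Iterator(1, 2, 3),
      Some(it => it.map { x => println(s"checkpointing $x"); x }))
    println(wrapped.sum)  // each element is "checkpointed" once as it is consumed, then 6
  }
}
```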

@@ -19,9 +19,11 @@ package org.apache.spark.rdd

 import scala.reflect.ClassTag

+import org.apache.hadoop.fs.FileSystem
+import org.apache.hadoop.fs.Path

 import org.apache.spark._
+import org.apache.spark.util.{CheckpointingIterator, SerializableConfiguration}
-import org.apache.spark.util.SerializableConfiguration

 /**

@@ -44,6 +46,26 @@ private[spark] class RDDCheckpointData[T: ClassTag](@transient rdd: RDD[T])

   import CheckpointState._

+  // SparkContext is transient in RDD, so the id and checkpointDir cannot be read from it
+  // later; keep a copy of both here.

Contributor: Not sure if I understand this comment. How did it work before then? Even before this patch …
+  // The id of the RDD being checkpointed.
+  val rddId: Int = rdd.id
+
+  // The path the checkpoint data will be written to.
+  val checkpointDir = rdd.context.checkpointDir
+  @transient var checkpointPath: Path = null
+  @transient var fs: FileSystem = null
+  if (checkpointDir.isDefined) {
+    checkpointPath = new Path(checkpointDir.get, "rdd-" + rddId)
+    fs = checkpointPath.getFileSystem(rdd.context.hadoopConfiguration)
+    if (!fs.mkdirs(checkpointPath)) {
+      throw new SparkException("Failed to create checkpoint path " + checkpointPath)
+    }
+  }

Contributor: these don't need to be vars, right? In fact, …

+  val broadcastedConf = rdd.context.broadcast(
+    new SerializableConfiguration(rdd.context.hadoopConfiguration))

Contributor: please make all of these private

   // The checkpoint state of the associated RDD.
   private var cpState = Initialized
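On the reviewers' points above (no vars, private visibility), a sketch of how these new fields could be declared instead. The Option-based, lazily initialized shape below is an editorial suggestion, not code from the PR:

```scala
// Sketch only: same field names as the diff, reworked as private vals.
private val rddId: Int = rdd.id
private val checkpointDir: Option[String] = rdd.context.checkpointDir

// Resolve the path (and create the directory) once, without mutable state.
@transient private lazy val checkpointPath: Option[Path] = checkpointDir.map { dir =>
  val path = new Path(dir, "rdd-" + rddId)
  val fs = path.getFileSystem(rdd.context.hadoopConfiguration)
  if (!fs.mkdirs(path)) {
    throw new SparkException("Failed to create checkpoint path " + path)
  }
  path
}
```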

@@ -66,6 +88,27 @@ private[spark] class RDDCheckpointData[T: ClassTag](@transient rdd: RDD[T])
     cpFile
   }

+  // Get the iterator used to write checkpoint data to HDFS

Contributor: can you make this a real java doc: …
+  def getCheckpointIterator(
+      rddIterator: Iterator[T],
+      context: TaskContext,
+      partitionId: Int): Iterator[T] = {
+    RDDCheckpointData.synchronized {
+      if (cpState == Initialized) {
+        // Create the output path for the checkpoint
+        val path = new Path(checkpointDir.get, "rdd-" + rddId)

Contributor: shouldn't this read from …

Contributor: actually, the path here should just be …
+        CheckpointingIterator[T](
+          rddIterator,
+          path.toString,
+          broadcastedConf,
+          partitionId,
+          context)
+      } else {
+        rddIterator
+      }
+    }
+  }
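Putting the review comments above together (a real scaladoc, and reusing the path already computed in the constructor rather than rebuilding it), the method could look roughly like this. This is an editorial sketch, not a revision taken from the PR:

```scala
/**
 * Wrap `rddIterator` so that elements are written to this RDD's checkpoint file for
 * `partitionId` as they are consumed. Returns the original iterator unchanged if the
 * checkpoint state is no longer Initialized.
 *
 * @param rddIterator iterator over one partition of the RDD being checkpointed
 * @param context     task context of the running task
 * @param partitionId index of the partition being computed
 */
def getCheckpointIterator(
    rddIterator: Iterator[T],
    context: TaskContext,
    partitionId: Int): Iterator[T] = {
  RDDCheckpointData.synchronized {
    if (cpState == Initialized) {
      // Reuse the path computed in the constructor instead of rebuilding it here.
      CheckpointingIterator[T](
        rddIterator, checkpointPath.toString, broadcastedConf, partitionId, context)
    } else {
      rddIterator
    }
  }
}
```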

   /**
    * Materialize this RDD and write its content to a reliable DFS.
    * This is called immediately after the first action invoked on this RDD has completed.

@@ -82,25 +125,13 @@ private[spark] class RDDCheckpointData[T: ClassTag](@transient rdd: RDD[T])
       }
     }

-    // Create the output path for the checkpoint
-    val path = RDDCheckpointData.rddCheckpointDataPath(rdd.context, rdd.id).get
-    val fs = path.getFileSystem(rdd.context.hadoopConfiguration)
-    if (!fs.mkdirs(path)) {
-      throw new SparkException(s"Failed to create checkpoint path $path")
-    }

     // Save to file, and reload it as an RDD
-    val broadcastedConf = rdd.context.broadcast(
-      new SerializableConfiguration(rdd.context.hadoopConfiguration))
+    val path = checkpointPath

Contributor: no need to declare another variable here? Just use …
     val newRDD = new CheckpointRDD[T](rdd.context, path.toString)
     if (rdd.conf.getBoolean("spark.cleaner.referenceTracking.cleanCheckpoints", false)) {
       rdd.context.cleaner.foreach { cleaner =>
-        cleaner.registerRDDCheckpointDataForCleanup(newRDD, rdd.id)
+        cleaner.registerRDDCheckpointDataForCleanup(newRDD, rddId)
       }
     }

-    // TODO: This is expensive because it computes the RDD again unnecessarily (SPARK-8582)
-    rdd.context.runJob(rdd, CheckpointRDD.writeToFile[T](path.toString, broadcastedConf) _)
     if (newRDD.partitions.length != rdd.partitions.length) {
       throw new SparkException(
         "Checkpoint RDD " + newRDD + "(" + newRDD.partitions.length + ") has different " +

@@ -114,7 +145,7 @@ private[spark] class RDDCheckpointData[T: ClassTag](@transient rdd: RDD[T])
       rdd.markCheckpointed(newRDD) // Update the RDD's dependencies and partitions
       cpState = Checkpointed

Contributor: The state transition here is incorrect. At this point the RDD has not been checkpointed yet. It's not safe to truncate the RDD's lineage until we drain the iterator.
     }
-    logInfo(s"Done checkpointing RDD ${rdd.id} to $path, new parent is RDD ${newRDD.id}")
+    logInfo(s"Done checkpointing RDD ${rddId} to $path, new parent is RDD ${newRDD.id}")
   }

   def getPartitions: Array[Partition] = RDDCheckpointData.synchronized {
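To make that concern concrete: after the first action, some partitions may never have been fully iterated (for example, under `take`), so their checkpoint files may be missing or incomplete when `doCheckpoint()` runs. A sketch of the kind of guard that would be needed before truncating the lineage (editorial only; `listPartitionFiles` is a hypothetical helper, not part of the PR):

```scala
// Sketch only: verify that every partition file was actually committed before
// flipping the state and dropping the parent lineage.
val committedFiles = listPartitionFiles(fs, checkpointPath)  // hypothetical helper
if (committedFiles.size == rdd.partitions.length) {
  rdd.markCheckpointed(newRDD)  // safe: all checkpoint data is on stable storage
  cpState = Checkpointed
} else {
  // Fall back to writing the missing partitions with a job, as the old code did,
  // before the lineage is truncated.
}
```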

@@ -0,0 +1,151 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.util

import java.io.IOException

import scala.reflect.ClassTag

import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.fs.Path

import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.CheckpointRDD
import org.apache.spark.serializer.SerializationStream

/**
 * Wrapper around an iterator that writes checkpoint data to HDFS while an action runs on
 * the RDD, so the RDD can be checkpointed without being recomputed.
 */
private[spark] class CheckpointingIterator[A: ClassTag](
    values: Iterator[A],
    path: String,
    broadcastedConf: Broadcast[SerializableConfiguration],
    partitionId: Int,
    context: TaskContext,
    blockSize: Int = -1) extends Iterator[A] with Logging {

Contributor: In the java doc, could you document what each of these variables represent?
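A possible way to answer that request, sketching parameter docs for the constructor (editorial, inferred from how each field is used below, not text from the PR):

```scala
/**
 * @param values          the underlying iterator whose elements are passed through
 * @param path            directory into which this RDD's checkpoint files are written
 * @param broadcastedConf broadcast Hadoop configuration used to obtain the FileSystem
 * @param partitionId     index of the partition this iterator is computing
 * @param context         TaskContext of the task consuming the iterator
 * @param blockSize       HDFS block size override used only for testing (-1 means default)
 */
```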

  private val env = SparkEnv.get
  private var fs: FileSystem = null
  private val bufferSize = env.conf.getInt("spark.buffer.size", 65536)
  private var serializeStream: SerializationStream = null

  private var finalOutputPath: Path = null
  private var tempOutputPath: Path = null

  /**
   * Initialize this iterator by creating the temporary output path and the serializer instance.
   */
  def init(): this.type = {

Contributor: can we remove this …
    val outputDir = new Path(path)
    fs = outputDir.getFileSystem(broadcastedConf.value.value)

    val finalOutputName = CheckpointRDD.splitIdToFile(partitionId)
    finalOutputPath = new Path(outputDir, finalOutputName)
    tempOutputPath =
      new Path(outputDir, "." + finalOutputName + "-attempt-" + context.attemptNumber)

    if (fs.exists(tempOutputPath)) {

Member (Author): It is possible that more than one iterator for the same split is created and used, e.g., …
      // More than one iterator over this partition is being consumed.
      // Don't checkpoint data in this iterator.
      doCheckpoint = false
      return this
    }

    val fileOutputStream = if (blockSize < 0) {
      fs.create(tempOutputPath, false, bufferSize)
    } else {
      // This is mainly for testing purposes
      fs.create(tempOutputPath, false, bufferSize, fs.getDefaultReplication, blockSize)
    }
    val serializer = env.serializer.newInstance()
    serializeStream = serializer.serializeStream(fileOutputStream)
    this
  }

  /**
   * Called by `hasNext` once the underlying iterator has been exhausted.
   * Renames the temporary output path to the final output path of the checkpoint data.
   */
  def completion(): Unit = {

Contributor: what does …
    if (!doCheckpoint) {
      return
    }

    serializeStream.close()

    if (!fs.rename(tempOutputPath, finalOutputPath)) {
      if (!fs.exists(finalOutputPath)) {
        logInfo("Deleting tempOutputPath " + tempOutputPath)
        fs.delete(tempOutputPath, false)
        throw new IOException("Checkpoint failed: failed to save output of task: "
          + context.attemptNumber + " and final output path does not exist")
      } else {
        // Some other copy of this task must've finished before us and renamed it
        logInfo("Final output path " + finalOutputPath + " already exists; not overwriting it")
        fs.delete(tempOutputPath, false)
      }
    }
  }

  def checkpointing(item: A): Unit = {
    serializeStream.writeObject(item)
  }

  override def next(): A = {
    val item = values.next()
    if (doCheckpoint) {
      checkpointing(item)
    }
    // If this is the last item, calling hasNext here finalizes the checkpoint file early.
    hasNext

Member (Author): Sometimes the rdd.iterator will not be consumed to call …
    item
  }
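To make the author's point concrete: a consumer that already knows how many elements it wants may never call `hasNext` after its final `next()`, so the eager `hasNext` probe inside `next()` is what commits the checkpoint file in that case. A small illustration (editorial; `copyN` is a hypothetical consumer, not Spark code):

```scala
// Hypothetical consumer that never calls hasNext after the last next().
def copyN[A](it: Iterator[A], n: Int): Seq[A] = (1 to n).map(_ => it.next())
// If `it` is a CheckpointingIterator over exactly n elements, the hasNext call inside
// next() is the only thing that exhausts `values` and triggers completion().
```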

  private[this] var doCheckpoint = true
  private[this] var completed = false

Contributor: please declare all variables at the top

  override def hasNext: Boolean = {
    val r = values.hasNext
    if (!r && !completed) {
      completed = true
      completion()
    }
    r
  }
}

private[spark] object CheckpointingIterator {
  def apply[A: ClassTag](
      values: Iterator[A],
      path: String,
      broadcastedConf: Broadcast[SerializableConfiguration],
      partitionId: Int,
      context: TaskContext,
      blockSize: Int = -1): CheckpointingIterator[A] = {
    new CheckpointingIterator[A](
      values,
      path,
      broadcastedConf,
      partitionId,
      context,
      blockSize).init()
  }
}
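For orientation, this mirrors how the companion object is invoked from `RDDCheckpointData.getCheckpointIterator` in the diff above; the inline comments describing the arguments are editorial:

```scala
// Illustrative call site, as in getCheckpointIterator above.
val wrapped: Iterator[T] = CheckpointingIterator[T](
  rddIterator,     // the iterator produced by compute or the cache manager
  path.toString,   // "<checkpointDir>/rdd-<rddId>"
  broadcastedConf, // broadcast Hadoop configuration
  partitionId,     // index of the partition being written
  context)         // TaskContext of the running task
// Every element pulled from `wrapped` is also serialized to the checkpoint file;
// once the underlying iterator is exhausted, the temp file is renamed into place.
```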

@@ -359,7 +359,7 @@ class CheckpointSuite extends SparkFunSuite with LocalSparkContext with Logging
    * have large size.
    */
   def generateFatPairRDD(): RDD[(Int, Int)] = {
-    new FatPairRDD(sc.makeRDD(1 to 100, 4), partitioner).mapValues(x => x)
+    new FatPairRDD(sc.makeRDD(1 to 100, 2), partitioner).mapValues(x => x)

Member (Author): The partitioners of the RDDs might have different numPartitions, which would cause errors later.
   }

   /**
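The mismatch the author is guarding against, as a sketch: the base RDD's partition count should match the partitioner used by the suite (the `HashPartitioner(2)` below is an assumption for illustration, not quoted from the suite):

```scala
// Illustrative only: keep the base RDD's partition count in sync with the partitioner.
val partitioner = new org.apache.spark.HashPartitioner(2)
val base = sc.makeRDD(1 to 100, partitioner.numPartitions)  // 2 partitions, matching
```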