-
Notifications
You must be signed in to change notification settings - Fork 28.9k
SPARK-2791: Fix committing, reverting and state tracking in shuffle file consolidation #1678
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -39,16 +39,16 @@ private[spark] abstract class BlockObjectWriter(val blockId: BlockId) { | |
| def isOpen: Boolean | ||
|
|
||
| /** | ||
| * Flush the partial writes and commit them as a single atomic block. Return the | ||
| * number of bytes written for this commit. | ||
| * Flush the partial writes and commit them as a single atomic block. | ||
| */ | ||
| def commit(): Long | ||
| def commitAndClose(): Unit | ||
|
|
||
| /** | ||
| * Reverts writes that haven't been flushed yet. Callers should invoke this function | ||
| * when there are runtime exceptions. | ||
| * when there are runtime exceptions. This method will not throw, though it may be | ||
| * unsuccessful in truncating written data. | ||
| */ | ||
| def revertPartialWrites() | ||
| def revertPartialWritesAndClose() | ||
|
|
||
| /** | ||
| * Writes an object. | ||
|
|
@@ -57,6 +57,7 @@ private[spark] abstract class BlockObjectWriter(val blockId: BlockId) { | |
|
|
||
| /** | ||
| * Returns the file segment of committed data that this Writer has written. | ||
| * This is only valid after commitAndClose() has been called. | ||
| */ | ||
| def fileSegment(): FileSegment | ||
|
|
||
|
|
@@ -108,15 +109,14 @@ private[spark] class DiskBlockObjectWriter( | |
| private var ts: TimeTrackingOutputStream = null | ||
| private var objOut: SerializationStream = null | ||
| private val initialPosition = file.length() | ||
| private var lastValidPosition = initialPosition | ||
| private var finalPosition: Long = -1 | ||
| private var initialized = false | ||
| private var _timeWriting = 0L | ||
|
|
||
| override def open(): BlockObjectWriter = { | ||
| fos = new FileOutputStream(file, true) | ||
| ts = new TimeTrackingOutputStream(fos) | ||
| channel = fos.getChannel() | ||
| lastValidPosition = initialPosition | ||
| bs = compressStream(new BufferedOutputStream(ts, bufferSize)) | ||
| objOut = serializer.newInstance().serializeStream(bs) | ||
| initialized = true | ||
|
|
@@ -147,28 +147,36 @@ private[spark] class DiskBlockObjectWriter( | |
|
|
||
| override def isOpen: Boolean = objOut != null | ||
|
|
||
| override def commit(): Long = { | ||
| override def commitAndClose(): Unit = { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We should remove close from the interface, and make it private to this class, by the way.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Absolutely -- I did not do that in this patch because ExternalAppendOnlyMap did a close without a commit, which is a fix outside of the scope of this PR, but definitely one that should be made.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. When I merged the sort patch and modified EAOM, it was simply a matter of replacing close with commitAndClose.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Removing close() actually now requires a very minor refactor of ExternalSorter for the |
||
| if (initialized) { | ||
| // NOTE: Because Kryo doesn't flush the underlying stream we explicitly flush both the | ||
| // serializer stream and the lower level stream. | ||
| objOut.flush() | ||
| bs.flush() | ||
| val prevPos = lastValidPosition | ||
| lastValidPosition = channel.position() | ||
| lastValidPosition - prevPos | ||
| } else { | ||
| // lastValidPosition is zero if stream is uninitialized | ||
| lastValidPosition | ||
| close() | ||
| } | ||
| finalPosition = file.length() | ||
| } | ||
|
|
||
| override def revertPartialWrites() { | ||
| if (initialized) { | ||
| // Discard current writes. We do this by flushing the outstanding writes and | ||
| // truncate the file to the last valid position. | ||
| objOut.flush() | ||
| bs.flush() | ||
| channel.truncate(lastValidPosition) | ||
| // Discard current writes. We do this by flushing the outstanding writes and then | ||
| // truncating the file to its initial position. | ||
| override def revertPartialWritesAndClose() { | ||
| try { | ||
| if (initialized) { | ||
| objOut.flush() | ||
| bs.flush() | ||
| close() | ||
| } | ||
|
|
||
| val truncateStream = new FileOutputStream(file, true) | ||
| try { | ||
| truncateStream.getChannel.truncate(initialPosition) | ||
| } finally { | ||
| truncateStream.close() | ||
| } | ||
| } catch { | ||
| case e: Exception => | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This catches |
||
| logError("Uncaught exception while reverting partial writes to file " + file, e) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. In the use of writers in HashShuffleWriter, it is possible for a closed stream to be reverted (if some other stream's close failed, for example).
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Closed streams should not inherently throw (since we check
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I meant the former case: close on a writer fails with an exception, while earlier streams succeeded. On the face of it, I agree, it should not cause issues; but then, since the expectation from this class is never enforced, it can silently fail.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm not certain I understand. The situation I am imagining is that we commit to the first Writer, then the second one fails. In HashShuffleWriter, we will then call revertPartialWritesAndClose() on all Writers, causing us to revert all the changes back to "initialPosition", which should revert even the committed data.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ah, did not notice that the "if (initialized)" did not include the truncate call! |
||
| } | ||
| } | ||
|
|
||
|
|
@@ -188,6 +196,7 @@ private[spark] class DiskBlockObjectWriter( | |
|
|
||
| // Only valid if called after commit() | ||
| override def bytesWritten: Long = { | ||
| lastValidPosition - initialPosition | ||
| assert(finalPosition != -1, "bytesWritten is only valid after successful commit()") | ||
| finalPosition - initialPosition | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
revert can throw an exception, which will cause other writers to not revert.
We need to wrap it in try/catch, log, and continue.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Revert actually doesn't throw, per its (updated) comment.