Skip to content

Commit c7aeecd

Browse files
jerryshao authored and JoshRosen committed
[SPARK-3948][Shuffle]Fix stream corruption bug in sort-based shuffle
A kernel 2.6.32 bug leads to unexpected behavior of transferTo in copyStream, and this corrupts the shuffle output file in sort-based shuffle, which can introduce PARSING_ERROR(2), deserialization errors, or offset-out-of-range errors. Here we fix this by adding an append flag, and also add some position-checking code. Details can be seen in [SPARK-3948](https://issues.apache.org/jira/browse/SPARK-3948). Author: jerryshao <[email protected]> Closes #2824 from jerryshao/SPARK-3948 and squashes the following commits: be0533a [jerryshao] Address the comments a82b184 [jerryshao] add configuration to control the NIO way of copying stream e17ada2 [jerryshao] Fix kernel 2.6.32 bug that led to unexpected behavior of transferTo
1 parent d1966f3 commit c7aeecd

File tree

2 files changed

+28
-6
lines changed

2 files changed

+28
-6
lines changed

core/src/main/scala/org/apache/spark/util/Utils.scala

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -269,23 +269,44 @@ private[spark] object Utils extends Logging {
269269
dir
270270
}
271271

272-
/** Copy all data from an InputStream to an OutputStream */
272+
/** Copy all data from an InputStream to an OutputStream. NIO way of file stream to file stream
273+
* copying is disabled by default unless explicitly set transferToEnabled as true,
274+
* the parameter transferToEnabled should be configured by spark.file.transferTo = [true|false].
275+
*/
273276
def copyStream(in: InputStream,
274277
out: OutputStream,
275-
closeStreams: Boolean = false): Long =
278+
closeStreams: Boolean = false,
279+
transferToEnabled: Boolean = false): Long =
276280
{
277281
var count = 0L
278282
try {
279-
if (in.isInstanceOf[FileInputStream] && out.isInstanceOf[FileOutputStream]) {
283+
if (in.isInstanceOf[FileInputStream] && out.isInstanceOf[FileOutputStream]
284+
&& transferToEnabled) {
280285
// When both streams are File stream, use transferTo to improve copy performance.
281286
val inChannel = in.asInstanceOf[FileInputStream].getChannel()
282287
val outChannel = out.asInstanceOf[FileOutputStream].getChannel()
288+
val initialPos = outChannel.position()
283289
val size = inChannel.size()
284290

285291
// In case transferTo method transferred less data than we have required.
286292
while (count < size) {
287293
count += inChannel.transferTo(count, size - count, outChannel)
288294
}
295+
296+
// Check the position after transferTo loop to see if it is in the right position and
297+
// give user information if not.
298+
// Position will not be increased to the expected length after calling transferTo in
299+
// kernel version 2.6.32, this issue can be seen in
300+
// https://bugs.openjdk.java.net/browse/JDK-7052359
301+
// This will lead to stream corruption issue when using sort-based shuffle (SPARK-3948).
302+
val finalPos = outChannel.position()
303+
assert(finalPos == initialPos + size,
304+
s"""
305+
|Current position $finalPos do not equal to expected position ${initialPos + size}
306+
|after transferTo, please check your kernel version to see if it is 2.6.32,
307+
|this is a kernel bug which will lead to unexpected behavior when using transferTo.
308+
|You can set spark.file.transferTo = false to disable this NIO feature.
309+
""".stripMargin)
289310
} else {
290311
val buf = new Array[Byte](8192)
291312
var n = 0
@@ -727,7 +748,7 @@ private[spark] object Utils extends Logging {
727748

728749
/**
729750
* Determines if a directory contains any files newer than cutoff seconds.
730-
*
751+
*
731752
* @param dir must be the path to a directory, or IllegalArgumentException is thrown
732753
* @param cutoff measured in seconds. Returns true if there are any files or directories in the
733754
* given directory whose last modified time is later than this many seconds ago

core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ private[spark] class ExternalSorter[K, V, C](
9393
private val conf = SparkEnv.get.conf
9494
private val spillingEnabled = conf.getBoolean("spark.shuffle.spill", true)
9595
private val fileBufferSize = conf.getInt("spark.shuffle.file.buffer.kb", 32) * 1024
96+
private val transferToEnabled = conf.getBoolean("spark.file.transferTo", true)
9697

9798
// Size of object batches when reading/writing from serializers.
9899
//
@@ -705,10 +706,10 @@ private[spark] class ExternalSorter[K, V, C](
705706
var out: FileOutputStream = null
706707
var in: FileInputStream = null
707708
try {
708-
out = new FileOutputStream(outputFile)
709+
out = new FileOutputStream(outputFile, true)
709710
for (i <- 0 until numPartitions) {
710711
in = new FileInputStream(partitionWriters(i).fileSegment().file)
711-
val size = org.apache.spark.util.Utils.copyStream(in, out, false)
712+
val size = org.apache.spark.util.Utils.copyStream(in, out, false, transferToEnabled)
712713
in.close()
713714
in = null
714715
lengths(i) = size

0 commit comments

Comments
 (0)