Skip to content

Commit 8b70cde

Browse files
committed
Added comment about potential inaccuracy of bytesRead
1 parent d1016e8 commit 8b70cde

File tree

2 files changed

+6
-0
lines changed

2 files changed

+6
-0
lines changed

core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -201,6 +201,9 @@ class HadoopRDD[K, V](
201201
// Set the task input metrics.
202202
val inputMetrics = new InputMetrics(DataReadMethod.Hadoop)
203203
try {
204+
/* bytesRead may not exactly equal the bytes read by a task: split boundaries aren't
205+
* always at record boundaries, so tasks may need to read into other splits to complete
206+
* a record. */
204207
inputMetrics.bytesRead = split.inputSplit.value.getLength()
205208
} catch {
206209
case e: java.io.IOException =>

core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -115,6 +115,9 @@ class NewHadoopRDD[K, V](
115115

116116
val inputMetrics = new InputMetrics(DataReadMethod.Hadoop)
117117
try {
118+
/* bytesRead may not exactly equal the bytes read by a task: split boundaries aren't
119+
* always at record boundaries, so tasks may need to read into other splits to complete
120+
* a record. */
118121
inputMetrics.bytesRead = split.serializableHadoopSplit.value.getLength()
119122
} catch {
120123
case e: Exception =>

0 commit comments

Comments
 (0)