Added comment about potential inaccuracy of bytesRead

kayousterhout · kayousterhout · commit 8b70cdec6e99 · 2014-06-29T15:00:17.000-07:00
diff --git a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala
@@ -201,6 +201,9 @@ class HadoopRDD[K, V](
       // Set the task input metrics.
       val inputMetrics = new InputMetrics(DataReadMethod.Hadoop)
       try {
+        /* bytesRead is the split's length, which may not exactly equal the bytes a task reads:
+         * split boundaries aren't always at record boundaries, so tasks may need to read into
+         * other splits to complete a record. */
         inputMetrics.bytesRead = split.inputSplit.value.getLength()
       } catch {
         case e: java.io.IOException =>
diff --git a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
@@ -115,6 +115,9 @@ class NewHadoopRDD[K, V](
 
       val inputMetrics = new InputMetrics(DataReadMethod.Hadoop)
       try {
+        /* bytesRead is the split's length, which may not exactly equal the bytes a task reads:
+         * split boundaries aren't always at record boundaries, so tasks may need to read into
+         * other splits to complete a record. */
         inputMetrics.bytesRead = split.serializableHadoopSplit.value.getLength()
       } catch {
         case e: Exception =>