
Commit a9f66fb

WeichenXu123 authored and Robert Kruszewski committed
[SPARK-18003][SPARK CORE] Fix bug of RDD zipWithIndex & zipWithUniqueId index value overflowing
## What changes were proposed in this pull request?

- Fix a bug where RDD `zipWithIndex` generates a wrong result when one partition contains more than 2147483647 records.
- Fix a bug where RDD `zipWithUniqueId` generates a wrong result when one partition contains more than 2147483647 records.

## How was this patch tested?

Test added.

Author: WeichenXu <[email protected]>

Closes apache#15550 from WeichenXu123/fix_rdd_zipWithIndex_overflow.
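For context, Scala's `Iterator.zipWithIndex` pairs each element with an `Int` index, so the index wraps to a negative value once a single partition yields more than Int.MaxValue (2147483647) records. A minimal standalone sketch of the wraparound (illustration only, not part of this commit):

object IntIndexOverflowSketch {
  def main(args: Array[String]): Unit = {
    // Int.MaxValue is the largest index an Int-based zipWithIndex can represent.
    val lastIntIndex: Int = Int.MaxValue
    println(lastIntIndex + 1)   // -2147483648: Int arithmetic silently wraps around
    // Tracking the index as a Long, as this patch does, keeps counting correctly.
    val lastLongIndex: Long = Int.MaxValue.toLong
    println(lastLongIndex + 1L) // 2147483648
  }
}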
1 parent d867a3c · commit a9f66fb

File tree: 4 files changed (+25, -4 lines)


core/src/main/scala/org/apache/spark/rdd/RDD.scala

Lines changed: 1 addition & 1 deletion

@@ -1278,7 +1278,7 @@ abstract class RDD[T: ClassTag](
   def zipWithUniqueId(): RDD[(T, Long)] = withScope {
     val n = this.partitions.length.toLong
     this.mapPartitionsWithIndex { case (k, iter) =>
-      iter.zipWithIndex.map { case (item, i) =>
+      Utils.getIteratorZipWithIndex(iter, 0L).map { case (item, i) =>
         (item, i * n + k)
       }
     }
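The mapping `(item, i * n + k)` gives element `i` of partition `k` (with `n` partitions in total) the unique id `i * n + k`, so ids from different partitions fall into disjoint residue classes modulo `n` and can never collide. A small worked sketch with example values (3 partitions of 2 elements each; not part of the commit):

// With n = 3 partitions, partition k emits ids k, k + n, k + 2n, ...
val n = 3L
for (k <- 0L until n; i <- 0L until 2L) {
  println(s"partition $k, element $i -> unique id ${i * n + k}")
}
// partition 0 -> 0, 3; partition 1 -> 1, 4; partition 2 -> 2, 5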

core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala

Lines changed: 2 additions & 3 deletions

@@ -64,8 +64,7 @@ class ZippedWithIndexRDD[T: ClassTag](prev: RDD[T]) extends RDD[(T, Long)](prev)

   override def compute(splitIn: Partition, context: TaskContext): Iterator[(T, Long)] = {
     val split = splitIn.asInstanceOf[ZippedWithIndexRDDPartition]
-    firstParent[T].iterator(split.prev, context).zipWithIndex.map { x =>
-      (x._1, split.startIndex + x._2)
-    }
+    val parentIter = firstParent[T].iterator(split.prev, context)
+    Utils.getIteratorZipWithIndex(parentIter, split.startIndex)
   }
 }
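Unlike `zipWithUniqueId`, `zipWithIndex` assigns consecutive global indices, so each partition's `startIndex` must equal the total record count of all earlier partitions (which is why `ZippedWithIndexRDD` first counts the preceding partitions). A standalone sketch of how the offsets compose, using hypothetical in-memory data rather than the commit's code:

// Per-partition start offsets are a prefix sum of partition sizes.
val parts = Seq(Seq("a", "b"), Seq("c"), Seq("d", "e"))
val startIndexes = parts.map(_.size.toLong).scanLeft(0L)(_ + _)
val indexed = parts.zip(startIndexes).flatMap { case (part, start) =>
  part.zipWithIndex.map { case (item, i) => (item, start + i) }
}
println(indexed) // List((a,0), (b,1), (c,2), (d,3), (e,4))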

core/src/main/scala/org/apache/spark/util/Utils.scala

Lines changed: 15 additions & 0 deletions

@@ -1759,6 +1759,21 @@ private[spark] object Utils extends Logging {
     count
   }

+  /**
+   * Generate a zipWithIndex iterator, avoid index value overflowing problem
+   * in scala's zipWithIndex
+   */
+  def getIteratorZipWithIndex[T](iterator: Iterator[T], startIndex: Long): Iterator[(T, Long)] = {
+    new Iterator[(T, Long)] {
+      var index: Long = startIndex - 1L
+      def hasNext: Boolean = iterator.hasNext
+      def next(): (T, Long) = {
+        index += 1L
+        (iterator.next(), index)
+      }
+    }
+  }
+
   /**
    * Creates a symlink.
    *
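A quick usage sketch of the new helper (standalone, assuming `Utils.getIteratorZipWithIndex` is in scope as defined above): starting the Long index just below Int.MaxValue shows the boundary being crossed without wraparound, the same trick the test below uses:

val it = Utils.getIteratorZipWithIndex(Iterator("a", "b", "c"), Int.MaxValue - 1L)
it.foreach(println)
// (a,2147483646)
// (b,2147483647)
// (c,2147483648)  <- one past Int.MaxValue; an Int index would have wrapped to -2147483648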

core/src/test/scala/org/apache/spark/util/UtilsSuite.scala

Lines changed: 7 additions & 0 deletions

@@ -396,6 +396,13 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging {
     assert(Utils.getIteratorSize(iterator) === 5L)
   }

+  test("getIteratorZipWithIndex") {
+    val iterator = Utils.getIteratorZipWithIndex(Iterator(0, 1, 2), -1L + Int.MaxValue)
+    assert(iterator.toArray === Array(
+      (0, -1L + Int.MaxValue), (1, 0L + Int.MaxValue), (2, 1L + Int.MaxValue)
+    ))
+  }
+
   test("doesDirectoryContainFilesNewerThan") {
     // create some temporary directories and files
     val parent: File = Utils.createTempDir()
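Note that the test starts at `-1L + Int.MaxValue` deliberately: the three returned indices straddle the Int.MaxValue boundary, demonstrating that no wraparound occurs without having to drive the iterator through more than two billion elements.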
