Commit 8b2299a

Quadruple instead of double for a minor speedup
Speedup occurs when calling take() on an RDD with most partitions empty
1 parent e5f7e4d commit 8b2299a
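
Why quadrupling helps: each pass of take() launches one job over numPartsToTry partitions and folds the result into partsScanned, so when every pass comes back empty the scanned count grows by 3x per pass under doubling (scanned + 2·scanned) but by 5x under quadrupling (scanned + 4·scanned). Below is a minimal standalone sketch of that loop (plain Python, not the Spark code itself; `jobs_needed` and its parameters are illustrative names) counting the passes needed to exhaust an RDD whose scanned partitions are all empty:

```python
# Standalone model of take()'s scan loop, not the Spark implementation:
# count how many jobs run before all partitions have been tried.
def jobs_needed(total_parts, growth_factor):
    parts_scanned = 0
    jobs = 0
    while parts_scanned < total_parts:
        # The first pass tries 1 partition; afterwards the batch grows
        # geometrically, mirroring numPartsToTry = partsScanned * 2 (or * 4).
        num_parts_to_try = 1 if parts_scanned == 0 else parts_scanned * growth_factor
        parts_scanned += num_parts_to_try
        jobs += 1
    return jobs

for total in (100, 1000, 10000):
    print(total, jobs_needed(total, 2), jobs_needed(total, 4))
# 100   -> 6 jobs vs 4
# 1000  -> 8 jobs vs 6
# 10000 -> 10 jobs vs 7
```

Each avoided pass saves a scheduler round trip, which is where the minor speedup comes from.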

2 files changed: 4 additions & 4 deletions

core/src/main/scala/org/apache/spark/rdd/RDD.scala

Lines changed: 2 additions & 2 deletions
```diff
@@ -1064,10 +1064,10 @@ abstract class RDD[T: ClassTag](
       // greater than totalParts because we actually cap it at totalParts in runJob.
       var numPartsToTry = 1
       if (partsScanned > 0) {
-        // If we didn't find any rows after the previous iteration, double and retry. Otherwise,
+        // If we didn't find any rows after the previous iteration, quadruple and retry. Otherwise,
         // interpolate the number of partitions we need to try, but overestimate it by 50%.
         if (buf.size == 0) {
-          numPartsToTry = partsScanned * 2
+          numPartsToTry = partsScanned * 4
         } else {
           numPartsToTry = (1.5 * num * partsScanned / buf.size).toInt
         }
```
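
As a worked example of the interpolation branch (numbers illustrative): if take(10) has collected 2 rows after scanning 4 partitions, the next pass tries (1.5 * 10 * 4 / 2).toInt = 30 partitions, the 1.5 factor being the 50% overestimate the comment mentions.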

python/pyspark/rdd.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -1032,10 +1032,10 @@ def take(self, num):
             numPartsToTry = 1
             if partsScanned > 0:
                 # If we didn't find any rows after the previous iteration,
-                # double and retry. Otherwise, interpolate the number of
+                # quadruple and retry. Otherwise, interpolate the number of
                 # partitions we need to try, but overestimate it by 50%.
                 if len(items) == 0:
-                    numPartsToTry = partsScanned * 2
+                    numPartsToTry = partsScanned * 4
                 else:
                     numPartsToTry = int(1.5 * num * partsScanned / len(items))
```
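
A hedged sketch of the scenario the commit message describes (assumes a working local pyspark install; names and sizes are illustrative): spreading a handful of elements across many partitions leaves most of them empty, forcing take() through several scan passes.

```python
# Illustrative only: requires pyspark on the local machine.
from pyspark import SparkContext

sc = SparkContext("local[2]", "take-demo")
# 10 elements across 10,000 partitions: almost every partition is empty,
# so take(5) must grow numPartsToTry repeatedly before finding enough rows.
rdd = sc.parallelize(range(10), 10000)
print(rdd.take(5))  # [0, 1, 2, 3, 4]
sc.stop()
```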
