Commit 8b2299a

Quadruple instead of double for a minor speedup
Speedup occurs when calling take() on an RDD with most partitions empty
1 parent e5f7e4d commit 8b2299a
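
Why quadrupling helps: each pass of take() launches one job over numPartsToTry partitions and folds the result into partsScanned, so when every pass comes back empty the scanned count grows by 3x per pass under doubling (scanned + 2·scanned) but by 5x under quadrupling (scanned + 4·scanned). Below is a minimal standalone sketch of that loop (plain Python, not the Spark code itself; `jobs_needed` and its parameters are illustrative names) counting the passes needed to exhaust an RDD whose scanned partitions are all empty:

```python
# Standalone model of take()'s scan loop, not the Spark implementation:
# count how many jobs run before all partitions have been tried.
def jobs_needed(total_parts, growth_factor):
    parts_scanned = 0
    jobs = 0
    while parts_scanned < total_parts:
        # The first pass tries 1 partition; afterwards the batch grows
        # geometrically, mirroring numPartsToTry = partsScanned * 2 (or * 4).
        num_parts_to_try = 1 if parts_scanned == 0 else parts_scanned * growth_factor
        parts_scanned += num_parts_to_try
        jobs += 1
    return jobs

for total in (100, 1000, 10000):
    print(total, jobs_needed(total, 2), jobs_needed(total, 4))
# 100   -> 6 jobs vs 4
# 1000  -> 8 jobs vs 6
# 10000 -> 10 jobs vs 7
```

Each avoided pass saves a scheduler round trip, which is where the minor speedup comes from.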

2 files changed: 4 additions & 4 deletions

core/src/main/scala/org/apache/spark/rdd/RDD.scala

Lines changed: 2 additions & 2 deletions
```diff
@@ -1064,10 +1064,10 @@ abstract class RDD[T: ClassTag](
       // greater than totalParts because we actually cap it at totalParts in runJob.
       var numPartsToTry = 1
       if (partsScanned > 0) {
-        // If we didn't find any rows after the previous iteration, double and retry. Otherwise,
+        // If we didn't find any rows after the previous iteration, quadruple and retry. Otherwise,
         // interpolate the number of partitions we need to try, but overestimate it by 50%.
         if (buf.size == 0) {
-          numPartsToTry = partsScanned * 2
+          numPartsToTry = partsScanned * 4
         } else {
           numPartsToTry = (1.5 * num * partsScanned / buf.size).toInt
         }
```
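
As a worked example of the interpolation branch (numbers illustrative): if take(10) has collected 2 rows after scanning 4 partitions, the next pass tries (1.5 * 10 * 4 / 2).toInt = 30 partitions, the 1.5 factor being the 50% overestimate the comment mentions.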

python/pyspark/rdd.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -1032,10 +1032,10 @@ def take(self, num):
             numPartsToTry = 1
             if partsScanned > 0:
                 # If we didn't find any rows after the previous iteration,
-                # double and retry. Otherwise, interpolate the number of
+                # quadruple and retry. Otherwise, interpolate the number of
                 # partitions we need to try, but overestimate it by 50%.
                 if len(items) == 0:
-                    numPartsToTry = partsScanned * 2
+                    numPartsToTry = partsScanned * 4
                 else:
                     numPartsToTry = int(1.5 * num * partsScanned / len(items))
```
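
A hedged sketch of the scenario the commit message describes (assumes a working local pyspark install; names and sizes are illustrative): spreading a handful of elements across many partitions leaves most of them empty, forcing take() through several scan passes.

```python
# Illustrative only: requires pyspark on the local machine.
from pyspark import SparkContext

sc = SparkContext("local[2]", "take-demo")
# 10 elements across 10,000 partitions: almost every partition is empty,
# so take(5) must grow numPartsToTry repeatedly before finding enough rows.
rdd = sc.parallelize(range(10), 10000)
print(rdd.take(5))  # [0, 1, 2, 3, 4]
sc.stop()
```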
