Skip to content

Commit 78bf997

Browse files
author
Davies Liu
committed
Fix tests; do not use numpy in randomSplit, since it gives no performance gain
1 parent f5fdf63 commit 78bf997

File tree

2 files changed

+4
-3
lines changed

2 files changed

+4
-3
lines changed

python/pyspark/rdd.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -325,11 +325,11 @@ def randomSplit(self, weights, seed=None):
325 325   :return: split RDDs in a list
326 326
327 327   >>> rdd = sc.parallelize(range(5), 1)
328     - >>> rdd1, rdd2 = rdd.randomSplit([2, 3], 101)
    328 + >>> rdd1, rdd2 = rdd.randomSplit([2, 3], 17)
329 329   >>> rdd1.collect()
330     - [2, 3]
    330 + [1, 3]
331 331   >>> rdd2.collect()
332     - [0, 1, 4]
    332 + [0, 2, 4]
333 333   """
334 334   s = float(sum(weights))
335 335   cweights = [0.0]

python/pyspark/rddsampler.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@ class RDDRangeSampler(RDDSamplerBase):
119 119
120 120   def __init__(self, lowerBound, upperBound, seed=None):
121 121       RDDSamplerBase.__init__(self, False, seed)
    122 +     self._use_numpy = False # no performance gain from numpy
122 123       self._lowerBound = lowerBound
123 124       self._upperBound = upperBound
124 125

0 commit comments

Comments
 (0)