From dc385ef6f103a28361de3e7599a1e15528973180 Mon Sep 17 00:00:00 2001
From: freeman
Date: Wed, 22 Oct 2014 02:45:40 -0400
Subject: [PATCH] Change maximum value for default seed

- Fixes bug in NumPy v1.9 which truncates random seeds larger than or equal to 2 ** 32
- Add an extra test for sampling with default seed

---
 python/pyspark/rddsampler.py | 4 ++--
 python/pyspark/tests.py      | 6 ++++++
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/rddsampler.py b/python/pyspark/rddsampler.py
index 55e247da0e4dc..528a181e8905a 100644
--- a/python/pyspark/rddsampler.py
+++ b/python/pyspark/rddsampler.py
@@ -31,7 +31,7 @@ def __init__(self, withReplacement, seed=None):
                 "Falling back to default random generator for sampling.")
             self._use_numpy = False
 
-        self._seed = seed if seed is not None else random.randint(0, sys.maxint)
+        self._seed = seed if seed is not None else random.randint(0, 2 ** 32 - 1)
         self._withReplacement = withReplacement
         self._random = None
         self._split = None
@@ -47,7 +47,7 @@ def initRandomGenerator(self, split):
         for _ in range(0, split):
             # discard the next few values in the sequence to have a
             # different seed for the different splits
-            self._random.randint(0, sys.maxint)
+            self._random.randint(0, 2 ** 32 - 1)
 
         self._split = split
         self._rand_initialized = True
diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py
index f5ccf31abb3fa..1a8e4150e63c3 100644
--- a/python/pyspark/tests.py
+++ b/python/pyspark/tests.py
@@ -433,6 +433,12 @@ def test_deleting_input_files(self):
             os.unlink(tempFile.name)
         self.assertRaises(Exception, lambda: filtered_data.count())
 
+    def test_sampling_default_seed(self):
+        # Test for SPARK-3995 (default seed setting)
+        data = self.sc.parallelize(range(1000), 1)
+        subset = data.takeSample(False, 10)
+        self.assertEqual(len(subset), 10)
+
     def testAggregateByKey(self):
         data = self.sc.parallelize([(1, 1), (1, 1), (3, 2), (5, 1), (5, 3)],
                                    2)
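
For context, a minimal illustrative sketch (not part of the patch) of the constraint this change works around: numpy.random.RandomState only accepts seeds in [0, 2 ** 32 - 1], while Python's random.Random accepts arbitrarily large integers such as sys.maxint. Recent NumPy releases reject an out-of-range seed with a ValueError; per the commit message, NumPy 1.9 silently truncated it instead.

    # Illustrative sketch, not part of the patch: the seed-range constraint that
    # motivates drawing the default seed from [0, 2 ** 32 - 1].
    import random

    import numpy

    ok_seed = random.randint(0, 2 ** 32 - 1)   # what RDDSampler now draws by default
    numpy.random.RandomState(ok_seed)          # always within NumPy's accepted range

    try:
        numpy.random.RandomState(2 ** 32)      # smallest out-of-range value
    except ValueError as err:
        # Recent NumPy rejects it; per the commit message, NumPy 1.9 instead
        # truncated such seeds, which is the bug this patch avoids.
        print(err)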