Skip to content

Commit 0703b67

Browse files
committed
Returns an iterator of lists
1 parent 17edfec commit 0703b67

File tree

2 files changed

+19
-4
lines changed

2 files changed

+19
-4
lines changed

python/pyspark/serializers.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -316,8 +316,9 @@ def _load_stream_without_unbatching(self, stream):
316316
key_batch_stream = self.key_ser._load_stream_without_unbatching(stream)
317317
val_batch_stream = self.val_ser._load_stream_without_unbatching(stream)
318318
for (key_batch, val_batch) in zip(key_batch_stream, val_batch_stream):
319-
# for correctness with repeated cartesian/zip this must be returned as one batch
320-
yield product(key_batch, val_batch)
319+
# for correctness with repeated cartesian/zip this must be returned as
320+
# one batch (a list)
321+
yield list(product(key_batch, val_batch))
321322

322323
def load_stream(self, stream):
323324
return chain.from_iterable(self._load_stream_without_unbatching(stream))
@@ -346,8 +347,9 @@ def _load_stream_without_unbatching(self, stream):
346347
if len(key_batch) != len(val_batch):
347348
raise ValueError("Can not deserialize PairRDD with different number of items"
348349
" in batches: (%d, %d)" % (len(key_batch), len(val_batch)))
349-
# for correctness with repeated cartesian/zip this must be returned as one batch
350-
yield zip(key_batch, val_batch)
350+
# for correctness with repeated cartesian/zip this must be returned as
351+
# one batch (a list)
352+
yield list(zip(key_batch, val_batch))
351353

352354
def load_stream(self, stream):
353355
return chain.from_iterable(self._load_stream_without_unbatching(stream))

python/pyspark/tests.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -644,6 +644,19 @@ def test_cartesian_chaining(self):
644644
set([(x, (y, y)) for x in range(10) for y in range(10)])
645645
)
646646

647+
def test_zip_chaining(self):
648+
# Tests for SPARK-21985
649+
rdd = self.sc.parallelize(range(3), 2)
650+
self.assertSetEqual(
651+
set(rdd.zip(rdd).zip(rdd).collect()),
652+
set(zip(zip(range(3), range(3)), range(3)))
653+
)
654+
655+
self.assertSetEqual(
656+
set(rdd.zip(rdd.zip(rdd)).collect()),
657+
set(zip(range(3), zip(range(3), range(3))))
658+
)
659+
647660
def test_deleting_input_files(self):
648661
# Regression test for SPARK-1025
649662
tempFile = tempfile.NamedTemporaryFile(delete=False)

0 commit comments

Comments (0)