add/amend docstring and comments per @holdenk review

aray · aray · commit 12f3ab0bdd0f · 2016-12-08T11:05:34.000-06:00
diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py
@@ -96,6 +96,11 @@ def load_stream(self, stream):
         raise NotImplementedError
 
     def _load_stream_without_unbatching(self, stream):
+        """
+        Return an iterator of deserialized batches (lists) of objects from the input stream.
+        If the serializer does not operate on batches, the default implementation returns an
+        iterator of single-element lists.
+        """
         return map(lambda x: [x], self.load_stream(stream))
 
     # Note: our notion of "equality" is that output generated by
@@ -282,6 +287,8 @@ class CartesianDeserializer(Serializer):
 
     """
     Deserializes the JavaRDD cartesian() of two PythonRDDs.
+    Due to PySpark batching, we cannot simply use the result of the Java RDD cartesian;
+    we additionally need to do the cartesian within each pair of batches.
     """
 
     def __init__(self, key_ser, val_ser):
@@ -292,6 +299,7 @@ def _load_stream_without_unbatching(self, stream):
         key_batch_stream = self.key_ser._load_stream_without_unbatching(stream)
         val_batch_stream = self.val_ser._load_stream_without_unbatching(stream)
         for (key_batch, val_batch) in zip(key_batch_stream, val_batch_stream):
+            # for correctness with repeated cartesian/zip this must be returned as one batch
             yield product(key_batch, val_batch)
 
     def load_stream(self, stream):
@@ -306,6 +314,8 @@ class PairDeserializer(Serializer):
 
     """
     Deserializes the JavaRDD zip() of two PythonRDDs.
+    Due to PySpark batching, we cannot simply use the result of the Java RDD zip;
+    we additionally need to do the zip within each pair of batches.
     """
 
     def __init__(self, key_ser, val_ser):
@@ -319,6 +329,7 @@ def _load_stream_without_unbatching(self, stream):
             if len(key_batch) != len(val_batch):
                 raise ValueError("Can not deserialize PairRDD with different number of items"
                                  " in batches: (%d, %d)" % (len(key_batch), len(val_batch)))
+            # for correctness with repeated cartesian/zip this must be returned as one batch
             yield zip(key_batch, val_batch)
 
     def load_stream(self, stream):