From e22f753ebcd5c7f44f9699481ad26d2263290097 Mon Sep 17 00:00:00 2001 From: Timothy Hopper Date: Wed, 22 Oct 2014 15:56:19 -0400 Subject: [PATCH] Clarify docstring for foreachPartition Due to the underlying use of `mapPartitions`, which requires a function that maps partitions to partitions, `foreachPartition` requires the function passed to it to be a generator function or to return an iterable (although these results are discarded). This is currently not stated in the documentation except through an unexplained example. Stating it explicitly would help users understand that example and avoid wasting time on this error: ``` TypeError: 'NoneType' object is not iterable ``` --- python/pyspark/rdd.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 15be4bfec92f9..7112d41e6693a 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -634,6 +634,11 @@ def processPartition(iterator): def foreachPartition(self, f): """ Applies a function to each partition of this RDD. + + + Note: Due to implementation, f must either return an iterable object + or be a generator function. However, foreachPartition always returns + `None`. >>> def f(iterator): ... for x in iterator: