
Commit 05be704

Merge pull request apache#505 from JoshRosen/SPARK-1026
Deprecate mapPartitionsWithSplit in PySpark (SPARK-1026)

This commit deprecates `mapPartitionsWithSplit` in PySpark (see [SPARK-1026](https://spark-project.atlassian.net/browse/SPARK-1026)) and removes the remaining references to it from the docs.
2 parents 3d6e754 + 4cebb79 commit 05be704
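
For callers this is a pure rename: the callback signature is unchanged, and the old name now forwards to the new one after emitting a warning. A minimal migration sketch (hypothetical, assuming an active SparkContext bound to sc):

    rdd = sc.parallelize([1, 2, 3, 4], 2)

    def f(splitIndex, iterator):
        # First argument: the partition index; second: an iterator
        # over that partition's elements.
        yield splitIndex

    rdd.mapPartitionsWithSplit(f).sum()  # old name, now emits a DeprecationWarning
    rdd.mapPartitionsWithIndex(f).sum()  # new name, identical result (1 here: 0 + 1)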

2 files changed, 23 insertions(+), 6 deletions(-)


docs/scala-programming-guide.md

Lines changed: 2 additions & 2 deletions
@@ -168,9 +168,9 @@ The following tables list the transformations and actions currently supported (s
   Iterator[T] => Iterator[U] when running on an RDD of type T. </td>
 </tr>
 <tr>
-  <td> <b>mapPartitionsWithSplit</b>(<i>func</i>) </td>
+  <td> <b>mapPartitionsWithIndex</b>(<i>func</i>) </td>
   <td> Similar to mapPartitions, but also provides <i>func</i> with an integer value representing the index of
-  the split, so <i>func</i> must be of type (Int, Iterator[T]) => Iterator[U] when running on an RDD of type T.
+  the partition, so <i>func</i> must be of type (Int, Iterator[T]) => Iterator[U] when running on an RDD of type T.
   </td>
 </tr>
 <tr>

python/pyspark/rdd.py

Lines changed: 21 additions & 4 deletions
@@ -27,6 +27,7 @@
 from subprocess import Popen, PIPE
 from tempfile import NamedTemporaryFile
 from threading import Thread
+import warnings

 from pyspark.serializers import NoOpSerializer, CartesianDeserializer, \
     BatchedSerializer, CloudPickleSerializer, pack_long
@@ -179,7 +180,7 @@ def flatMap(self, f, preservesPartitioning=False):
         [(2, 2), (2, 2), (3, 3), (3, 3), (4, 4), (4, 4)]
         """
         def func(s, iterator): return chain.from_iterable(imap(f, iterator))
-        return self.mapPartitionsWithSplit(func, preservesPartitioning)
+        return self.mapPartitionsWithIndex(func, preservesPartitioning)

     def mapPartitions(self, f, preservesPartitioning=False):
         """
@@ -191,10 +192,24 @@ def mapPartitions(self, f, preservesPartitioning=False):
         [3, 7]
         """
         def func(s, iterator): return f(iterator)
-        return self.mapPartitionsWithSplit(func)
+        return self.mapPartitionsWithIndex(func)
+
+    def mapPartitionsWithIndex(self, f, preservesPartitioning=False):
+        """
+        Return a new RDD by applying a function to each partition of this RDD,
+        while tracking the index of the original partition.
+
+        >>> rdd = sc.parallelize([1, 2, 3, 4], 4)
+        >>> def f(splitIndex, iterator): yield splitIndex
+        >>> rdd.mapPartitionsWithIndex(f).sum()
+        6
+        """
+        return PipelinedRDD(self, f, preservesPartitioning)

     def mapPartitionsWithSplit(self, f, preservesPartitioning=False):
         """
+        Deprecated: use mapPartitionsWithIndex instead.
+
         Return a new RDD by applying a function to each partition of this RDD,
         while tracking the index of the original partition.

@@ -203,7 +218,9 @@ def mapPartitionsWithSplit(self, f, preservesPartitioning=False):
         >>> rdd.mapPartitionsWithSplit(f).sum()
         6
         """
-        return PipelinedRDD(self, f, preservesPartitioning)
+        warnings.warn("mapPartitionsWithSplit is deprecated; "
+            "use mapPartitionsWithIndex instead", DeprecationWarning, stacklevel=2)
+        return self.mapPartitionsWithIndex(f, preservesPartitioning)

     def filter(self, f):
         """
@@ -235,7 +252,7 @@ def sample(self, withReplacement, fraction, seed):
         >>> sc.parallelize(range(0, 100)).sample(False, 0.1, 2).collect() #doctest: +SKIP
         [2, 3, 20, 21, 24, 41, 42, 66, 67, 89, 90, 98]
         """
-        return self.mapPartitionsWithSplit(RDDSampler(withReplacement, fraction, seed).func, True)
+        return self.mapPartitionsWithIndex(RDDSampler(withReplacement, fraction, seed).func, True)

     # this is ported from scala/spark/RDD.scala
     def takeSample(self, withReplacement, num, seed):
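
Note that warnings.warn(..., stacklevel=2) attributes the warning to the caller's line rather than to rdd.py itself, which makes the deprecated call easier to locate in user code. To verify the deprecation path in a test, a sketch using only the standard library (sc assumed again):

    import warnings

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")  # ensure the warning is not filtered out
        sc.parallelize([1, 2]).mapPartitionsWithSplit(lambda i, it: it)
        assert any(issubclass(w.category, DeprecationWarning) for w in caught)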
