python/pyspark/rdd.py: 9 additions, 0 deletions
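Every hunk below makes the same one-line change: a blank line is inserted between a docstring's prose description and its doctest examples, so that the `>>>` block stands apart from the surrounding text.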
@@ -134,6 +134,7 @@ class MaxHeapQ(object):
 
     """
     An implementation of MaxHeap.
+
     >>> import pyspark.rdd
     >>> heap = pyspark.rdd.MaxHeapQ(5)
     >>> [heap.insert(i) for i in range(10)]
@@ -381,6 +382,7 @@ def mapPartitionsWithSplit(self, f, preservesPartitioning=False):
     def getNumPartitions(self):
         """
         Returns the number of partitions in RDD
+
         >>> rdd = sc.parallelize([1, 2, 3, 4], 2)
         >>> rdd.getNumPartitions()
         2
@@ -570,6 +572,7 @@ def sortByKey(self, ascending=True, numPartitions=None, keyfunc=lambda x: x):
         """
         Sorts this RDD, which is assumed to consist of (key, value) pairs.
         # noqa
+
         >>> tmp = [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)]
         >>> sc.parallelize(tmp).sortByKey(True, 2).collect()
         [('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)]
@@ -1205,6 +1208,7 @@ def collectAsMap(self):
     def keys(self):
         """
         Return an RDD with the keys of each tuple.
+
         >>> m = sc.parallelize([(1, 2), (3, 4)]).keys()
         >>> m.collect()
         [1, 3]
@@ -1214,6 +1218,7 @@ def keys(self):
     def values(self):
         """
         Return an RDD with the values of each tuple.
+
         >>> m = sc.parallelize([(1, 2), (3, 4)]).values()
         >>> m.collect()
         [2, 4]
@@ -1638,6 +1643,7 @@ def repartition(self, numPartitions):
         Internally, this uses a shuffle to redistribute data.
         If you are decreasing the number of partitions in this RDD, consider
         using `coalesce`, which can avoid performing a shuffle.
+
         >>> rdd = sc.parallelize([1,2,3,4,5,6,7], 4)
         >>> sorted(rdd.glom().collect())
         [[1], [2, 3], [4, 5], [6, 7]]
@@ -1652,6 +1658,7 @@ def coalesce(self, numPartitions, shuffle=False):
     def coalesce(self, numPartitions, shuffle=False):
         """
         Return a new RDD that is reduced into `numPartitions` partitions.
+
         >>> sc.parallelize([1, 2, 3, 4, 5], 3).glom().collect()
         [[1], [2, 3], [4, 5]]
         >>> sc.parallelize([1, 2, 3, 4, 5], 3).coalesce(1).glom().collect()
@@ -1690,6 +1697,7 @@ def name(self):
     def setName(self, name):
         """
         Assign a name to this RDD.
+
        >>> rdd1 = sc.parallelize([1,2])
        >>> rdd1.setName('RDD1')
        >>> rdd1.name()
@@ -1749,6 +1757,7 @@ class PipelinedRDD(RDD):
 
     """
     Pipelined maps:
+
     >>> rdd = sc.parallelize([1, 2, 3, 4])
     >>> rdd.map(lambda x: 2 * x).cache().map(lambda x: 2 * x).collect()
     [4, 8, 12, 16]
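For context on the convention these hunks enforce: in reStructuredText, a doctest block must be separated from the preceding paragraph by a blank line, or documentation tools such as Sphinx and epydoc fold the `>>>` examples into the prose when rendering the API docs. Below is a minimal, self-contained sketch of the pattern using only the standard library; `double_all` is a hypothetical stand-in for the RDD methods above, since the `sc` fixture in the diff exists only inside PySpark's doctest harness.

import doctest


def double_all(xs):
    """
    Return a new list with every element doubled.

    >>> double_all([1, 2, 3, 4])
    [2, 4, 6, 8]
    """
    return [2 * x for x in xs]


if __name__ == "__main__":
    # testmod() finds and runs the >>> examples embedded in this
    # module's docstrings, much as PySpark's test runner does for
    # rdd.py. doctest itself passes with or without the blank line;
    # the blank line matters only for documentation rendering.
    doctest.testmod()

Running the module directly with `python example.py -v` prints each example as it executes and confirms the expected output.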