@@ -134,6 +134,7 @@ class MaxHeapQ(object):
134134
135135 """
136136 An implementation of MaxHeap.
137+
137138 >>> import pyspark.rdd
138139 >>> heap = pyspark.rdd.MaxHeapQ(5)
139140 >>> [heap.insert(i) for i in range(10)]
@@ -381,6 +382,7 @@ def mapPartitionsWithSplit(self, f, preservesPartitioning=False):
381382 def getNumPartitions (self ):
382383 """
383384 Returns the number of partitions in RDD
385+
384386 >>> rdd = sc.parallelize([1, 2, 3, 4], 2)
385387 >>> rdd.getNumPartitions()
386388 2
@@ -570,6 +572,7 @@ def sortByKey(self, ascending=True, numPartitions=None, keyfunc=lambda x: x):
570572 """
571573 Sorts this RDD, which is assumed to consist of (key, value) pairs.
572574 # noqa
575+
573576 >>> tmp = [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)]
574577 >>> sc.parallelize(tmp).sortByKey(True, 2).collect()
575578 [('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)]
@@ -1209,6 +1212,7 @@ def collectAsMap(self):
12091212 def keys (self ):
12101213 """
12111214 Return an RDD with the keys of each tuple.
1215+
12121216 >>> m = sc.parallelize([(1, 2), (3, 4)]).keys()
12131217 >>> m.collect()
12141218 [1, 3]
@@ -1218,6 +1222,7 @@ def keys(self):
12181222 def values (self ):
12191223 """
12201224 Return an RDD with the values of each tuple.
1225+
12211226 >>> m = sc.parallelize([(1, 2), (3, 4)]).values()
12221227 >>> m.collect()
12231228 [2, 4]
@@ -1642,6 +1647,7 @@ def repartition(self, numPartitions):
16421647 Internally, this uses a shuffle to redistribute data.
16431648 If you are decreasing the number of partitions in this RDD, consider
16441649 using `coalesce`, which can avoid performing a shuffle.
1650+
16451651 >>> rdd = sc.parallelize([1,2,3,4,5,6,7], 4)
16461652 >>> sorted(rdd.glom().collect())
16471653 [[1], [2, 3], [4, 5], [6, 7]]
@@ -1656,6 +1662,7 @@ def repartition(self, numPartitions):
16561662 def coalesce (self , numPartitions , shuffle = False ):
16571663 """
16581664 Return a new RDD that is reduced into `numPartitions` partitions.
1665+
16591666 >>> sc.parallelize([1, 2, 3, 4, 5], 3).glom().collect()
16601667 [[1], [2, 3], [4, 5]]
16611668 >>> sc.parallelize([1, 2, 3, 4, 5], 3).coalesce(1).glom().collect()
@@ -1694,6 +1701,7 @@ def name(self):
16941701 def setName (self , name ):
16951702 """
16961703 Assign a name to this RDD.
1704+
16971705 >>> rdd1 = sc.parallelize([1,2])
16981706 >>> rdd1.setName('RDD1')
16991707 >>> rdd1.name()
@@ -1753,6 +1761,7 @@ class PipelinedRDD(RDD):
17531761
17541762 """
17551763 Pipelined maps:
1764+
17561765 >>> rdd = sc.parallelize([1, 2, 3, 4])
17571766 >>> rdd.map(lambda x: 2 * x).cache().map(lambda x: 2 * x).collect()
17581767 [4, 8, 12, 16]
0 commit comments