@@ -1197,7 +1197,7 @@ def take(self, num):
         [91, 92, 93]
         """
         items = []
-        totalParts = self._jrdd.partitions().size()
+        totalParts = self.getNumPartitions()
         partsScanned = 0

         while len(items) < num and partsScanned < totalParts:
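For context, `take()` deliberately avoids running a job over the whole RDD: it scans a few partitions, checks whether enough items came back, and widens the batch on the next pass. A rough sketch of that loop, assuming an active SparkContext reachable as `rdd.context`; the batch-growth heuristic here is illustrative, not the exact one Spark uses:

    # Rough sketch of the incremental scan in take(); the growth factor
    # below is a placeholder for Spark's actual estimation heuristic.
    def take_sketch(rdd, num):
        items = []
        totalParts = rdd.getNumPartitions()
        partsScanned = 0
        while len(items) < num and partsScanned < totalParts:
            # Try one partition first, then widen the window on later passes.
            numPartsToTry = 1 if partsScanned == 0 else partsScanned * 4
            parts = range(partsScanned, min(partsScanned + numPartsToTry, totalParts))
            items += rdd.context.runJob(rdd, lambda it: list(it), parts)
            partsScanned += numPartsToTry
        return items[:num]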
@@ -1260,7 +1260,7 @@ def isEmpty(self):
         >>> sc.parallelize([1]).isEmpty()
         False
         """
-        return self._jrdd.partitions().size() == 0 or len(self.take(1)) == 0
+        return self.getNumPartitions() == 0 or len(self.take(1)) == 0

     def saveAsNewAPIHadoopDataset(self, conf, keyConverter=None, valueConverter=None):
         """
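Both call sites now go through the public `getNumPartitions()` accessor instead of reaching into the wrapped Java RDD. A doctest-style illustration, assuming an active SparkContext `sc` as in the surrounding docstrings:

    >>> rdd = sc.parallelize([1, 2, 3, 4], 2)
    >>> rdd.getNumPartitions()
    2
    >>> sc.parallelize([], 3).isEmpty()
    True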
@@ -2235,11 +2235,9 @@ def _prepare_for_python_RDD(sc, command, obj=None):
22352235 ser = CloudPickleSerializer ()
22362236 pickled_command = ser .dumps ((command , sys .version_info [:2 ]))
22372237 if len (pickled_command ) > (1 << 20 ): # 1M
2238+ # The broadcast will have same life cycle as created PythonRDD
22382239 broadcast = sc .broadcast (pickled_command )
22392240 pickled_command = ser .dumps (broadcast )
2240- # tracking the life cycle by obj
2241- if obj is not None :
2242- obj ._broadcast = broadcast
22432241 broadcast_vars = ListConverter ().convert (
22442242 [x ._jbroadcast for x in sc ._pickled_broadcast_vars ],
22452243 sc ._gateway ._gateway_client )
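With the driver-side bookkeeping removed, an oversized pickled command is still shipped through a broadcast, but nothing on the Python side has to remember to unpersist it; per the new comment, its release is tied to the JVM-side PythonRDD. A minimal sketch of the path taken above the 1 MB threshold, assuming an active SparkContext `sc`:

    from pyspark.serializers import CloudPickleSerializer

    ser = CloudPickleSerializer()
    payload = b"x" * (1 << 21)              # over the 1 MB threshold checked above
    broadcast = sc.broadcast(payload)       # ship the large command out of band
    pickled_command = ser.dumps(broadcast)  # only the small Broadcast handle is inlined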
@@ -2294,12 +2292,9 @@ def pipeline_func(split, iterator):
         self._jrdd_deserializer = self.ctx.serializer
         self._bypass_serializer = False
         self.partitioner = prev.partitioner if self.preservesPartitioning else None
-        self._broadcast = None

-    def __del__(self):
-        if self._broadcast:
-            self._broadcast.unpersist()
-            self._broadcast = None
+    def getNumPartitions(self):
+        return self._prev_jrdd.partitions().size()

     @property
     def _jrdd(self):
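`PipelinedRDD` builds its `_jrdd` lazily (see the property below this hunk), so overriding `getNumPartitions()` to ask the upstream `_prev_jrdd` answers the partition count without forcing that JVM RDD to be created. For example, assuming an active SparkContext `sc`:

    >>> base = sc.parallelize(range(100), 4)
    >>> mapped = base.map(lambda x: x * 2)  # a PipelinedRDD
    >>> mapped.getNumPartitions()           # delegates to _prev_jrdd
    4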