@@ -297,8 +297,6 @@ def wholeTextFiles(self, path):
         return RDD(self._jsc.wholeTextFiles(path), self,
                    PairDeserializer(UTF8Deserializer(), UTF8Deserializer()))

-    ###
-
     def sequenceFile(self, name, key_class="org.apache.hadoop.io.Text", value_class="org.apache.hadoop.io.Text",
                      key_wrapper="", value_wrapper="", minSplits=None):
         """
@@ -308,30 +306,33 @@ def sequenceFile(self, name, key_class="org.apache.hadoop.io.Text", value_class=
         1. A Java RDD is created from the SequenceFile, key and value classes
         2. Serialization is attempted via MsgPack
         3. If this fails, the fallback is to call 'toString' on each key and value
-        4. C{MsgPackDeserializer} is used to deserialize data on the Python side
+        4. C{MsgpackSerializer} is used to deserialize data on the Python side

         >>> sc.sequenceFile("test_support/data/sfint/").collect()
         [(1, 'aa'), (2, 'bb'), (2, 'aa'), (3, 'cc'), (2, 'bb'), (1, 'aa')]
         >>> sc.sequenceFile("test_support/data/sfdouble/").collect()
         [(1.0, 'aa'), (2.0, 'bb'), (2.0, 'aa'), (3.0, 'cc'), (2.0, 'bb'), (1.0, 'aa')]
         >>> sc.sequenceFile("test_support/data/sftext/").collect()
         [('1', 'aa'), ('2', 'bb'), ('2', 'aa'), ('3', 'cc'), ('2', 'bb'), ('1', 'aa')]
+        >>> sc.sequenceFile("test_support/data/sfbool/").collect()
+        [(1, True), (2, True), (2, False), (3, True), (2, False), (1, False)]
+        >>> sc.sequenceFile("test_support/data/sfnull/").collect()
+        [(1, None), (2, None), (2, None), (3, None), (2, None), (1, None)]
+        >>> sc.sequenceFile("test_support/data/sfmap/").collect()
+        [(1, {2.0: 'aa'}), (2, {3.0: 'bb'}), (2, {1.0: 'cc'}), (3, {2.0: 'dd'}), (2, {1.0: 'aa'}), (1, {3.0: 'bb'})]
         """
         minSplits = minSplits or min(self.defaultParallelism, 2)
         jrdd = self._jvm.PythonRDD.sequenceFile(self._jsc, name, key_class, value_class, key_wrapper, value_wrapper,
                                                 minSplits)
         return RDD(jrdd, self, MsgpackSerializer())

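As an illustration of the new signature (not part of this commit), a minimal sketch of spelling out the key and value classes instead of relying on the Text defaults; the path and Writable classes here are hypothetical:

    pairs = sc.sequenceFile("hdfs:///data/events.seq",                # hypothetical path
                            key_class="org.apache.hadoop.io.IntWritable",
                            value_class="org.apache.hadoop.io.Text")
    pairs.first()  # keys arrive as Python ints, values as strings
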
-    def newAPIHadoopFile(self, name, inputformat_class, key_class, value_class, key_wrapper="toString",
+    def newAPIHadoopFile(self, name, inputformat_class, key_class="org.apache.hadoop.io.Text",
+                         value_class="org.apache.hadoop.io.Text", key_wrapper="toString",
                          value_wrapper="toString", conf={}):
         """
         Read a 'new API' Hadoop InputFormat with arbitrary key and value class from HDFS,
         a local file system (available on all nodes), or any Hadoop-supported file system URI.
-        The mechanism is as follows:
-        1. A Java RDD is created from the InputFormat, key and value classes
-        2. Serialization is attempted via MsgPack
-        3. If this fails, the fallback is to call 'toString' on each key and value
-        4. C{MsgPackDeserializer} is used to deserialize data on the Python side
+        The mechanism is the same as for sc.sequenceFile.

         A Hadoop configuration can be passed in as a Python dict. This will be converted into a Configuration in Java
         """
@@ -342,16 +343,12 @@ def newAPIHadoopFile(self, name, inputformat_class, key_class, value_class, key_
                                                     key_wrapper, value_wrapper, jconf)
         return RDD(jrdd, self, MsgpackSerializer())

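A hedged sketch of the new defaults in action, reading plain text through the 'new API' input format and passing a conf dict that the HashMap loop above turns into a Java Configuration; the path and the buffer setting are hypothetical:

    rdd = sc.newAPIHadoopFile(
        "hdfs:///logs/2014-05-01",                                # hypothetical path
        "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",  # note the 'new API' mapreduce package
        key_class="org.apache.hadoop.io.LongWritable",
        value_class="org.apache.hadoop.io.Text",
        conf={"io.file.buffer.size": "65536"})                    # hypothetical Hadoop setting
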
-    def newAPIHadoopRDD(self, inputformat_class, key_class, value_class, key_wrapper="toString",
-                        value_wrapper="toString", conf={}):
+    def newAPIHadoopRDD(self, inputformat_class, key_class="org.apache.hadoop.io.Text",
+                        value_class="org.apache.hadoop.io.Text", key_wrapper="", value_wrapper="", conf={}):
         """
         Read a 'new API' Hadoop InputFormat with arbitrary key and value class, from an arbitrary Hadoop configuration,
-        that is passed in as a Python dict. This will be converted into a Configuration in Java.
-        The mechanism is as follows:
-        1. A Java RDD is created from the InputFormat, key and value classes
-        2. Serialization is attempted via MsgPack
-        3. If this fails, the fallback is to call 'toString' on each key and value
-        4. C{MsgPackDeserializer} is used to deserialize data on the Python side
+        which is passed in as a Python dict. This will be converted into a Configuration in Java.
+        The mechanism is the same as for sc.sequenceFile.
         """
         jconf = self._jvm.java.util.HashMap()
         for k, v in conf.iteritems():
@@ -360,16 +357,12 @@ def newAPIHadoopRDD(self, inputformat_class, key_class, value_class, key_wrapper
                                                    value_wrapper, jconf)
         return RDD(jrdd, self, MsgpackSerializer())

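For the conf-only variant, a sketch where the input path travels entirely through the configuration dict; the path is hypothetical, and the input-directory property name varies across Hadoop versions:

    conf = {"mapred.input.dir": "hdfs:///logs/2014-05-01"}  # hypothetical path; property name depends on Hadoop version
    rdd = sc.newAPIHadoopRDD(
        "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
        key_class="org.apache.hadoop.io.LongWritable",
        value_class="org.apache.hadoop.io.Text",
        conf=conf)
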
-    def hadoopFile(self, name, inputformat_class, key_class, value_class, key_wrapper="toString",
-                   value_wrapper="toString", conf={}):
+    def hadoopFile(self, name, inputformat_class, key_class="org.apache.hadoop.io.Text",
+                   value_class="org.apache.hadoop.io.Text", key_wrapper="", value_wrapper="", conf={}):
         """
         Read an 'old' Hadoop InputFormat with arbitrary key and value class from HDFS,
         a local file system (available on all nodes), or any Hadoop-supported file system URI.
-        The mechanism is as follows:
-        1. A Java RDD is created from the InputFormat, key and value classes
-        2. Serialization is attempted via MsgPack
-        3. If this fails, the fallback is to call 'toString' on each key and value
-        4. C{MsgPackDeserializer} is used to deserialize data on the Python side
+        The mechanism is the same as for sc.sequenceFile.

         A Hadoop configuration can be passed in as a Python dict. This will be converted into a Configuration in Java
         """
@@ -380,16 +373,12 @@ def hadoopFile(self, name, inputformat_class, key_class, value_class, key_wrappe
                                               value_wrapper, jconf)
         return RDD(jrdd, self, MsgpackSerializer())

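A sketch of the 'old API' file variant (hypothetical path); note the mapred rather than mapreduce package, and the wrapper argument, which per the mechanism above would force keys through toString rather than MsgPack:

    rdd = sc.hadoopFile(
        "hdfs:///logs/2014-05-01",                   # hypothetical path
        "org.apache.hadoop.mapred.TextInputFormat",  # 'old API' mapred package
        key_class="org.apache.hadoop.io.LongWritable",
        value_class="org.apache.hadoop.io.Text",
        key_wrapper="toString")                      # fall back to string keys
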
-    def hadoopRDD(self, inputformat_class, key_class, value_class, key_wrapper="toString",
-                  value_wrapper="toString", conf={}):
+    def hadoopRDD(self, inputformat_class, key_class="org.apache.hadoop.io.Text",
+                  value_class="org.apache.hadoop.io.Text", key_wrapper="", value_wrapper="", conf={}):
         """
         Read an 'old' Hadoop InputFormat with arbitrary key and value class, from an arbitrary Hadoop configuration,
-        that is passed in as a Python dict. This will be converted into a Configuration in Java.
-        The mechanism is as follows:
-        1. A Java RDD is created from the InputFormat, key and value classes
-        2. Serialization is attempted via MsgPack
-        3. If this fails, the fallback is to call 'toString' on each key and value
-        4. C{MsgPackDeserializer} is used to deserialize data on the Python side
+        which is passed in as a Python dict. This will be converted into a Configuration in Java.
+        The mechanism is the same as for sc.sequenceFile.
         """
         jconf = self._jvm.java.util.HashMap()
         for k, v in conf.iteritems():
@@ -398,8 +387,6 @@ def hadoopRDD(self, inputformat_class, key_class, value_class, key_wrapper="toSt
                                              value_wrapper, jconf)
         return RDD(jrdd, self, MsgpackSerializer())

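And a sketch of the old-API conf-only variant, mirroring the newAPIHadoopRDD example; again the path is hypothetical:

    conf = {"mapred.input.dir": "hdfs:///logs/2014-05-01"}  # hypothetical path
    rdd = sc.hadoopRDD("org.apache.hadoop.mapred.TextInputFormat",
                       key_class="org.apache.hadoop.io.LongWritable",
                       value_class="org.apache.hadoop.io.Text",
                       conf=conf)
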
-    ####
-
     def _checkpointFile(self, name, input_deserializer):
         jrdd = self._jsc.checkpointFile(name)
         return RDD(jrdd, self, input_deserializer)