@@ -328,8 +328,14 @@ def wholeTextFiles(self, path, minPartitions=None):
         return RDD(self._jsc.wholeTextFiles(path, minPartitions), self,
                    PairDeserializer(UTF8Deserializer(), UTF8Deserializer()))

-    def sequenceFile(self, name, key_class="org.apache.hadoop.io.Text", value_class="org.apache.hadoop.io.Text",
-                     key_wrapper="", value_wrapper="", minSplits=None):
+    def dictToJavaMap(self, d):
+        jm = self._jvm.java.util.HashMap()
+        for k, v in d.iteritems():
+            jm[k] = v
+        return jm
+
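
# Usage sketch for the dictToJavaMap helper above, assuming an active
# SparkContext `sc`; the Hadoop property name is illustrative only. The
# dict's entries are copied into a java.util.HashMap on the JVM via Py4J.
jmap = sc.dictToJavaMap({"mapred.input.dir": "/tmp/input"})
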
+    def sequenceFile(self, path, keyClass=None, valueClass=None, keyConverter="",
+                     valueConverter="", minSplits=None):
         """
         Read a Hadoop SequenceFile with arbitrary key and value Writable class from HDFS,
         a local file system (available on all nodes), or any Hadoop-supported file system URI.
@@ -338,6 +344,13 @@ def sequenceFile(self, name, key_class="org.apache.hadoop.io.Text", value_class=
         2. Serialization is attempted via Pyrolite pickling
         3. If this fails, the fallback is to call 'toString' on each key and value
         4. C{PickleSerializer} is used to deserialize pickled objects on the Python side
+
+        @param path: path to sequencefile
+        @param keyClass: fully qualified classname of key Writable class
+               (e.g. "org.apache.hadoop.io.IntWritable")
+        @param valueClass: fully qualified classname of value Writable class
+               (e.g. "org.apache.hadoop.io.Text")
+        @param keyConverter: fully qualified classname of an optional key converter
+        @param valueConverter: fully qualified classname of an optional value converter
+        @param minSplits: minimum splits in dataset
+               (default min(2, sc.defaultParallelism))
         >>> sorted(sc.sequenceFile(tempdir + "/sftestdata/sfint/").collect())
         [(1, u'aa'), (1, u'aa'), (2, u'aa'), (2, u'bb'), (2, u'bb'), (3, u'cc')]
         >>> sorted(sc.sequenceFile(tempdir + "/sftestdata/sfdouble/").collect())
@@ -355,69 +368,66 @@ def sequenceFile(self, name, key_class="org.apache.hadoop.io.Text", value_class=
         True
         """
         minSplits = minSplits or min(self.defaultParallelism, 2)
-        jrdd = self._jvm.PythonRDD.sequenceFile(self._jsc, name, key_class, value_class, key_wrapper, value_wrapper,
-                                                minSplits)
+        jrdd = self._jvm.PythonRDD.sequenceFile(self._jsc, path, keyClass, valueClass,
+                                                keyConverter, valueConverter, minSplits)
         return RDD(jrdd, self, PickleSerializer())

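
# A minimal sketch of the renamed sequenceFile signature, assuming an active
# SparkContext `sc` and a hypothetical SequenceFile of (IntWritable, Text)
# records; keys and values are deserialized to Python via Pyrolite pickling.
rdd = sc.sequenceFile("hdfs:///tmp/sfint",
                      keyClass="org.apache.hadoop.io.IntWritable",
                      valueClass="org.apache.hadoop.io.Text")
rdd.first()  # e.g. (1, u'aa')
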
-    def newAPIHadoopFile(self, name, inputformat_class, key_class="org.apache.hadoop.io.Text",
-                         value_class="org.apache.hadoop.io.Text", key_wrapper="toString",
-                         value_wrapper="toString", conf={}):
+    def newAPIHadoopFile(self, path, inputFormatClass, keyClass, valueClass,
+                         keyConverter="", valueConverter="", conf={}):
         """
         Read a 'new API' Hadoop InputFormat with arbitrary key and value class from HDFS,
         a local file system (available on all nodes), or any Hadoop-supported file system URI.
         The mechanism is the same as for sc.sequenceFile.

-        A Hadoop configuration can be passed in as a Python dict. This will be converted into a Configuration in Java
+        A Hadoop configuration can be passed in as a Python dict. This will be converted into a
+        Configuration in Java.
         """
-        jconf = self._jvm.java.util.HashMap()
-        for k, v in conf.iteritems():
-            jconf[k] = v
-        jrdd = self._jvm.PythonRDD.newAPIHadoopFile(self._jsc, name, inputformat_class, key_class, value_class,
-                                                    key_wrapper, value_wrapper, jconf)
+        jconf = self.dictToJavaMap(conf)
+        jrdd = self._jvm.PythonRDD.newAPIHadoopFile(self._jsc, path, inputFormatClass, keyClass,
+                                                    valueClass, keyConverter, valueConverter, jconf)
         return RDD(jrdd, self, PickleSerializer())

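
# A hedged example of the 'new API' file reader, assuming `sc` and a
# hypothetical input path; the new-API TextInputFormat yields
# (byte offset, line) pairs as (LongWritable, Text).
lines = sc.newAPIHadoopFile(
    "hdfs:///tmp/input.txt",
    "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
    "org.apache.hadoop.io.LongWritable",
    "org.apache.hadoop.io.Text")
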
-    def newAPIHadoopRDD(self, inputformat_class, key_class="org.apache.hadoop.io.Text",
-                        value_class="org.apache.hadoop.io.Text", key_wrapper="", value_wrapper="", conf={}):
+    def newAPIHadoopRDD(self, inputFormatClass, keyClass, valueClass,
+                        keyConverter="", valueConverter="", conf={}):
         """
-        Read a 'new API' Hadoop InputFormat with arbitrary key and value class, from an arbitrary Hadoop configuration,
+        Read a 'new API' Hadoop InputFormat with arbitrary key and value class, from an arbitrary
+        Hadoop configuration,
         which is passed in as a Python dict. This will be converted into a Configuration in Java.
         The mechanism is the same as for sc.sequenceFile.
         """
-        jconf = self._jvm.java.util.HashMap()
-        for k, v in conf.iteritems():
-            jconf[k] = v
-        jrdd = self._jvm.PythonRDD.newAPIHadoopRDD(self._jsc, inputformat_class, key_class, value_class, key_wrapper,
-                                                   value_wrapper, jconf)
+        jconf = self.dictToJavaMap(conf)
+        jrdd = self._jvm.PythonRDD.newAPIHadoopRDD(self._jsc, inputFormatClass, keyClass,
+                                                   valueClass, keyConverter, valueConverter, jconf)
         return RDD(jrdd, self, PickleSerializer())

-    def hadoopFile(self, name, inputformat_class, key_class="org.apache.hadoop.io.Text",
-                   value_class="org.apache.hadoop.io.Text", key_wrapper="", value_wrapper="", conf={}):
+    def hadoopFile(self, path, inputFormatClass, keyClass, valueClass,
+                   keyConverter="", valueConverter="", conf={}):
         """
         Read an 'old' Hadoop InputFormat with arbitrary key and value class from HDFS,
         a local file system (available on all nodes), or any Hadoop-supported file system URI.
         The mechanism is the same as for sc.sequenceFile.

-        A Hadoop configuration can be passed in as a Python dict. This will be converted into a Configuration in Java
+        A Hadoop configuration can be passed in as a Python dict. This will be converted into a
+        Configuration in Java.
         """
-        jconf = self._jvm.java.util.HashMap()
-        for k, v in conf.iteritems():
-            jconf[k] = v
-        jrdd = self._jvm.PythonRDD.hadoopFile(self._jsc, name, inputformat_class, key_class, value_class, key_wrapper,
-                                              value_wrapper, jconf)
+        jconf = self.dictToJavaMap(conf)
+        jrdd = self._jvm.PythonRDD.hadoopFile(self._jsc, path, inputFormatClass, keyClass,
+                                              valueClass, keyConverter, valueConverter, jconf)
         return RDD(jrdd, self, PickleSerializer())

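
# The 'old' (mapred) API variant takes the same argument shape; note the
# org.apache.hadoop.mapred InputFormat class rather than the mapreduce one.
# Assumes `sc` and a hypothetical path.
lines = sc.hadoopFile(
    "hdfs:///tmp/input.txt",
    "org.apache.hadoop.mapred.TextInputFormat",
    "org.apache.hadoop.io.LongWritable",
    "org.apache.hadoop.io.Text")
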
-    def hadoopRDD(self, inputformat_class, key_class="org.apache.hadoop.io.Text",
-                  value_class="org.apache.hadoop.io.Text", key_wrapper="", value_wrapper="", conf={}):
+    def hadoopRDD(self, inputFormatClass, keyClass, valueClass,
+                  keyConverter="", valueConverter="", conf={}):
         """
-        Read an 'old' Hadoop InputFormat with arbitrary key and value class, from an arbitrary Hadoop configuration,
+        Read an 'old' Hadoop InputFormat with arbitrary key and value class, from an arbitrary
+        Hadoop configuration,
         which is passed in as a Python dict. This will be converted into a Configuration in Java.
         The mechanism is the same as for sc.sequenceFile.
         """
-        jconf = self._jvm.java.util.HashMap()
-        for k, v in conf.iteritems():
-            jconf[k] = v
-        jrdd = self._jvm.PythonRDD.hadoopRDD(self._jsc, inputformat_class, key_class, value_class, key_wrapper,
-                                             value_wrapper, jconf)
+        jconf = self.dictToJavaMap(conf)
+        jrdd = self._jvm.PythonRDD.hadoopRDD(self._jsc, inputFormatClass, keyClass, valueClass,
+                                             keyConverter, valueConverter, jconf)
         return RDD(jrdd, self, PickleSerializer())

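
# As with newAPIHadoopRDD, hadoopRDD reads its input location from conf; for
# the old mapred API the conventional key is "mapred.input.dir". A sketch
# assuming `sc` and a hypothetical directory.
rdd = sc.hadoopRDD(
    "org.apache.hadoop.mapred.TextInputFormat",
    "org.apache.hadoop.io.LongWritable",
    "org.apache.hadoop.io.Text",
    conf={"mapred.input.dir": "/tmp/input"})
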
     def _checkpointFile(self, name, input_deserializer):