@@ -214,56 +214,104 @@ def textFile(self, name, minSplits=None):
                    MUTF8Deserializer())

     ###
-    def sequenceFile(self, name, keyClass="org.apache.hadoop.io.Text", valueClass="org.apache.hadoop.io.Text",
-                     keyWrapper="", valueWrapper="", minSplits=None):
+    def sequenceFile(self, name, key_class="org.apache.hadoop.io.Text", value_class="org.apache.hadoop.io.Text",
+                     key_wrapper="", value_wrapper="", minSplits=None):
         """
-        Read a Hadoopp SequenceFile with arbitrary key and value class from HDFS,
-        a local file system (available on all nodes), or any Hadoop-supported file system URI,
-        and return it as an RDD of (String, String) where the key and value representations
-        are generated using the 'toString()' method of the relevant Java class.
+        Read a Hadoop SequenceFile with arbitrary key and value Writable class from HDFS,
+        a local file system (available on all nodes), or any Hadoop-supported file system URI.
+        The mechanism is as follows:
+            1. A Java RDD is created from the SequenceFile, key and value classes
+            2. Serialization is attempted via MsgPack
+            3. If this fails, the fallback is to call 'toString' on each key and value
+            4. C{MsgPackDeserializer} is used to deserialize data on the Python side

         >>> sc.sequenceFile("test_support/data/sfint/").collect()
         [(1, 'aa'), (2, 'bb'), (2, 'aa'), (3, 'cc'), (2, 'bb'), (1, 'aa')]
+        >>> sc.sequenceFile("test_support/data/sfdouble/").collect()
+        [(1.0, 'aa'), (2.0, 'bb'), (2.0, 'aa'), (3.0, 'cc'), (2.0, 'bb'), (1.0, 'aa')]
         >>> sc.sequenceFile("test_support/data/sftext/").collect()
         [('1', 'aa'), ('2', 'bb'), ('2', 'aa'), ('3', 'cc'), ('2', 'bb'), ('1', 'aa')]
         """
         minSplits = minSplits or min(self.defaultParallelism, 2)
-        jrdd = self._jvm.PythonRDD.sequenceFile(self._jsc, name, keyClass, valueClass, keyWrapper, valueWrapper,
+        jrdd = self._jvm.PythonRDD.sequenceFile(self._jsc, name, key_class, value_class, key_wrapper, value_wrapper,
                                                 minSplits)
-        #jrdd = self._jvm.PythonRDD.sequenceFile(self._jsc, name, keyWrapper, valueWrapper, minSplits)
-        return RDD(jrdd, self, MsgPackDeserializer())  # MsgPackDeserializer PairMUTF8Deserializer
+        return RDD(jrdd, self, MsgPackDeserializer())
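A minimal usage sketch (not part of the diff above): the HDFS path and the IntWritable/Text classes are illustrative assumptions about how the file was written.

# Sketch only: assumes a SequenceFile written with IntWritable keys and Text values.
rdd = sc.sequenceFile("hdfs:///tmp/ratings.seq",
                      key_class="org.apache.hadoop.io.IntWritable",
                      value_class="org.apache.hadoop.io.Text")
print rdd.take(2)  # MsgPack yields native Python types, e.g. [(1, 'aa'), (2, 'bb')]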

-    def newAPIHadoopFile(self, name, inputFormat, keyClass, valueClass, keyWrapper="toString", valueWrapper="toString",
-                         conf={}):
+    def newAPIHadoopFile(self, name, inputformat_class, key_class, value_class, key_wrapper="toString",
+                         value_wrapper="toString", conf={}):
         """
-        Read a Hadoopp file with arbitrary InputFormat, key and value class from HDFS,
-        a local file system (available on all nodes), or any Hadoop-supported file system URI,
-        and return it as an RDD of (String, String), where the key and value representations
-        are generated using the 'toString()' method of the relevant Java class.
+        Read a 'new API' Hadoop InputFormat with arbitrary key and value class from HDFS,
+        a local file system (available on all nodes), or any Hadoop-supported file system URI.
+        The mechanism is as follows:
+            1. A Java RDD is created from the InputFormat, key and value classes
+            2. Serialization is attempted via MsgPack
+            3. If this fails, the fallback is to call 'toString' on each key and value
+            4. C{MsgPackDeserializer} is used to deserialize data on the Python side
+
+        A Hadoop configuration can be passed in as a Python dict; it will be converted into a Configuration in Java.
         """
         jconf = self._jvm.java.util.HashMap()
         for k, v in conf.iteritems():
             jconf[k] = v
-        jrdd = self._jvm.PythonRDD.newAPIHadoopFile(self._jsc, name, inputFormat, keyClass, valueClass, keyWrapper,
-                                                    valueWrapper, jconf)
+        jrdd = self._jvm.PythonRDD.newAPIHadoopFile(self._jsc, name, inputformat_class, key_class, value_class,
+                                                    key_wrapper, value_wrapper, jconf)
         return RDD(jrdd, self, MsgPackDeserializer())
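A minimal usage sketch under the same caveat: TextInputFormat is the stock new-API class, while the path and the buffer-size override are illustrative.

# Sketch only: read plain text files through the 'new API' TextInputFormat.
rdd = sc.newAPIHadoopFile("hdfs:///tmp/logs",
                          "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
                          "org.apache.hadoop.io.LongWritable",
                          "org.apache.hadoop.io.Text",
                          conf={"io.file.buffer.size": "65536"})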

-    def newAPIHadoopRDD(self, inputFormat, keyClass, valueClass, keyWrapper="toString", valueWrapper="toString",
-                        conf={}):
+    def newAPIHadoopRDD(self, inputformat_class, key_class, value_class, key_wrapper="toString",
+                        value_wrapper="toString", conf={}):
         """
-        Read a Hadoopp file with arbitrary InputFormat, key and value class from HDFS,
-        a local file system (available on all nodes), or any Hadoop-supported file system URI,
-        and return it as an RDD of (String, String), where the key and value representations
-        are generated using the 'toString()' method of the relevant Java class.
+        Read a 'new API' Hadoop InputFormat with arbitrary key and value class from an arbitrary Hadoop
+        configuration, which is passed in as a Python dict and converted into a Configuration in Java.
+        The mechanism is as follows:
+            1. A Java RDD is created from the InputFormat, key and value classes
+            2. Serialization is attempted via MsgPack
+            3. If this fails, the fallback is to call 'toString' on each key and value
+            4. C{MsgPackDeserializer} is used to deserialize data on the Python side
         """
         jconf = self._jvm.java.util.HashMap()
         for k, v in conf.iteritems():
             jconf[k] = v
-        jrdd = self._jvm.PythonRDD.newAPIHadoopRDD(self._jsc, inputFormat, keyClass, valueClass, keyWrapper,
-                                                   valueWrapper, jconf)
+        jrdd = self._jvm.PythonRDD.newAPIHadoopRDD(self._jsc, inputformat_class, key_class, value_class, key_wrapper,
+                                                   value_wrapper, jconf)
         return RDD(jrdd, self, MsgPackDeserializer())
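A sketch of the configuration-driven variant: since there is no path argument, the input directory travels in the conf dict. The key shown is the Hadoop 2 new-API name and may differ across Hadoop versions (older releases used "mapred.input.dir").

# Sketch only: the input path is supplied via the Hadoop configuration.
conf = {"mapreduce.input.fileinputformat.inputdir": "hdfs:///tmp/logs"}
rdd = sc.newAPIHadoopRDD("org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
                         "org.apache.hadoop.io.LongWritable",
                         "org.apache.hadoop.io.Text",
                         conf=conf)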

-    ###
+    def hadoopFile(self, name, inputformat_class, key_class, value_class, key_wrapper="toString",
+                   value_wrapper="toString", conf={}):
+        """
+        Read an 'old' Hadoop InputFormat with arbitrary key and value class from HDFS,
+        a local file system (available on all nodes), or any Hadoop-supported file system URI.
+        The mechanism is as follows:
+            1. A Java RDD is created from the InputFormat, key and value classes
+            2. Serialization is attempted via MsgPack
+            3. If this fails, the fallback is to call 'toString' on each key and value
+            4. C{MsgPackDeserializer} is used to deserialize data on the Python side
+
+        A Hadoop configuration can be passed in as a Python dict; it will be converted into a Configuration in Java.
+        """
+        jconf = self._jvm.java.util.HashMap()
+        for k, v in conf.iteritems():
+            jconf[k] = v
+        jrdd = self._jvm.PythonRDD.hadoopFile(self._jsc, name, inputformat_class, key_class, value_class, key_wrapper,
+                                              value_wrapper, jconf)
+        return RDD(jrdd, self, MsgPackDeserializer())
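The old-API sketch mirrors the new-API one; only the InputFormat package changes (org.apache.hadoop.mapred rather than org.apache.hadoop.mapreduce). The path is again illustrative.

# Sketch only: the same read through the 'old' mapred TextInputFormat.
rdd = sc.hadoopFile("hdfs:///tmp/logs",
                    "org.apache.hadoop.mapred.TextInputFormat",
                    "org.apache.hadoop.io.LongWritable",
                    "org.apache.hadoop.io.Text")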
+
+    def hadoopRDD(self, inputformat_class, key_class, value_class, key_wrapper="toString",
+                  value_wrapper="toString", conf={}):
+        """
+        Read an 'old' Hadoop InputFormat with arbitrary key and value class from an arbitrary Hadoop
+        configuration, which is passed in as a Python dict and converted into a Configuration in Java.
+        The mechanism is as follows:
+            1. A Java RDD is created from the InputFormat, key and value classes
+            2. Serialization is attempted via MsgPack
+            3. If this fails, the fallback is to call 'toString' on each key and value
+            4. C{MsgPackDeserializer} is used to deserialize data on the Python side
+        """
+        jconf = self._jvm.java.util.HashMap()
+        for k, v in conf.iteritems():
+            jconf[k] = v
+        jrdd = self._jvm.PythonRDD.hadoopRDD(self._jsc, inputformat_class, key_class, value_class, key_wrapper,
+                                             value_wrapper, jconf)
+        return RDD(jrdd, self, MsgPackDeserializer())
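And the old-API configuration-driven sketch; "mapred.input.dir" is the old-API input-path key, with the path itself illustrative.

# Sketch only: old-API read driven entirely by the conf dict.
conf = {"mapred.input.dir": "hdfs:///tmp/logs"}
rdd = sc.hadoopRDD("org.apache.hadoop.mapred.TextInputFormat",
                   "org.apache.hadoop.io.LongWritable",
                   "org.apache.hadoop.io.Text",
                   conf=conf)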

     def _checkpointFile(self, name, input_deserializer):
         jrdd = self._jsc.checkpointFile(name)