
Commit 15a7d07

Remove default args for key/value classes. Arg names to camelCase
1 parent 9fe6bd5 commit 15a7d07

File tree

1 file changed: +45 -35 lines changed


python/pyspark/context.py

Lines changed: 45 additions & 35 deletions
@@ -328,8 +328,14 @@ def wholeTextFiles(self, path, minPartitions=None):
         return RDD(self._jsc.wholeTextFiles(path, minPartitions), self,
                    PairDeserializer(UTF8Deserializer(), UTF8Deserializer()))
 
-    def sequenceFile(self, name, key_class="org.apache.hadoop.io.Text", value_class="org.apache.hadoop.io.Text",
-                     key_wrapper="", value_wrapper="", minSplits=None):
+    def dictToJavaMap(self, d):
+        jm = self._jvm.java.util.HashMap()
+        for k, v in d.iteritems():
+            jm[k] = v
+        return jm
+
+    def sequenceFile(self, path, keyClass, valueClass, keyConverter="", valueConverter="",
+                     minSplits=None):
         """
         Read a Hadoop SequenceFile with arbitrary key and value Writable class from HDFS,
         a local file system (available on all nodes), or any Hadoop-supported file system URI.
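Note: the new dictToJavaMap helper factors out the dict-to-java.util.HashMap conversion that the Hadoop readers below previously inlined. A minimal sketch of the same conversion done by hand, assuming an initialized SparkContext named sc (Python 2, matching the iteritems call; the conf key/value is illustrative only):

    # Copy a Python dict into a java.util.HashMap through the Py4J gateway,
    # which is what sc.dictToJavaMap(conf) does internally.
    conf = {"mapred.input.dir": "/data/input"}   # hypothetical entry
    jm = sc._jvm.java.util.HashMap()
    for k, v in conf.iteritems():
        jm[k] = v        # Py4J forwards this to HashMap.put(k, v)
    assert jm.get("mapred.input.dir") == "/data/input"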
@@ -338,6 +344,13 @@ def sequenceFile(self, name, key_class="org.apache.hadoop.io.Text", value_class=
         2. Serialization is attempted via Pyrolite pickling
         3. If this fails, the fallback is to call 'toString' on each key and value
         4. C{PickleSerializer} is used to deserialize pickled objects on the Python side
+
+        @param path: path to sequence file
+        @param keyClass: fully qualified classname of key Writable class (e.g. "org.apache.hadoop.io.Text")
+        @param valueClass: fully qualified classname of value Writable class
+        @param keyConverter:
+        @param valueConverter:
+        @param minSplits: minimum splits in dataset (default min(2, sc.defaultParallelism))
 
         >>> sorted(sc.sequenceFile(tempdir + "/sftestdata/sfint/").collect())
         [(1, u'aa'), (1, u'aa'), (2, u'aa'), (2, u'bb'), (2, u'bb'), (3, u'cc')]
         >>> sorted(sc.sequenceFile(tempdir + "/sftestdata/sfdouble/").collect())
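With the defaults for keyClass and valueClass removed, callers must now name the Writable classes explicitly; note that the unchanged doctests above still call sc.sequenceFile with only a path, which would raise a TypeError against the new signature unless they are updated as well. A usage sketch with an illustrative path, assuming a running SparkContext sc:

    # keyClass/valueClass are now required, fully qualified Writable class names.
    rdd = sc.sequenceFile("hdfs:///tmp/sftestdata/sfint/",
                          keyClass="org.apache.hadoop.io.IntWritable",
                          valueClass="org.apache.hadoop.io.Text")
    print sorted(rdd.collect())   # e.g. [(1, u'aa'), (2, u'bb'), ...]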
@@ -355,69 +368,66 @@ def sequenceFile(self, name, key_class="org.apache.hadoop.io.Text", value_class=
         True
         """
         minSplits = minSplits or min(self.defaultParallelism, 2)
-        jrdd = self._jvm.PythonRDD.sequenceFile(self._jsc, name, key_class, value_class, key_wrapper, value_wrapper,
-                                                minSplits)
+        jrdd = self._jvm.PythonRDD.sequenceFile(self._jsc, path, keyClass, valueClass,
+                                                keyConverter, valueConverter, minSplits)
         return RDD(jrdd, self, PickleSerializer())
 
-    def newAPIHadoopFile(self, name, inputformat_class, key_class="org.apache.hadoop.io.Text",
-                         value_class="org.apache.hadoop.io.Text", key_wrapper="toString",
-                         value_wrapper="toString", conf={}):
+    def newAPIHadoopFile(self, path, inputFormatClass, keyClass, valueClass,
+                         keyConverter="", valueConverter="", conf={}):
         """
         Read a 'new API' Hadoop InputFormat with arbitrary key and value class from HDFS,
         a local file system (available on all nodes), or any Hadoop-supported file system URI.
         The mechanism is the same as for sc.sequenceFile.
 
-        A Hadoop configuration can be passed in as a Python dict. This will be converted into a Configuration in Java
+        A Hadoop configuration can be passed in as a Python dict. This will be converted into a
+        Configuration in Java
         """
-        jconf = self._jvm.java.util.HashMap()
-        for k, v in conf.iteritems():
-            jconf[k] = v
-        jrdd = self._jvm.PythonRDD.newAPIHadoopFile(self._jsc, name, inputformat_class, key_class, value_class,
-                                                    key_wrapper, value_wrapper, jconf)
+        jconf = self.dictToJavaMap(conf)
+        jrdd = self._jvm.PythonRDD.newAPIHadoopFile(self._jsc, path, inputFormatClass, keyClass,
+                                                    valueClass, keyConverter, valueConverter, jconf)
         return RDD(jrdd, self, PickleSerializer())
 
-    def newAPIHadoopRDD(self, inputformat_class, key_class="org.apache.hadoop.io.Text",
-                        value_class="org.apache.hadoop.io.Text", key_wrapper="", value_wrapper="", conf={}):
+    def newAPIHadoopRDD(self, inputFormatClass, keyClass, valueClass,
+                        keyConverter="", valueConverter="", conf={}):
         """
-        Read a 'new API' Hadoop InputFormat with arbitrary key and value class, from an arbitrary Hadoop configuration,
+        Read a 'new API' Hadoop InputFormat with arbitrary key and value class, from an arbitrary
+        Hadoop configuration,
         which is passed in as a Python dict. This will be converted into a Configuration in Java.
         The mechanism is the same as for sc.sequenceFile.
         """
-        jconf = self._jvm.java.util.HashMap()
-        for k, v in conf.iteritems():
-            jconf[k] = v
-        jrdd = self._jvm.PythonRDD.newAPIHadoopRDD(self._jsc, inputformat_class, key_class, value_class, key_wrapper,
-                                                   value_wrapper, jconf)
+        jconf = self.dictToJavaMap(conf)
+        jrdd = self._jvm.PythonRDD.newAPIHadoopRDD(self._jsc, inputFormatClass, keyClass,
+                                                   valueClass, keyConverter, valueConverter, jconf)
         return RDD(jrdd, self, PickleSerializer())
 
-    def hadoopFile(self, name, inputformat_class, key_class="org.apache.hadoop.io.Text",
-                   value_class="org.apache.hadoop.io.Text", key_wrapper="", value_wrapper="", conf={}):
+    def hadoopFile(self, path, inputFormatClass, keyClass, valueClass,
+                   keyConverter="", valueConverter="", conf={}):
         """
         Read an 'old' Hadoop InputFormat with arbitrary key and value class from HDFS,
         a local file system (available on all nodes), or any Hadoop-supported file system URI.
         The mechanism is the same as for sc.sequenceFile.
 
-        A Hadoop configuration can be passed in as a Python dict. This will be converted into a Configuration in Java
+        A Hadoop configuration can be passed in as a Python dict. This will be converted into a
+        Configuration in Java
         """
-        jconf = self._jvm.java.util.HashMap()
+        jconf = self.dictToJavaMap(conf)
         for k, v in conf.iteritems():
             jconf[k] = v
-        jrdd = self._jvm.PythonRDD.hadoopFile(self._jsc, name, inputformat_class, key_class, value_class, key_wrapper,
-                                              value_wrapper, jconf)
+        jrdd = self._jvm.PythonRDD.hadoopFile(self._jsc, path, inputFormatClass, keyClass,
+                                              valueClass, keyConverter, valueConverter, jconf)
         return RDD(jrdd, self, PickleSerializer())
 
-    def hadoopRDD(self, inputformat_class, key_class="org.apache.hadoop.io.Text",
-                  value_class="org.apache.hadoop.io.Text", key_wrapper="", value_wrapper="", conf={}):
+    def hadoopRDD(self, inputFormatClass, keyClass, valueClass,
+                  keyConverter="", valueConverter="", conf={}):
         """
-        Read an 'old' Hadoop InputFormat with arbitrary key and value class, from an arbitrary Hadoop configuration,
+        Read an 'old' Hadoop InputFormat with arbitrary key and value class, from an arbitrary
+        Hadoop configuration,
         which is passed in as a Python dict. This will be converted into a Configuration in Java.
         The mechanism is the same as for sc.sequenceFile.
         """
-        jconf = self._jvm.java.util.HashMap()
-        for k, v in conf.iteritems():
-            jconf[k] = v
-        jrdd = self._jvm.PythonRDD.hadoopRDD(self._jsc, inputformat_class, key_class, value_class, key_wrapper,
-                                             value_wrapper, jconf)
+        jconf = self.dictToJavaMap(conf)
+        jrdd = self._jvm.PythonRDD.hadoopRDD(self._jsc, inputFormatClass, keyClass, valueClass,
+                                             keyConverter, valueConverter, jconf)
         return RDD(jrdd, self, PickleSerializer())
 
     def _checkpointFile(self, name, input_deserializer):
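For the 'new API' readers, a sketch of typical calls after this commit; the input format, classes, conf keys, and paths below are illustrative, and a running SparkContext sc is assumed:

    # newAPIHadoopFile: a path plus an optional Hadoop conf dict, which is
    # converted to a java map via dictToJavaMap (and, per the docstring,
    # into a Configuration on the Java side).
    conf = {"textinputformat.record.delimiter": "\n"}
    lines = sc.newAPIHadoopFile(
        "hdfs:///data/events",
        inputFormatClass="org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
        keyClass="org.apache.hadoop.io.LongWritable",
        valueClass="org.apache.hadoop.io.Text",
        conf=conf)

    # newAPIHadoopRDD: no path argument; the input location comes entirely
    # from the configuration dict.
    conf = {"mapred.input.dir": "hdfs:///data/events"}
    lines2 = sc.newAPIHadoopRDD(
        inputFormatClass="org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
        keyClass="org.apache.hadoop.io.LongWritable",
        valueClass="org.apache.hadoop.io.Text",
        conf=conf)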

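The 'old API' variants mirror the new-API ones but take org.apache.hadoop.mapred input formats. One detail worth flagging in the diff above: in hadoopFile, the two retained context lines (for k, v in conf.iteritems(): jconf[k] = v) re-insert entries that dictToJavaMap has already copied, so they appear to be leftovers from the replaced HashMap code rather than intentional. A usage sketch with illustrative names, again assuming a running SparkContext sc:

    # hadoopFile: old-API input format, explicit key/value Writable classes.
    logs = sc.hadoopFile(
        "hdfs:///data/logs",
        inputFormatClass="org.apache.hadoop.mapred.TextInputFormat",
        keyClass="org.apache.hadoop.io.LongWritable",
        valueClass="org.apache.hadoop.io.Text")

    # hadoopRDD: no path; the old-API job configuration supplies the input dir.
    logs2 = sc.hadoopRDD(
        inputFormatClass="org.apache.hadoop.mapred.TextInputFormat",
        keyClass="org.apache.hadoop.io.LongWritable",
        valueClass="org.apache.hadoop.io.Text",
        conf={"mapred.input.dir": "hdfs:///data/logs"})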