
Commit 4e08983

Clean up docs for PySpark context methods

1 parent b20ec7e


python/pyspark/context.py

Lines changed: 21 additions & 34 deletions
@@ -297,8 +297,6 @@ def wholeTextFiles(self, path):
         return RDD(self._jsc.wholeTextFiles(path), self,
                    PairDeserializer(UTF8Deserializer(), UTF8Deserializer()))
 
-    ###
-
     def sequenceFile(self, name, key_class="org.apache.hadoop.io.Text", value_class="org.apache.hadoop.io.Text",
                      key_wrapper="", value_wrapper="", minSplits=None):
         """
@@ -308,30 +306,33 @@ def sequenceFile(self, name, key_class="org.apache.hadoop.io.Text", value_class=
         1. A Java RDD is created from the SequenceFile, key and value classes
         2. Serialization is attempted via MsgPack
         3. If this fails, the fallback is to call 'toString' on each key and value
-        4. C{MsgPackDeserializer} is used to deserialize data on the Python side
+        4. C{MsgpackSerializer} is used to deserialize data on the Python side
 
         >>> sc.sequenceFile("test_support/data/sfint/").collect()
         [(1, 'aa'), (2, 'bb'), (2, 'aa'), (3, 'cc'), (2, 'bb'), (1, 'aa')]
         >>> sc.sequenceFile("test_support/data/sfdouble/").collect()
         [(1.0, 'aa'), (2.0, 'bb'), (2.0, 'aa'), (3.0, 'cc'), (2.0, 'bb'), (1.0, 'aa')]
         >>> sc.sequenceFile("test_support/data/sftext/").collect()
         [('1', 'aa'), ('2', 'bb'), ('2', 'aa'), ('3', 'cc'), ('2', 'bb'), ('1', 'aa')]
+        >>> sc.sequenceFile("test_support/data/sfbool/").collect()
+        [(1, True), (2, True), (2, False), (3, True), (2, False), (1, False)]
+        >>> sc.sequenceFile("test_support/data/sfnull/").collect()
+        [(1, None), (2, None), (2, None), (3, None), (2, None), (1, None)]
+        >>> sc.sequenceFile("test_support/data/sfmap/").collect()
+        [(1, {2.0: 'aa'}), (2, {3.0: 'bb'}), (2, {1.0: 'cc'}), (3, {2.0: 'dd'}), (2, {1.0: 'aa'}), (1, {3.0: 'bb'})]
         """
         minSplits = minSplits or min(self.defaultParallelism, 2)
         jrdd = self._jvm.PythonRDD.sequenceFile(self._jsc, name, key_class, value_class, key_wrapper, value_wrapper,
                                                 minSplits)
         return RDD(jrdd, self, MsgpackSerializer())
 
-    def newAPIHadoopFile(self, name, inputformat_class, key_class, value_class, key_wrapper="toString",
+    def newAPIHadoopFile(self, name, inputformat_class, key_class="org.apache.hadoop.io.Text",
+                         value_class="org.apache.hadoop.io.Text", key_wrapper="toString",
                          value_wrapper="toString", conf={}):
         """
         Read a 'new API' Hadoop InputFormat with arbitrary key and value class from HDFS,
         a local file system (available on all nodes), or any Hadoop-supported file system URI.
-        The mechanism is as follows:
-        1. A Java RDD is created from the InputFormat, key and value classes
-        2. Serialization is attempted via MsgPack
-        3. If this fails, the fallback is to call 'toString' on each key and value
-        4. C{MsgPackDeserializer} is used to deserialize data on the Python side
+        The mechanism is the same as for sc.sequenceFile.
 
         A Hadoop configuration can be passed in as a Python dict. This will be converted into a Configuration in Java
         """
@@ -342,16 +343,12 @@ def newAPIHadoopFile(self, name, inputformat_class, key_class, value_class, key_
                                                      key_wrapper, value_wrapper, jconf)
         return RDD(jrdd, self, MsgpackSerializer())
 
-    def newAPIHadoopRDD(self, inputformat_class, key_class, value_class, key_wrapper="toString",
-                        value_wrapper="toString", conf={}):
+    def newAPIHadoopRDD(self, inputformat_class, key_class="org.apache.hadoop.io.Text",
+                        value_class="org.apache.hadoop.io.Text", key_wrapper="", value_wrapper="", conf={}):
         """
         Read a 'new API' Hadoop InputFormat with arbitrary key and value class, from an arbitrary Hadoop configuration,
-        that is passed in as a Python dict. This will be converted into a Configuration in Java.
-        The mechanism is as follows:
-        1. A Java RDD is created from the InputFormat, key and value classes
-        2. Serialization is attempted via MsgPack
-        3. If this fails, the fallback is to call 'toString' on each key and value
-        4. C{MsgPackDeserializer} is used to deserialize data on the Python side
+        which is passed in as a Python dict. This will be converted into a Configuration in Java.
+        The mechanism is the same as for sc.sequenceFile.
         """
         jconf = self._jvm.java.util.HashMap()
         for k, v in conf.iteritems():
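
A minimal sketch of how the conf dict accepted by newAPIHadoopRDD might be used; the input-format class and the configuration key are illustrative assumptions.

    # Hypothetical call: conf is a plain Python dict that the method copies
    # into a java.util.HashMap via the iteritems loop above.
    conf = {"mapreduce.input.fileinputformat.inputdir": "hdfs:///data/in"}
    rdd = sc.newAPIHadoopRDD("org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
                             key_class="org.apache.hadoop.io.LongWritable",
                             value_class="org.apache.hadoop.io.Text",
                             conf=conf)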
@@ -360,16 +357,12 @@ def newAPIHadoopRDD(self, inputformat_class, key_class, value_class, key_wrapper
                                                     value_wrapper, jconf)
         return RDD(jrdd, self, MsgpackSerializer())
 
-    def hadoopFile(self, name, inputformat_class, key_class, value_class, key_wrapper="toString",
-                   value_wrapper="toString", conf={}):
+    def hadoopFile(self, name, inputformat_class, key_class="org.apache.hadoop.io.Text",
+                   value_class="org.apache.hadoop.io.Text", key_wrapper="", value_wrapper="", conf={}):
         """
         Read an 'old' Hadoop InputFormat with arbitrary key and value class from HDFS,
         a local file system (available on all nodes), or any Hadoop-supported file system URI.
-        The mechanism is as follows:
-        1. A Java RDD is created from the InputFormat, key and value classes
-        2. Serialization is attempted via MsgPack
-        3. If this fails, the fallback is to call 'toString' on each key and value
-        4. C{MsgPackDeserializer} is used to deserialize data on the Python side
+        The mechanism is the same as for sc.sequenceFile.
 
         A Hadoop configuration can be passed in as a Python dict. This will be converted into a Configuration in Java
         """
@@ -380,16 +373,12 @@ def hadoopFile(self, name, inputformat_class, key_class, value_class, key_wrappe
                                               value_wrapper, jconf)
         return RDD(jrdd, self, MsgpackSerializer())
 
-    def hadoopRDD(self, inputformat_class, key_class, value_class, key_wrapper="toString",
-                  value_wrapper="toString", conf={}):
+    def hadoopRDD(self, inputformat_class, key_class="org.apache.hadoop.io.Text",
+                  value_class="org.apache.hadoop.io.Text", key_wrapper="", value_wrapper="", conf={}):
         """
         Read an 'old' Hadoop InputFormat with arbitrary key and value class, from an arbitrary Hadoop configuration,
-        that is passed in as a Python dict. This will be converted into a Configuration in Java.
-        The mechanism is as follows:
-        1. A Java RDD is created from the InputFormat, key and value classes
-        2. Serialization is attempted via MsgPack
-        3. If this fails, the fallback is to call 'toString' on each key and value
-        4. C{MsgPackDeserializer} is used to deserialize data on the Python side
+        which is passed in as a Python dict. This will be converted into a Configuration in Java.
+        The mechanism is the same as for sc.sequenceFile.
         """
         jconf = self._jvm.java.util.HashMap()
         for k, v in conf.iteritems():
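
Likewise for hadoopRDD, which takes no path argument, so the input location travels through the configuration dict; all names here are illustrative assumptions.

    # Hypothetical call using the classic mapred configuration key.
    conf = {"mapred.input.dir": "hdfs:///data/in"}
    rdd = sc.hadoopRDD("org.apache.hadoop.mapred.TextInputFormat",
                       key_class="org.apache.hadoop.io.LongWritable",
                       value_class="org.apache.hadoop.io.Text",
                       conf=conf)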
@@ -398,8 +387,6 @@ def hadoopRDD(self, inputformat_class, key_class, value_class, key_wrapper="toSt
                                              value_wrapper, jconf)
         return RDD(jrdd, self, MsgpackSerializer())
 
-    ####
-
     def _checkpointFile(self, name, input_deserializer):
         jrdd = self._jsc.checkpointFile(name)
         return RDD(jrdd, self, input_deserializer)
