
Commit 4294cbb

Add old Hadoop api methods. Clean up and expand comments. Clean up argument names
1 parent 818a1e6 commit 4294cbb

File tree

1 file changed (+74, -26 lines)


python/pyspark/context.py

Lines changed: 74 additions & 26 deletions
@@ -214,56 +214,104 @@ def textFile(self, name, minSplits=None):
                    MUTF8Deserializer())
 
     ###
-    def sequenceFile(self, name, keyClass="org.apache.hadoop.io.Text", valueClass="org.apache.hadoop.io.Text",
-                     keyWrapper="", valueWrapper="", minSplits=None):
+    def sequenceFile(self, name, key_class="org.apache.hadoop.io.Text", value_class="org.apache.hadoop.io.Text",
+                     key_wrapper="", value_wrapper="", minSplits=None):
         """
-        Read a Hadoopp SequenceFile with arbitrary key and value class from HDFS,
-        a local file system (available on all nodes), or any Hadoop-supported file system URI,
-        and return it as an RDD of (String, String) where the key and value representations
-        are generated using the 'toString()' method of the relevant Java class.
+        Read a Hadoop SequenceFile with arbitrary key and value Writable class from HDFS,
+        a local file system (available on all nodes), or any Hadoop-supported file system URI.
+        The mechanism is as follows:
+            1. A Java RDD is created from the SequenceFile, key and value classes
+            2. Serialization is attempted via MsgPack
+            3. If this fails, the fallback is to call 'toString' on each key and value
+            4. C{MsgPackDeserializer} is used to deserialize data on the Python side
 
         >>> sc.sequenceFile("test_support/data/sfint/").collect()
         [(1, 'aa'), (2, 'bb'), (2, 'aa'), (3, 'cc'), (2, 'bb'), (1, 'aa')]
+        >>> sc.sequenceFile("test_support/data/sfdouble/").collect()
+        [(1.0, 'aa'), (2.0, 'bb'), (2.0, 'aa'), (3.0, 'cc'), (2.0, 'bb'), (1.0, 'aa')]
         >>> sc.sequenceFile("test_support/data/sftext/").collect()
         [('1', 'aa'), ('2', 'bb'), ('2', 'aa'), ('3', 'cc'), ('2', 'bb'), ('1', 'aa')]
         """
         minSplits = minSplits or min(self.defaultParallelism, 2)
-        jrdd = self._jvm.PythonRDD.sequenceFile(self._jsc, name, keyClass, valueClass, keyWrapper, valueWrapper,
+        jrdd = self._jvm.PythonRDD.sequenceFile(self._jsc, name, key_class, value_class, key_wrapper, value_wrapper,
                                                 minSplits)
-        #jrdd = self._jvm.PythonRDD.sequenceFile(self._jsc, name, keyWrapper, valueWrapper, minSplits)
-        return RDD(jrdd, self, MsgPackDeserializer()) # MsgPackDeserializer PairMUTF8Deserializer
+        return RDD(jrdd, self, MsgPackDeserializer())
 
-    def newAPIHadoopFile(self, name, inputFormat, keyClass, valueClass, keyWrapper="toString", valueWrapper="toString",
-                         conf = {}):
+    def newAPIHadoopFile(self, name, inputformat_class, key_class, value_class, key_wrapper="toString",
+                         value_wrapper="toString", conf={}):
         """
-        Read a Hadoopp file with arbitrary InputFormat, key and value class from HDFS,
-        a local file system (available on all nodes), or any Hadoop-supported file system URI,
-        and return it as an RDD of (String, String), where the key and value representations
-        are generated using the 'toString()' method of the relevant Java class.
+        Read a 'new API' Hadoop InputFormat with arbitrary key and value class from HDFS,
+        a local file system (available on all nodes), or any Hadoop-supported file system URI.
+        The mechanism is as follows:
+            1. A Java RDD is created from the InputFormat, key and value classes
+            2. Serialization is attempted via MsgPack
+            3. If this fails, the fallback is to call 'toString' on each key and value
+            4. C{MsgPackDeserializer} is used to deserialize data on the Python side
+
+        A Hadoop configuration can be passed in as a Python dict. This will be converted into a Configuration in Java
         """
         jconf = self._jvm.java.util.HashMap()
         for k, v in conf.iteritems():
             jconf[k] = v
-        jrdd = self._jvm.PythonRDD.newAPIHadoopFile(self._jsc, name, inputFormat, keyClass, valueClass, keyWrapper,
-                                                    valueWrapper, jconf)
+        jrdd = self._jvm.PythonRDD.newAPIHadoopFile(self._jsc, name, inputformat_class, key_class, value_class,
+                                                    key_wrapper, value_wrapper, jconf)
         return RDD(jrdd, self, MsgPackDeserializer())
 
-    def newAPIHadoopRDD(self, inputFormat, keyClass, valueClass, keyWrapper="toString", valueWrapper="toString",
-                        conf = {}):
+    def newAPIHadoopRDD(self, inputformat_class, key_class, value_class, key_wrapper="toString",
+                        value_wrapper="toString", conf={}):
         """
-        Read a Hadoopp file with arbitrary InputFormat, key and value class from HDFS,
-        a local file system (available on all nodes), or any Hadoop-supported file system URI,
-        and return it as an RDD of (String, String), where the key and value representations
-        are generated using the 'toString()' method of the relevant Java class.
+        Read a 'new API' Hadoop InputFormat with arbitrary key and value class, from an arbitrary Hadoop configuration,
+        that is passed in as a Python dict. This will be converted into a Configuration in Java.
+        The mechanism is as follows:
+            1. A Java RDD is created from the InputFormat, key and value classes
+            2. Serialization is attempted via MsgPack
+            3. If this fails, the fallback is to call 'toString' on each key and value
+            4. C{MsgPackDeserializer} is used to deserialize data on the Python side
         """
         jconf = self._jvm.java.util.HashMap()
         for k, v in conf.iteritems():
             jconf[k] = v
-        jrdd = self._jvm.PythonRDD.newAPIHadoopRDD(self._jsc, inputFormat, keyClass, valueClass, keyWrapper,
-                                                   valueWrapper, jconf)
+        jrdd = self._jvm.PythonRDD.newAPIHadoopRDD(self._jsc, inputformat_class, key_class, value_class, key_wrapper,
+                                                   value_wrapper, jconf)
         return RDD(jrdd, self, MsgPackDeserializer())
 
-    ###
+    def hadoopFile(self, name, inputformat_class, key_class, value_class, key_wrapper="toString",
+                   value_wrapper="toString", conf={}):
+        """
+        Read an 'old' Hadoop InputFormat with arbitrary key and value class from HDFS,
+        a local file system (available on all nodes), or any Hadoop-supported file system URI.
+        The mechanism is as follows:
+            1. A Java RDD is created from the InputFormat, key and value classes
+            2. Serialization is attempted via MsgPack
+            3. If this fails, the fallback is to call 'toString' on each key and value
+            4. C{MsgPackDeserializer} is used to deserialize data on the Python side
+
+        A Hadoop configuration can be passed in as a Python dict. This will be converted into a Configuration in Java
+        """
+        jconf = self._jvm.java.util.HashMap()
+        for k, v in conf.iteritems():
+            jconf[k] = v
+        jrdd = self._jvm.PythonRDD.hadoopFile(self._jsc, name, inputformat_class, key_class, value_class, key_wrapper,
+                                              value_wrapper, jconf)
+        return RDD(jrdd, self, MsgPackDeserializer())
+
+    def hadoopRDD(self, inputformat_class, key_class, value_class, key_wrapper="toString",
+                  value_wrapper="toString", conf={}):
+        """
+        Read an 'old' Hadoop InputFormat with arbitrary key and value class, from an arbitrary Hadoop configuration,
+        that is passed in as a Python dict. This will be converted into a Configuration in Java.
+        The mechanism is as follows:
+            1. A Java RDD is created from the InputFormat, key and value classes
+            2. Serialization is attempted via MsgPack
+            3. If this fails, the fallback is to call 'toString' on each key and value
+            4. C{MsgPackDeserializer} is used to deserialize data on the Python side
+        """
+        jconf = self._jvm.java.util.HashMap()
+        for k, v in conf.iteritems():
+            jconf[k] = v
+        jrdd = self._jvm.PythonRDD.hadoopRDD(self._jsc, inputformat_class, key_class, value_class, key_wrapper,
+                                             value_wrapper, jconf)
+        return RDD(jrdd, self, MsgPackDeserializer())
 
     def _checkpointFile(self, name, input_deserializer):
         jrdd = self._jsc.checkpointFile(name)
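
A hedged usage sketch (not part of the commit) of how the path-based readers added above might be called from a PySpark shell: it assumes a running SparkContext sc, and the file paths, InputFormat and Writable class names are illustrative rather than taken from the diff.

# Hedged usage sketch -- paths and class names below are illustrative assumptions.
# Keys and values are deserialized via MsgPack, falling back to toString(),
# as described in the docstrings above.
seq_rdd = sc.sequenceFile("hdfs:///tmp/ints.seq",
                          key_class="org.apache.hadoop.io.IntWritable",
                          value_class="org.apache.hadoop.io.Text")

# 'New API' InputFormat read; extra Hadoop settings go in a Python dict, which the
# method copies into a java.util.HashMap and converts to a Configuration on the JVM side.
new_rdd = sc.newAPIHadoopFile("hdfs:///tmp/events",
                              "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
                              "org.apache.hadoop.io.LongWritable",
                              "org.apache.hadoop.io.Text",
                              conf={"io.compression.codecs":
                                    "org.apache.hadoop.io.compress.GzipCodec"})

Note that the conf dict is walked with iteritems(), so this code path as committed targets Python 2.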

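The configuration-driven variants take no path argument; the input location rides in the conf dict itself. A hedged sketch under the same assumptions, where the property name mapred.input.dir is the conventional old-API input path key and is not something the commit itself specifies:

# Hedged usage sketch -- the property name and path are illustrative assumptions.
# The InputFormat, key and value classes come from the arguments, while the input
# location (and any other Hadoop settings) travel in the configuration dict.
old_rdd = sc.hadoopRDD("org.apache.hadoop.mapred.TextInputFormat",
                       "org.apache.hadoop.io.LongWritable",
                       "org.apache.hadoop.io.Text",
                       conf={"mapred.input.dir": "hdfs:///tmp/logs"})

# newAPIHadoopRDD takes the same argument shape, but expects an InputFormat from the
# org.apache.hadoop.mapreduce package rather than org.apache.hadoop.mapred.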