@@ -20,6 +20,8 @@ package org.apache.spark.api.java
 import java.util
 import java.util.{Map => JMap}
 
+import java.io.DataInputStream
+
 import scala.collection.JavaConversions
 import scala.collection.JavaConversions._
 import scala.language.implicitConversions
@@ -180,6 +182,8 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWorkaround
   def textFile(path: String, minPartitions: Int): JavaRDD[String] =
     sc.textFile(path, minPartitions)
 
+
+
   /**
    * Read a directory of text files from HDFS, a local file system (available on all nodes), or any
    * Hadoop-supported file system URI. Each file is read as a single record and returned in a
@@ -210,6 +214,66 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWorkaround
   def wholeTextFiles(path: String, minPartitions: Int): JavaPairRDD[String, String] =
     new JavaPairRDD(sc.wholeTextFiles(path, minPartitions))
 
+  /**
+   * Read a directory of binary files from HDFS, a local file system (available on all nodes), or any
+   * Hadoop-supported file system URI as DataInputStreams. Each file is read as a single record and returned
+   * in a key-value pair, where the key is the path of each file and the value is a stream over its content.
+   *
+   * <p> For example, if you have the following files:
+   * {{{
+   *   hdfs://a-hdfs-path/part-00000
+   *   hdfs://a-hdfs-path/part-00001
+   *   ...
+   *   hdfs://a-hdfs-path/part-nnnnn
+   * }}}
+   *
+   * Do `JavaPairRDD<String, DataInputStream> rdd = sparkContext.dataStreamFiles("hdfs://a-hdfs-path")`,
+   *
+   * <p> then `rdd` contains
+   * {{{
+   *   (a-hdfs-path/part-00000, its content)
+   *   (a-hdfs-path/part-00001, its content)
+   *   ...
+   *   (a-hdfs-path/part-nnnnn, its content)
+   * }}}
+   *
+   * @note Small files are preferred; large files are also allowable, but they may degrade performance.
+   *
+   * @param minPartitions A suggested minimum number of partitions for the input data.
+   */
+  def dataStreamFiles(path: String, minPartitions: Int = defaultMinPartitions): JavaPairRDD[String, DataInputStream] =
+    new JavaPairRDD(sc.dataStreamFiles(path, minPartitions))
+
+  /**
+   * Read a directory of binary files from HDFS, a local file system (available on all nodes), or any
+   * Hadoop-supported file system URI as byte arrays. Each file is read as a single record and returned
+   * in a key-value pair, where the key is the path of each file and the value is its content as a byte array.
+   *
+   * <p> For example, if you have the following files:
+   * {{{
+   *   hdfs://a-hdfs-path/part-00000
+   *   hdfs://a-hdfs-path/part-00001
+   *   ...
+   *   hdfs://a-hdfs-path/part-nnnnn
+   * }}}
+   *
+   * Do `JavaPairRDD<String, byte[]> rdd = sparkContext.binaryFiles("hdfs://a-hdfs-path")`,
+   *
+   * <p> then `rdd` contains
+   * {{{
+   *   (a-hdfs-path/part-00000, its content)
+   *   (a-hdfs-path/part-00001, its content)
+   *   ...
+   *   (a-hdfs-path/part-nnnnn, its content)
+   * }}}
+   *
+   * @note Small files are preferred; large files are also allowable, but they may degrade performance.
+   *
+   * @param minPartitions A suggested minimum number of partitions for the input data.
+   */
+  def binaryFiles(path: String, minPartitions: Int = defaultMinPartitions): JavaPairRDD[String, Array[Byte]] =
+    new JavaPairRDD(sc.binaryFiles(path, minPartitions))
+
   /**
    * Read a directory of text files from HDFS, a local file system (available on all nodes), or any
    * Hadoop-supported file system URI. Each file is read as a single record and returned in a
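
For reference, a minimal sketch of how the two new methods might be called from Java, assuming the signatures added in this diff (`binaryFiles` returning `JavaPairRDD<String, byte[]>`, `dataStreamFiles` returning `JavaPairRDD<String, DataInputStream>`). The class name, application name, local master, input path, and partition count are illustrative only, and the sketch assumes Java 8 lambdas against Spark's `Function` interface:

import java.io.DataInputStream;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class BinaryFilesSketch {
  public static void main(String[] args) {
    // Illustrative configuration; a real job would usually set the master via spark-submit.
    SparkConf conf = new SparkConf().setAppName("BinaryFilesSketch").setMaster("local[2]");
    JavaSparkContext sc = new JavaSparkContext(conf);

    // Whole-file byte arrays: one (path, byte[]) pair per input file.
    JavaPairRDD<String, byte[]> bytes = sc.binaryFiles("hdfs://a-hdfs-path", 4);

    // Derive each file's size from its materialized content.
    List<Tuple2<String, Integer>> sizes = bytes.mapValues(content -> content.length).collect();
    sizes.forEach(pair -> System.out.println(pair._1() + " -> " + pair._2() + " bytes"));

    // Stream variant: one (path, DataInputStream) pair per input file. DataInputStream is not
    // serializable, so each stream is consumed inside the transformation rather than collected.
    JavaPairRDD<String, DataInputStream> streams = sc.dataStreamFiles("hdfs://a-hdfs-path", 4);
    List<Tuple2<String, Integer>> headers =
        streams.mapValues(in -> in.readInt()).collect();  // read the leading 4 bytes of each file
    headers.forEach(pair -> System.out.println(pair._1() + " -> " + pair._2()));

    sc.stop();
  }
}

The apparent intent of offering both variants is that the byte-array form materializes each whole file in memory, while the stream form lets callers parse large files incrementally, which matches the @note about large files and performance.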