Closed
Commits
29 commits
4a7eef4
Support nested directories in Spark Streaming
wangxiaojing Oct 11, 2014
50ad7d4
change Nit
wangxiaojing Oct 12, 2014
c14def1
support depth
wangxiaojing Oct 17, 2014
bfbec51
Change space before brace
wangxiaojing Oct 17, 2014
0a8ecf8
change process any files created in nested directories
wangxiaojing Oct 17, 2014
1ce623d
reformat code
wangxiaojing Oct 24, 2014
fe6e5ca
add a require(depth >= 0)
wangxiaojing Oct 24, 2014
7031940
reformat code
wangxiaojing Oct 24, 2014
7bd4811
change performance
wangxiaojing Oct 28, 2014
05b5fba
change filter name
wangxiaojing Oct 28, 2014
e66b166
change line exceeds 100 columns
wangxiaojing Nov 3, 2014
a63c5a3
line over 100
wangxiaojing Nov 10, 2014
0b4812e
remove line
wangxiaojing Dec 4, 2014
8990c35
style
wangxiaojing Dec 4, 2014
a20743f
change get depth
wangxiaojing Dec 17, 2014
d7f4880
Use 'isDir' to modify the compatibility
wangxiaojing Dec 24, 2014
99b05d6
rebase
wangxiaojing Jan 7, 2015
b6788a3
support java Api
wangxiaojing Jan 12, 2015
d2f606c
Add support python api
wangxiaojing Jan 12, 2015
8e3a054
Additional excludes for checking of Spark's binary compatibility
wangxiaojing Jan 13, 2015
571730a
rebase
wangxiaojing Jan 30, 2015
a4bfac2
rebase
wangxiaojing Feb 27, 2015
5e3fd3d
change API
wangxiaojing Feb 28, 2015
e4b9c22
change MiMa failures
wangxiaojing Mar 2, 2015
1a2aae9
rebase
wangxiaojing May 20, 2015
0a916cc
Merge branch 'master' into pr2765
zsxwing Jun 2, 2015
26dce26
Address comments
zsxwing Jun 2, 2015
2d85159
Refactor PR and handle some corner cases
zsxwing Jun 2, 2015
d7f42c2
Merge branch 'master' into pr2765
zsxwing Jul 1, 2015
3 changes: 1 addition & 2 deletions docs/streaming-programming-guide.md
@@ -653,8 +653,7 @@ methods for creating DStreams from files and Akka actors as input sources.
</div>
</div>

Spark Streaming will monitor the directory `dataDirectory` and process any files created in that directory (files written in nested directories not supported). Note that

Spark Streaming will monitor the directory `dataDirectory` and process any files created in that directory. Nested directories are not searched by default; set the optional `depth` parameter to a value greater than 1 to also monitor files in subdirectories. Note that
+ The files must have the same data format.
+ The files must be created in the `dataDirectory` by atomically *moving* or *renaming* them into
the data directory.
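For illustration, a minimal sketch of what the documented behaviour looks like from the Scala API; the application name, batch interval, and the `/data/logs` layout are made-up placeholders:

```scala
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object NestedDirExample {
  def main(args: Array[String]): Unit = {
    // Hypothetical local setup; any master URL and batch interval would do.
    val conf = new SparkConf().setAppName("NestedDirExample").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(10))

    // With depth = 3, files that land in /data/logs or up to two levels of
    // subdirectories below it (e.g. /data/logs/2015/01/events.log) are picked up.
    val lines = ssc.textFileStream("/data/logs", 3)
    lines.count().print()

    ssc.start()
    ssc.awaitTermination()
  }
}
```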
8 changes: 6 additions & 2 deletions python/pyspark/streaming/context.py
@@ -253,14 +253,18 @@ def socketTextStream(self, hostname, port, storageLevel=StorageLevel.MEMORY_AND_
return DStream(self._jssc.socketTextStream(hostname, port, jlevel), self,
UTF8Deserializer())

def textFileStream(self, directory):
def textFileStream(self, directory, depth=1):
"""
Create an input stream that monitors a Hadoop-compatible file system
for new files and reads them as text files. Files must be written to the
monitored directory by "moving" them from another location within the same
file system. File names starting with . are ignored.

@param directory: The directory to monitor
@param depth: The maximum depth to search in the directory. The default
value 1 means that only files directly in the given directory are searched.
"""
return DStream(self._jssc.textFileStream(directory), self, UTF8Deserializer())
return DStream(self._jssc.textFileStream(directory, depth), self, UTF8Deserializer())

def binaryRecordsStream(self, directory, recordLength):
"""
@@ -386,6 +386,27 @@ class StreamingContext private[streaming] (
new FileInputDStream[K, V, F](this, directory)
}

/**
* Create an input stream that monitors a Hadoop-compatible filesystem
* for new files and reads them using the given key-value types and input format.
* Files must be written to the monitored directory by "moving" them from another
* location within the same file system. File names starting with . are ignored.
* It can also monitor files in subdirectories by setting the optional `depth`
* parameter to a value greater than 1.
* @param directory HDFS directory to monitor for new files
* @param depth Search depth within the directory; 1 means only the given directory is searched
* @tparam K Key type for reading HDFS file
* @tparam V Value type for reading HDFS file
* @tparam F Input format for reading HDFS file
*/
def fileStream[
K: ClassTag,
V: ClassTag,
F <: NewInputFormat[K, V]: ClassTag
] (directory: String, depth: Int): InputDStream[(K, V)] = {
new FileInputDStream[K, V, F](this, directory, depth)
}
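A rough usage sketch for this overload; the path is a placeholder and an existing `ssc: StreamingContext` is assumed:

```scala
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat

// Search "/data/in" and its immediate subdirectories (depth = 2) for new files,
// reading them with the usual text key-value types.
val stream = ssc.fileStream[LongWritable, Text, TextInputFormat]("/data/in", depth = 2)
stream.map { case (_, line) => line.toString }.print()
```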

/**
* Create a input stream that monitors a Hadoop-compatible filesystem
* for new files and reads them using the given key-value types and input format.
@@ -403,7 +424,33 @@ class StreamingContext private[streaming] (
V: ClassTag,
F <: NewInputFormat[K, V]: ClassTag
] (directory: String, filter: Path => Boolean, newFilesOnly: Boolean): InputDStream[(K, V)] = {
new FileInputDStream[K, V, F](this, directory, filter, newFilesOnly)
new FileInputDStream[K, V, F](this, directory, 1, filter, newFilesOnly)
}

/**
* Create an input stream that monitors a Hadoop-compatible filesystem
* for new files and reads them using the given key-value types and input format.
* Files must be written to the monitored directory by "moving" them from another
* location within the same file system.
* It can also monitor files in subdirectories by setting the optional `depth`
* parameter to a value greater than 1.
* @param directory HDFS directory to monitor for new files
* @param filter Function to filter paths to process
* @param newFilesOnly Should process only new files and ignore existing files in the directory
* @param depth Search depth within the directory; 1 means only the given directory is searched
* @tparam K Key type for reading HDFS file
* @tparam V Value type for reading HDFS file
* @tparam F Input format for reading HDFS file
*/
def fileStream[
K: ClassTag,
V: ClassTag,
F <: NewInputFormat[K, V]: ClassTag
] (directory: String,
filter: Path => Boolean,
newFilesOnly: Boolean,
depth: Int): InputDStream[(K, V)] = {
new FileInputDStream[K, V, F](this, directory, depth, filter, newFilesOnly)
}
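A sketch of calling the filtered variant; the `.tmp` exclusion and the path are illustrative only, and an existing `ssc: StreamingContext` is assumed:

```scala
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat

// Ignore in-progress files (anything ending in ".tmp") while scanning
// "/data/in" and one level of subdirectories.
val filtered = ssc.fileStream[LongWritable, Text, TextInputFormat](
  "/data/in",
  (path: Path) => !path.getName.endsWith(".tmp"),
  newFilesOnly = true,
  depth = 2)
filtered.map(_._2.toString).print()
```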

/**
@@ -427,7 +474,35 @@ class StreamingContext private[streaming] (
filter: Path => Boolean,
newFilesOnly: Boolean,
conf: Configuration): InputDStream[(K, V)] = {
new FileInputDStream[K, V, F](this, directory, filter, newFilesOnly, Option(conf))
new FileInputDStream[K, V, F](this, directory, 1, filter, newFilesOnly, Option(conf))
}

/**
* Create an input stream that monitors a Hadoop-compatible filesystem
* for new files and reads them using the given key-value types and input format.
* Files must be written to the monitored directory by "moving" them from another
* location within the same file system. File names starting with . are ignored.
* It can also monitor files in subdirectories by setting the optional `depth`
* parameter to a value greater than 1.
* @param directory HDFS directory to monitor for new files
* @param filter Function to filter paths to process
* @param newFilesOnly Should process only new files and ignore existing files in the directory
* @param conf Hadoop configuration
* @param depth Search depth within the directory; 1 means only the given directory is searched
* @tparam K Key type for reading HDFS file
* @tparam V Value type for reading HDFS file
* @tparam F Input format for reading HDFS file
*/
def fileStream[
K: ClassTag,
V: ClassTag,
F <: NewInputFormat[K, V]: ClassTag
] (directory: String,
filter: Path => Boolean,
newFilesOnly: Boolean,
conf: Configuration,
depth: Int): InputDStream[(K, V)] = {
new FileInputDStream[K, V, F](this, directory, depth, filter, newFilesOnly, Option(conf))
}

/**
@@ -442,6 +517,23 @@ class StreamingContext private[streaming] (
fileStream[LongWritable, Text, TextInputFormat](directory).map(_._2.toString)
}

/**
* Create an input stream that monitors a Hadoop-compatible filesystem
* for new files and reads them as text files (using key as LongWritable, value
* as Text and input format as TextInputFormat). Files must be written to the
* monitored directory by "moving" them from another location within the same
* file system. File names starting with . are ignored.
* It can also monitor files in subdirectories by setting the optional `depth`
* parameter to a value greater than 1.
* @param directory HDFS directory to monitor for new files
* @param depth Search depth within the directory; 1 means only the given directory is searched
*/
def textFileStream(
directory: String,
depth: Int): DStream[String] = withNamedScope("text file stream") {
fileStream[LongWritable, Text, TextInputFormat](directory, depth).map(_._2.toString)
}

/**
* :: Experimental ::
*
@@ -450,21 +542,25 @@ class StreamingContext private[streaming] (
* generating one byte array per record. Files must be written to the monitored directory
* by "moving" them from another location within the same file system. File names
* starting with . are ignored.
* It can also monitor files in subdirectories by setting the optional `depth`
* parameter to a value greater than 1.
*
* '''Note:''' We ensure that the byte array for each record in the
* resulting RDDs of the DStream has the provided record length.
*
* @param directory HDFS directory to monitor for new file
* @param recordLength length of each record in bytes
* @param depth Search depth within the directory; 1 means only the given directory is searched
*/
@Experimental
def binaryRecordsStream(
directory: String,
recordLength: Int): DStream[Array[Byte]] = withNamedScope("binary records stream") {
recordLength: Int,
depth: Int): DStream[Array[Byte]] = withNamedScope("binary records stream") {
val conf = sc_.hadoopConfiguration
conf.setInt(FixedLengthBinaryInputFormat.RECORD_LENGTH_PROPERTY, recordLength)
val br = fileStream[LongWritable, BytesWritable, FixedLengthBinaryInputFormat](
directory, FileInputDStream.defaultFilter: Path => Boolean, newFilesOnly = true, conf)
directory, FileInputDStream.defaultFilter: Path => Boolean, newFilesOnly = true, conf, depth)
val data = br.map { case (k, v) =>
val bytes = v.getBytes
require(bytes.length == recordLength, "Byte array does not have correct length. " +
@@ -474,6 +570,28 @@ class StreamingContext private[streaming] (
data
}

/**
* :: Experimental ::
*
* Create an input stream that monitors a Hadoop-compatible filesystem
* for new files and reads them as flat binary files, assuming a fixed length per record,
* generating one byte array per record. Files must be written to the monitored directory
* by "moving" them from another location within the same file system. File names
* starting with . are ignored.
*
* '''Note:''' We ensure that the byte array for each record in the
* resulting RDDs of the DStream has the provided record length.
*
* @param directory HDFS directory to monitor for new file
* @param recordLength length of each record in bytes
*/
@Experimental
def binaryRecordsStream(
directory: String,
recordLength: Int): DStream[Array[Byte]] = withNamedScope("binary records stream") {
binaryRecordsStream(directory, recordLength, 1)
}
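For completeness, a sketch of the extended `binaryRecordsStream`; the 512-byte record length and the directory are only examples, with an existing `ssc: StreamingContext` assumed:

```scala
// Every new file under "/data/records" and its immediate subdirectories
// (depth = 2) is split into fixed 512-byte records, one byte array per record.
val records = ssc.binaryRecordsStream("/data/records", recordLength = 512, depth = 2)
records.map(_.length).print()
```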

/**
* Create an input stream from a queue of RDDs. In each batch,
* it will process either one or all of the RDDs returned by the queue.
@@ -214,6 +214,15 @@ class JavaStreamingContext(val ssc: StreamingContext) extends Closeable {
ssc.textFileStream(directory)
}

/**
* Create an input stream that monitors a Hadoop-compatible filesystem for new files,
* including files in nested subdirectories, and reads them as text files.
* @param directory HDFS directory to monitor for new files
* @param depth Search depth within the directory; 1 means only the given directory is searched
*/
def textFileStream(directory: String, depth: Int): JavaDStream[String] = {
ssc.textFileStream(directory, depth)
}
/**
* :: Experimental ::
*
@@ -292,6 +301,34 @@ class JavaStreamingContext(val ssc: StreamingContext) extends Closeable {
ssc.fileStream[K, V, F](directory)
}

/**
* Create an input stream that monitors a Hadoop-compatible filesystem
* for new files and reads them using the given key-value types and input format.
* Files must be written to the monitored directory by "moving" them from another
* location within the same file system. File names starting with . are ignored.
* It can also monitor files in subdirectories by setting the optional `depth`
* parameter to a value greater than 1.
* @param directory HDFS directory to monitor for new files
* @param depth Search depth within the directory; 1 means only the given directory is searched
* @param kClass class of key for reading HDFS file
* @param vClass class of value for reading HDFS file
* @param fClass class of input format for reading HDFS file
* @tparam K Key type for reading HDFS file
* @tparam V Value type for reading HDFS file
* @tparam F Input format for reading HDFS file
*/
def fileStream[K, V, F <: NewInputFormat[K, V]](
directory: String,
depth: Int,
kClass: Class[K],
vClass: Class[V],
fClass: Class[F]): JavaPairInputDStream[K, V] = {
implicit val cmk: ClassTag[K] = ClassTag(kClass)
implicit val cmv: ClassTag[V] = ClassTag(vClass)
implicit val cmf: ClassTag[F] = ClassTag(fClass)
ssc.fileStream[K, V, F](directory, depth)
}

/**
* Create an input stream that monitors a Hadoop-compatible filesystem
* for new files and reads them using the given key-value types and input format.
@@ -321,6 +358,39 @@ class JavaStreamingContext(val ssc: StreamingContext) extends Closeable {
ssc.fileStream[K, V, F](directory, fn, newFilesOnly)
}

/**
* Create an input stream that monitors a Hadoop-compatible filesystem
* for new files and reads them using the given key-value types and input format.
* Files must be written to the monitored directory by "moving" them from another
* location within the same file system. File names starting with . are ignored.
* It can also monitor files in subdirectories by setting the optional `depth`
* parameter to a value greater than 1.
* @param directory HDFS directory to monitor for new files
* @param kClass class of key for reading HDFS file
* @param vClass class of value for reading HDFS file
* @param fClass class of input format for reading HDFS file
* @param filter Function to filter paths to process
* @param newFilesOnly Should process only new files and ignore existing files in the directory
* @param depth Search depth within the directory; 1 means only the given directory is searched
* @tparam K Key type for reading HDFS file
* @tparam V Value type for reading HDFS file
* @tparam F Input format for reading HDFS file
*/
def fileStream[K, V, F <: NewInputFormat[K, V]](
directory: String,
kClass: Class[K],
vClass: Class[V],
fClass: Class[F],
filter: JFunction[Path, JBoolean],
newFilesOnly: Boolean,
depth: Int): JavaPairInputDStream[K, V] = {
implicit val cmk: ClassTag[K] = ClassTag(kClass)
implicit val cmv: ClassTag[V] = ClassTag(vClass)
implicit val cmf: ClassTag[F] = ClassTag(fClass)
def fn: (Path) => Boolean = (x: Path) => filter.call(x).booleanValue()
ssc.fileStream[K, V, F](directory, fn, newFilesOnly, depth)
}

/**
* Create an input stream that monitors a Hadoop-compatible filesystem
* for new files and reads them using the given key-value types and input format.
@@ -352,6 +422,41 @@ class JavaStreamingContext(val ssc: StreamingContext) extends Closeable {
ssc.fileStream[K, V, F](directory, fn, newFilesOnly, conf)
}

/**
* Create an input stream that monitors a Hadoop-compatible filesystem
* for new files and reads them using the given key-value types and input format.
* Files must be written to the monitored directory by "moving" them from another
* location within the same file system. File names starting with . are ignored.
* It can also monitor files in subdirectories by setting the optional `depth`
* parameter to a value greater than 1.
* @param directory HDFS directory to monitor for new files
* @param kClass class of key for reading HDFS file
* @param vClass class of value for reading HDFS file
* @param fClass class of input format for reading HDFS file
* @param filter Function to filter paths to process
* @param newFilesOnly Should process only new files and ignore existing files in the directory
* @param conf Hadoop configuration
* @param depth Search depth within the directory; 1 means only the given directory is searched
* @tparam K Key type for reading HDFS file
* @tparam V Value type for reading HDFS file
* @tparam F Input format for reading HDFS file
*/
def fileStream[K, V, F <: NewInputFormat[K, V]](
directory: String,
kClass: Class[K],
vClass: Class[V],
fClass: Class[F],
filter: JFunction[Path, JBoolean],
newFilesOnly: Boolean,
conf: Configuration,
depth: Int): JavaPairInputDStream[K, V] = {
implicit val cmk: ClassTag[K] = ClassTag(kClass)
implicit val cmv: ClassTag[V] = ClassTag(vClass)
implicit val cmf: ClassTag[F] = ClassTag(fClass)
def fn: (Path) => Boolean = (x: Path) => filter.call(x).booleanValue()
ssc.fileStream[K, V, F](directory, fn, newFilesOnly, conf, depth)
}

/**
* Create an input stream with any arbitrary user implemented actor receiver.
* @param props Props object defining creation of the actor